From 7974891db234467eaf1fec613ec0129cb4ac2332 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 28 Jun 2010 14:15:54 +0200 Subject: x86: Always use irq stacks IRQ stacks provide much better safety against unexpected stack use from interrupts, at the minimal downside of slightly higher memory usage. Enable irq stacks also for the default 8k stack on 32-bit kernels to minimize the problem of stack overflows through interrupt activity. This is what the 64-bit kernel and various other architectures already do. Signed-off-by: Christoph Hellwig LKML-Reference: <20100628121554.GA6605@lst.de> Signed-off-by: Thomas Gleixner --- arch/x86/Kconfig.debug | 3 +-- arch/x86/include/asm/irq.h | 12 +++++------- arch/x86/kernel/irq_32.c | 6 ------ 3 files changed, 6 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 75085080b63..badda8e20e7 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -128,8 +128,7 @@ config 4KSTACKS If you say Y here the kernel will use a 4Kb stacksize for the kernel stack attached to each process/thread. This facilitates running more threads on a system and also reduces the pressure - on the VM subsystem for higher order allocations. This option - will also use IRQ stacks to compensate for the reduced stackspace. + on the VM subsystem for higher order allocations. config DOUBLEFAULT default y diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index 5458380b6ef..0bf5b008365 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -19,18 +19,16 @@ static inline int irq_canonicalize(int irq) # define ARCH_HAS_NMI_WATCHDOG #endif -#ifdef CONFIG_4KSTACKS - extern void irq_ctx_init(int cpu); - extern void irq_ctx_exit(int cpu); -# define __ARCH_HAS_DO_SOFTIRQ +#ifdef CONFIG_X86_32 +extern void irq_ctx_init(int cpu); +extern void irq_ctx_exit(int cpu); #else # define irq_ctx_init(cpu) do { } while (0) # define irq_ctx_exit(cpu) do { } while (0) -# ifdef CONFIG_X86_64 -# define __ARCH_HAS_DO_SOFTIRQ -# endif #endif +#define __ARCH_HAS_DO_SOFTIRQ + #ifdef CONFIG_HOTPLUG_CPU #include extern void fixup_irqs(void); diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 10709f29d16..67f5f9f5299 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -49,7 +49,6 @@ static inline int check_stack_overflow(void) { return 0; } static inline void print_stack_overflow(void) { } #endif -#ifdef CONFIG_4KSTACKS /* * per-CPU IRQ handling contexts (thread information and stack) */ @@ -187,11 +186,6 @@ asmlinkage void do_softirq(void) local_irq_restore(flags); } -#else -static inline int -execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) { return 0; } -#endif - bool handle_irq(unsigned irq, struct pt_regs *regs) { struct irq_desc *desc; -- cgit v1.2.3-18-g5258 From dcfa726280116dd31adad37da940f542663567d0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 28 Jun 2010 14:16:14 +0200 Subject: x86: Remove CONFIG_4KSTACKS These days 4 kilobytes of stack just aren't enough for reliably operation, and people using lots of threads have long switched to 64-bit kernels, so remove the CONFIG_4KSTACKS option. 
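The idea both patches build on, taking interrupt work off whatever stack the interrupted code happens to be running on and onto a dedicated, fixed-size stack, has a close user-space analogue in sigaltstack(). The sketch below shows only that analogue as an illustration of the concept; it is not kernel code, and the names and sizes in it are arbitrary.

/*
 * Run a signal handler on its own, separately allocated stack
 * (SA_ONSTACK), the way an interrupt handler runs on a per-CPU IRQ
 * stack: a deep call chain in the interrupted code can no longer be
 * pushed over the edge by the handler's own stack usage.
 */
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>

static void handler(int sig)
{
	int probe;

	/* The handler's locals live on the alternate stack. */
	printf("handler running near %p\n", (void *)&probe);
}

int main(void)
{
	stack_t ss = { .ss_sp = malloc(SIGSTKSZ), .ss_size = SIGSTKSZ };
	struct sigaction sa = { .sa_handler = handler, .sa_flags = SA_ONSTACK };
	int probe;

	sigemptyset(&sa.sa_mask);
	if (!ss.ss_sp || sigaltstack(&ss, NULL) || sigaction(SIGUSR1, &sa, NULL))
		return 1;

	printf("main    running near %p\n", (void *)&probe);
	raise(SIGUSR1);

	free(ss.ss_sp);
	return 0;
}
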
Signed-off-by: Christoph Hellwig LKML-Reference: <20100628121614.GB6605@lst.de> Signed-off-by: Thomas Gleixner --- arch/x86/Kconfig.debug | 9 --------- arch/x86/include/asm/module.h | 7 +------ arch/x86/include/asm/page_32_types.h | 4 ---- 3 files changed, 1 insertion(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index badda8e20e7..7f1530838bc 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -121,15 +121,6 @@ config DEBUG_NX_TEST and the software setup of this feature. If in doubt, say "N" -config 4KSTACKS - bool "Use 4Kb for kernel stacks instead of 8Kb" - depends on X86_32 - ---help--- - If you say Y here the kernel will use a 4Kb stacksize for the - kernel stack attached to each process/thread. This facilitates - running more threads on a system and also reduces the pressure - on the VM subsystem for higher order allocations. - config DOUBLEFAULT default y bool "Enable doublefault exception handler" if EMBEDDED diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h index 3e2ce58a31a..67763c5d8b4 100644 --- a/arch/x86/include/asm/module.h +++ b/arch/x86/include/asm/module.h @@ -60,12 +60,7 @@ #endif #ifdef CONFIG_X86_32 -# ifdef CONFIG_4KSTACKS -# define MODULE_STACKSIZE "4KSTACKS " -# else -# define MODULE_STACKSIZE "" -# endif -# define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY MODULE_STACKSIZE +# define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY #endif #endif /* _ASM_X86_MODULE_H */ diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h index 6f1b7331313..ade619ff9e2 100644 --- a/arch/x86/include/asm/page_32_types.h +++ b/arch/x86/include/asm/page_32_types.h @@ -15,11 +15,7 @@ */ #define __PAGE_OFFSET _AC(CONFIG_PAGE_OFFSET, UL) -#ifdef CONFIG_4KSTACKS -#define THREAD_ORDER 0 -#else #define THREAD_ORDER 1 -#endif #define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER) #define STACKFAULT_STACK 0 -- cgit v1.2.3-18-g5258 From 25897374297906eeebef8864299406bdcb5859c3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 Jul 2010 14:13:13 +0200 Subject: x86-32: Align IRQ stacks properly As suggested by Steven Rostedt we need to align the IRQ stacks to the stack size, not just the page size to make them work for stack traces and other things that depend on finding the stack slot itself with 8k stacks. Signed-off-by: Christoph Hellwig LKML-Reference: <20100727121313.GA19976@lst.de> Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/irq_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 67f5f9f5299..3b5609f54c4 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -55,7 +55,7 @@ static inline void print_stack_overflow(void) { } union irq_ctx { struct thread_info tinfo; u32 stack[THREAD_SIZE/sizeof(u32)]; -} __attribute__((aligned(PAGE_SIZE))); +} __attribute__((aligned(THREAD_SIZE))); static DEFINE_PER_CPU(union irq_ctx *, hardirq_ctx); static DEFINE_PER_CPU(union irq_ctx *, softirq_ctx); -- cgit v1.2.3-18-g5258 From f6e9456c9272bb570df6e217cdbe007e270b1c4e Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 21 Jul 2010 19:03:58 +0200 Subject: x86, cleanup: Remove obsolete boot_cpu_id variable boot_cpu_id is there for historical reasons and was renamed to boot_cpu_physical_apicid in patch: c70dcb7 x86: change boot_cpu_id to boot_cpu_physical_apicid However, there are some remaining occurrences of boot_cpu_id that are never touched in the kernel and thus its value is always 0. This patch removes boot_cpu_id completely. Signed-off-by: Robert Richter LKML-Reference: <1279731838-1522-8-git-send-email-robert.richter@amd.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/apb_timer.h | 1 - arch/x86/include/asm/cpu.h | 1 - arch/x86/kernel/apb_timer.c | 2 +- arch/x86/kernel/apic/io_apic.c | 12 ++++++------ arch/x86/kernel/cpu/amd.c | 2 +- arch/x86/kernel/cpu/common.c | 2 +- arch/x86/kernel/cpu/intel.c | 2 +- arch/x86/kernel/reboot.c | 2 +- arch/x86/kernel/setup.c | 1 - arch/x86/kernel/setup_percpu.c | 2 +- arch/x86/mm/k8topology_64.c | 6 +++--- 11 files changed, 15 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/apb_timer.h b/arch/x86/include/asm/apb_timer.h index a69b1ac9eaf..2fefa501d3b 100644 --- a/arch/x86/include/asm/apb_timer.h +++ b/arch/x86/include/asm/apb_timer.h @@ -54,7 +54,6 @@ extern struct clock_event_device *global_clock_event; extern unsigned long apbt_quick_calibrate(void); extern int arch_setup_apbt_irqs(int irq, int trigger, int mask, int cpu); extern void apbt_setup_secondary_clock(void); -extern unsigned int boot_cpu_id; extern struct sfi_timer_table_entry *sfi_get_mtmr(int hint); extern void sfi_free_mtmr(struct sfi_timer_table_entry *mtmr); diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h index b185091bf19..4fab24de26b 100644 --- a/arch/x86/include/asm/cpu.h +++ b/arch/x86/include/asm/cpu.h @@ -32,6 +32,5 @@ extern void arch_unregister_cpu(int); DECLARE_PER_CPU(int, cpu_state); -extern unsigned int boot_cpu_id; #endif /* _ASM_X86_CPU_H */ diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c index 8dd77800ff5..08f75fb4f50 100644 --- a/arch/x86/kernel/apb_timer.c +++ b/arch/x86/kernel/apb_timer.c @@ -343,7 +343,7 @@ void apbt_setup_secondary_clock(void) /* Don't register boot CPU clockevent */ cpu = smp_processor_id(); - if (cpu == boot_cpu_id) + if (!cpu) return; /* * We need to calculate the scaled math multiplication factor for diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 4dc0084ec1b..8884928d7bc 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -162,7 +162,7 @@ int __init arch_early_irq_init(void) cfg = irq_cfgx; count = ARRAY_SIZE(irq_cfgx); - node= cpu_to_node(boot_cpu_id); + node = cpu_to_node(0); for (i = 0; i < count; i++) { desc = irq_to_desc(i); @@ -1483,7 +1483,7 @@ static void 
__init setup_IO_APIC_irqs(void) int notcon = 0; struct irq_desc *desc; struct irq_cfg *cfg; - int node = cpu_to_node(boot_cpu_id); + int node = cpu_to_node(0); apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); @@ -1548,7 +1548,7 @@ static void __init setup_IO_APIC_irqs(void) void setup_IO_APIC_irq_extra(u32 gsi) { int apic_id = 0, pin, idx, irq; - int node = cpu_to_node(boot_cpu_id); + int node = cpu_to_node(0); struct irq_desc *desc; struct irq_cfg *cfg; @@ -2925,7 +2925,7 @@ static inline void __init check_timer(void) { struct irq_desc *desc = irq_to_desc(0); struct irq_cfg *cfg = desc->chip_data; - int node = cpu_to_node(boot_cpu_id); + int node = cpu_to_node(0); int apic1, pin1, apic2, pin2; unsigned long flags; int no_pin1 = 0; @@ -3279,7 +3279,7 @@ unsigned int create_irq_nr(unsigned int irq_want, int node) int create_irq(void) { - int node = cpu_to_node(boot_cpu_id); + int node = cpu_to_node(0); unsigned int irq_want; int irq; @@ -3901,7 +3901,7 @@ static int __io_apic_set_pci_routing(struct device *dev, int irq, if (dev) node = dev_to_node(dev); else - node = cpu_to_node(boot_cpu_id); + node = cpu_to_node(0); desc = irq_to_desc_alloc_node(irq, node); if (!desc) { diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 60a57b13082..3a7c852f021 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -148,7 +148,7 @@ static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c) { #ifdef CONFIG_SMP /* calling is from identify_secondary_cpu() ? */ - if (c->cpu_index == boot_cpu_id) + if (!c->cpu_index) return; /* diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 490dac63c2d..787b3c7c662 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -665,7 +665,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) this_cpu->c_early_init(c); #ifdef CONFIG_SMP - c->cpu_index = boot_cpu_id; + c->cpu_index = 0; #endif filter_cpuid_features(c, false); } diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 85f69cdeae1..3a683ea5267 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -169,7 +169,7 @@ static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c) { #ifdef CONFIG_SMP /* calling is from identify_secondary_cpu() ? */ - if (c->cpu_index == boot_cpu_id) + if (!c->cpu_index) return; /* diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index e3af342fe83..7a4cf14223b 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -84,7 +84,7 @@ static int __init reboot_setup(char *str) } /* we will leave sorting out the final value when we are ready to reboot, since we might not - have set up boot_cpu_id or smp_num_cpu */ + have detected BSP APIC ID or smp_num_cpu */ break; #endif /* CONFIG_SMP */ diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index b008e788320..dede5c4bae4 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -125,7 +125,6 @@ unsigned long max_pfn_mapped; RESERVE_BRK(dmi_alloc, 65536); #endif -unsigned int boot_cpu_id __read_mostly; static __initdata unsigned long _brk_start = (unsigned long)__brk_base; unsigned long _brk_end = (unsigned long)__brk_base; diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index a60df9ae645..2335c15c93a 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -253,7 +253,7 @@ void __init setup_per_cpu_areas(void) * Up to this point, the boot CPU has been using .init.data * area. 
Reload any changed state for the boot CPU. */ - if (cpu == boot_cpu_id) + if (!cpu) switch_to_new_gdt(cpu); } diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c index 970ed579d4e..240f86462a8 100644 --- a/arch/x86/mm/k8topology_64.c +++ b/arch/x86/mm/k8topology_64.c @@ -54,8 +54,8 @@ static __init int find_northbridge(void) static __init void early_get_boot_cpu_id(void) { /* - * need to get boot_cpu_id so can use that to create apicid_to_node - * in k8_scan_nodes() + * need to get the APIC ID of the BSP so can use that to + * create apicid_to_node in k8_scan_nodes() */ #ifdef CONFIG_X86_MPPARSE /* @@ -212,7 +212,7 @@ int __init k8_scan_nodes(void) bits = boot_cpu_data.x86_coreid_bits; cores = (1< 0) { pr_info("BSP APIC ID: %02x\n", boot_cpu_physical_apicid); -- cgit v1.2.3-18-g5258 From 1f999ab5a1360afc388868cc0ef9afe8edeef3be Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 21 Jul 2010 19:03:57 +0200 Subject: x86, xsave: Disable xsave in i387 emulation mode xsave is broken for (!HAVE_HWFP). This is the case if config MATH_EMULATION is enabled, 'no387' kernel parameter is set and xsave exists. xsave will not work because x86/math-emu and xsave share the same memory. As this case can be treated as corner case we simply disable xsave then. Signed-off-by: Robert Richter LKML-Reference: <1279731838-1522-7-git-send-email-robert.richter@amd.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/i387.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 1f11f5ce668..2605c50b11d 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -67,6 +67,12 @@ static void __cpuinit init_thread_xstate(void) */ if (!HAVE_HWFP) { + /* + * Disable xsave as we do not support it if i387 + * emulation is enabled. + */ + setup_clear_cpu_cap(X86_FEATURE_XSAVE); + setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); xstate_size = sizeof(struct i387_soft_struct); return; } -- cgit v1.2.3-18-g5258 From c1a65932fd7216fdc9a0db8bbffe1d47842f862c Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 29 Jun 2010 18:08:13 +0200 Subject: perf: Drop unappropriate tests on arch callchains Drop the TASK_RUNNING test on user tasks for callchains as this check doesn't seem to make any sense. Also remove the tests for !current that is not supposed to happen and current->pid as this should be handled at the generic level, with exclude_idle attribute. Signed-off-by: Frederic Weisbecker Tested-by: Will Deacon Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras Cc: Stephane Eranian Cc: David Miller Cc: Paul Mundt Cc: Borislav Petkov --- arch/x86/kernel/cpu/perf_event.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index f2da20fda02..4a4d191f949 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1703,9 +1703,6 @@ perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry) is_user = user_mode(regs); - if (is_user && current->state != TASK_RUNNING) - return; - if (!is_user) perf_callchain_kernel(regs, entry); -- cgit v1.2.3-18-g5258 From 70791ce9ba68a5921c9905ef05d23f62a90bc10c Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 29 Jun 2010 19:34:05 +0200 Subject: perf: Generalize callchain_store() callchain_store() is the same on every archs, inline it in perf_event.h and rename it to perf_callchain_store() to avoid any collision. 
This removes repetitive code. Signed-off-by: Frederic Weisbecker Acked-by: Paul Mackerras Tested-by: Will Deacon Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Stephane Eranian Cc: David Miller Cc: Paul Mundt Cc: Borislav Petkov --- arch/x86/kernel/cpu/perf_event.c | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 4a4d191f949..8af28caeafc 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1571,12 +1571,6 @@ const struct pmu *hw_perf_event_init(struct perf_event *event) * callchain support */ -static inline -void callchain_store(struct perf_callchain_entry *entry, u64 ip) -{ - if (entry->nr < PERF_MAX_STACK_DEPTH) - entry->ip[entry->nr++] = ip; -} static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry); static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry); @@ -1602,7 +1596,7 @@ static void backtrace_address(void *data, unsigned long addr, int reliable) { struct perf_callchain_entry *entry = data; - callchain_store(entry, addr); + perf_callchain_store(entry, addr); } static const struct stacktrace_ops backtrace_ops = { @@ -1616,8 +1610,8 @@ static const struct stacktrace_ops backtrace_ops = { static void perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) { - callchain_store(entry, PERF_CONTEXT_KERNEL); - callchain_store(entry, regs->ip); + perf_callchain_store(entry, PERF_CONTEXT_KERNEL); + perf_callchain_store(entry, regs->ip); dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry); } @@ -1646,7 +1640,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) if (fp < compat_ptr(regs->sp)) break; - callchain_store(entry, frame.return_address); + perf_callchain_store(entry, frame.return_address); fp = compat_ptr(frame.next_frame); } return 1; @@ -1670,8 +1664,8 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) fp = (void __user *)regs->bp; - callchain_store(entry, PERF_CONTEXT_USER); - callchain_store(entry, regs->ip); + perf_callchain_store(entry, PERF_CONTEXT_USER); + perf_callchain_store(entry, regs->ip); if (perf_callchain_user32(regs, entry)) return; @@ -1688,7 +1682,7 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) if ((unsigned long)fp < regs->sp) break; - callchain_store(entry, frame.return_address); + perf_callchain_store(entry, frame.return_address); fp = frame.next_frame; } } -- cgit v1.2.3-18-g5258 From 56962b4449af34070bb1994621ef4f0265eed4d8 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 30 Jun 2010 23:03:51 +0200 Subject: perf: Generalize some arch callchain code - Most archs use one callchain buffer per cpu, except x86 that needs to deal with NMIs. Provide a default perf_callchain_buffer() implementation that x86 overrides. - Centralize all the kernel/user regs handling and invoke new arch handlers from there: perf_callchain_user() / perf_callchain_kernel() That avoid all the user_mode(), current->mm checks and so... - Invert some parameters in perf_callchain_*() helpers: entry to the left, regs to the right, following the traditional (dst, src). 
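A standalone model of the resulting split may help: the generic layer makes the kernel-versus-user decision once and then calls the arch hooks with the new (dst, src) argument order. Everything below (the mocked pt_regs, the has-mm flag, the fake frame values) is illustrative only and is not the kernel's actual code.

#include <stdbool.h>
#include <stdio.h>

struct pt_regs { bool user; };			/* mocked */
struct callchain_entry { unsigned int nr; unsigned long ip[32]; };

/* arch hooks: note the (entry, regs), i.e. (dst, src), argument order */
static void callchain_kernel(struct callchain_entry *e, struct pt_regs *r)
{
	e->ip[e->nr++] = 0xffffffff;		/* fake kernel frame */
}

static void callchain_user(struct callchain_entry *e, struct pt_regs *r)
{
	e->ip[e->nr++] = 0x00400000;		/* fake user frame */
}

/* generic layer: the mode/mm checks now live here, not in each arch */
static void perf_callchain(struct callchain_entry *e, struct pt_regs *regs,
			   bool task_has_mm)
{
	e->nr = 0;
	if (!regs->user) {
		callchain_kernel(e, regs);
		if (!task_has_mm)
			return;		/* kernel thread: no user half */
	}
	callchain_user(e, regs);
}

int main(void)
{
	struct pt_regs regs = { .user = false };
	struct callchain_entry entry;

	perf_callchain(&entry, &regs, true);
	printf("collected %u frames\n", entry.nr);
	return 0;
}
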
Signed-off-by: Frederic Weisbecker Acked-by: Paul Mackerras Tested-by: Will Deacon Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Stephane Eranian Cc: David Miller Cc: Paul Mundt Cc: Borislav Petkov --- arch/x86/kernel/cpu/perf_event.c | 45 +++++++--------------------------------- 1 file changed, 8 insertions(+), 37 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 8af28caeafc..39f8421b86e 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1571,9 +1571,7 @@ const struct pmu *hw_perf_event_init(struct perf_event *event) * callchain support */ - -static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry); -static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry); +static DEFINE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry_nmi); static void @@ -1607,8 +1605,8 @@ static const struct stacktrace_ops backtrace_ops = { .walk_stack = print_context_stack_bp, }; -static void -perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) +void +perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs) { perf_callchain_store(entry, PERF_CONTEXT_KERNEL); perf_callchain_store(entry, regs->ip); @@ -1653,14 +1651,12 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) } #endif -static void -perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) +void +perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) { struct stack_frame frame; const void __user *fp; - if (!user_mode(regs)) - regs = task_pt_regs(current); fp = (void __user *)regs->bp; @@ -1687,42 +1683,17 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) } } -static void -perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry) -{ - int is_user; - - if (!regs) - return; - - is_user = user_mode(regs); - - if (!is_user) - perf_callchain_kernel(regs, entry); - - if (current->mm) - perf_callchain_user(regs, entry); -} - -struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) +struct perf_callchain_entry *perf_callchain_buffer(void) { - struct perf_callchain_entry *entry; - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { /* TODO: We don't support guest os callchain now */ return NULL; } if (in_nmi()) - entry = &__get_cpu_var(pmc_nmi_entry); - else - entry = &__get_cpu_var(pmc_irq_entry); - - entry->nr = 0; - - perf_do_callchain(regs, entry); + return &__get_cpu_var(perf_callchain_entry_nmi); - return entry; + return &__get_cpu_var(perf_callchain_entry); } unsigned long perf_instruction_pointer(struct pt_regs *regs) -- cgit v1.2.3-18-g5258 From f72c1a931e311bb7780fee19e41a89ac42cab50e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 1 Jul 2010 02:31:21 +0200 Subject: perf: Factorize callchain context handling Store the kernel and user contexts from the generic layer instead of archs, this gathers some repetitive code. 
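Concretely, the separator between the kernel and user parts of a callchain is now emitted by the core rather than by each architecture; the standalone model below shows that ordering. The PERF_CONTEXT_* values match the perf ABI headers of this era, but the rest (the entry layout, the fake frame addresses) is illustrative only.

#include <stdint.h>
#include <stdio.h>

#define PERF_CONTEXT_KERNEL	((uint64_t)-128)
#define PERF_CONTEXT_USER	((uint64_t)-512)

struct callchain_entry { unsigned int nr; uint64_t ip[16]; };

static void store(struct callchain_entry *e, uint64_t ip)
{
	if (e->nr < 16)
		e->ip[e->nr++] = ip;
}

/* arch hooks now only walk frames; context tagging happens in the core */
static void callchain_kernel(struct callchain_entry *e) { store(e, 0xffffffff81012345ULL); }
static void callchain_user(struct callchain_entry *e)   { store(e, 0x0000000000401000ULL); }

int main(void)
{
	struct callchain_entry e = { 0 };
	unsigned int i;

	store(&e, PERF_CONTEXT_KERNEL);		/* emitted by the core */
	callchain_kernel(&e);
	store(&e, PERF_CONTEXT_USER);		/* emitted by the core */
	callchain_user(&e);

	for (i = 0; i < e.nr; i++)
		printf("%#llx\n", (unsigned long long)e.ip[i]);
	return 0;
}
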
Signed-off-by: Frederic Weisbecker Acked-by: Paul Mackerras Tested-by: Will Deacon Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Stephane Eranian Cc: David Miller Cc: Paul Mundt Cc: Borislav Petkov --- arch/x86/kernel/cpu/perf_event.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 39f8421b86e..a3c922288cc 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1608,7 +1608,6 @@ static const struct stacktrace_ops backtrace_ops = { void perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs) { - perf_callchain_store(entry, PERF_CONTEXT_KERNEL); perf_callchain_store(entry, regs->ip); dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry); @@ -1660,7 +1659,6 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) fp = (void __user *)regs->bp; - perf_callchain_store(entry, PERF_CONTEXT_USER); perf_callchain_store(entry, regs->ip); if (perf_callchain_user32(regs, entry)) -- cgit v1.2.3-18-g5258 From 927c7a9e92c4f69097a6e9e086d11fc2f8a5b40b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 1 Jul 2010 16:20:36 +0200 Subject: perf: Fix race in callchains Now that software events don't have interrupt disabled anymore in the event path, callchains can nest on any context. So seperating nmi and others contexts in two buffers has become racy. Fix this by providing one buffer per nesting level. Given the size of the callchain entries (2040 bytes * 4), we now need to allocate them dynamically. v2: Fixed put_callchain_entry call after recursion. Fix the type of the recursion, it must be an array. v3: Use a manual pr cpu allocation (temporary solution until NMIs can safely access vmalloc'ed memory). Do a better separation between callchain reference tracking and allocation. Make the "put" path lockless for non-release cases. v4: Protect the callchain buffers with rcu. v5: Do the cpu buffers allocations node affine. 
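The shape of the fix is easier to see in a standalone model: each nesting level gets its own buffer, selected by a recursion counter that is bumped on the way in and dropped on the way out, so a callchain taken from an NMI can never reuse the buffer of the context it interrupted. The real code keeps these per cpu, allocates them node-affine and protects them with RCU; the mock below is single-threaded and purely illustrative.

#include <stdio.h>

#define NR_CONTEXTS	4	/* e.g. task, softirq, hardirq, NMI */

struct callchain_entry { unsigned int nr; unsigned long ip[64]; };

static struct callchain_entry buffers[NR_CONTEXTS];	/* per cpu in reality */
static int recursion;					/* ditto */

static struct callchain_entry *get_callchain_entry(void)
{
	if (recursion >= NR_CONTEXTS)
		return NULL;		/* nested too deep: drop the sample */
	return &buffers[recursion++];
}

static void put_callchain_entry(void)
{
	recursion--;
}

int main(void)
{
	struct callchain_entry *outer  = get_callchain_entry();
	struct callchain_entry *nested = get_callchain_entry();  /* "NMI" hits here */

	printf("outer=%p nested=%p distinct=%d\n",
	       (void *)outer, (void *)nested, outer != nested);

	put_callchain_entry();		/* NMI path done */
	put_callchain_entry();		/* interrupted context done */
	return 0;
}
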
Signed-off-by: Frederic Weisbecker Tested-by: Will Deacon Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras Cc: Stephane Eranian Cc: Paul Mundt Cc: David Miller Cc: Borislav Petkov --- arch/x86/kernel/cpu/perf_event.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index a3c922288cc..8e91cf34a9c 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1608,6 +1608,11 @@ static const struct stacktrace_ops backtrace_ops = { void perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs) { + if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { + /* TODO: We don't support guest os callchain now */ + return NULL; + } + perf_callchain_store(entry, regs->ip); dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry); @@ -1656,6 +1661,10 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) struct stack_frame frame; const void __user *fp; + if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { + /* TODO: We don't support guest os callchain now */ + return NULL; + } fp = (void __user *)regs->bp; @@ -1681,19 +1690,6 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) } } -struct perf_callchain_entry *perf_callchain_buffer(void) -{ - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { - /* TODO: We don't support guest os callchain now */ - return NULL; - } - - if (in_nmi()) - return &__get_cpu_var(perf_callchain_entry_nmi); - - return &__get_cpu_var(perf_callchain_entry); -} - unsigned long perf_instruction_pointer(struct pt_regs *regs) { unsigned long ip; -- cgit v1.2.3-18-g5258 From ed8052616680e72f58bca678d4d1678cb12a7e47 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 20 Aug 2010 14:30:41 +0200 Subject: perf: Remove superfluous return values from perf_callchain_*() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes these build warnings introduced by the callchain rework: arch/x86/kernel/cpu/perf_event.c: In function ‘perf_callchain_kernel’: arch/x86/kernel/cpu/perf_event.c:1646: warning: ‘return’ with a value, in function returning void arch/x86/kernel/cpu/perf_event.c: In function ‘perf_callchain_user’: arch/x86/kernel/cpu/perf_event.c:1699: warning: ‘return’ with a value, in function returning void arch/x86/kernel/cpu/perf_event.c: At top level: arch/x86/kernel/cpu/perf_event.c:1607: warning: ‘perf_callchain_entry_nmi’ defined but not used Signed-off-by: Peter Zijlstra Acked-by: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 8e91cf34a9c..acc52afaf7b 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1610,7 +1610,7 @@ perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs) { if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { /* TODO: We don't support guest os callchain now */ - return NULL; + return; } perf_callchain_store(entry, regs->ip); @@ -1663,7 +1663,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { /* TODO: We don't support guest os callchain now */ - return NULL; + return; } fp = (void 
__user *)regs->bp; -- cgit v1.2.3-18-g5258 From 61c77326d1df079f202fa79403c3ccd8c5966a81 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 16 Aug 2010 09:16:55 +0800 Subject: x86, mm: Avoid unnecessary TLB flush In x86, access and dirty bits are set automatically by CPU when CPU accesses memory. When we go into the code path of below flush_tlb_fix_spurious_fault(), we already set dirty bit for pte and don't need flush tlb. This might mean tlb entry in some CPUs hasn't dirty bit set, but this doesn't matter. When the CPUs do page write, they will automatically check the bit and no software involved. On the other hand, flush tlb in below position is harmful. Test creates CPU number of threads, each thread writes to a same but random address in same vma range and we measure the total time. Under a 4 socket system, original time is 1.96s, while with the patch, the time is 0.8s. Under a 2 socket system, there is 20% time cut too. perf shows a lot of time are taking to send ipi/handle ipi for tlb flush. Signed-off-by: Shaohua Li LKML-Reference: <20100816011655.GA362@sli10-desk.sh.intel.com> Acked-by: Suresh Siddha Cc: Andrea Archangeli Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/pgtable.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index a34c785c5a6..2d0a33bd297 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -603,6 +603,8 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, pte_update(mm, addr, ptep); } +#define flush_tlb_fix_spurious_fault(vma, address) + /* * clone_pgd_range(pgd_t *dst, pgd_t *src, int count); * -- cgit v1.2.3-18-g5258 From fdf4289679fd41d76553ce224750e9737cd80eea Mon Sep 17 00:00:00 2001 From: "Ma, Ling" Date: Mon, 23 Aug 2010 14:11:12 -0700 Subject: x86, mem: Don't implement forward memmove() as memcpy() memmove() allow source and destination address to be overlap, but there is no such limitation for memcpy(). Therefore, explicitly implement memmove() in both the forwards and backward directions, to give us the ability to optimize memcpy(). Signed-off-by: Ma Ling LKML-Reference: Signed-off-by: H. 
Peter Anvin --- arch/x86/lib/memcpy_32.c | 38 +++++++++++++++++++++++++++----------- arch/x86/lib/memmove_64.c | 46 +++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 68 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/memcpy_32.c b/arch/x86/lib/memcpy_32.c index 5415a9d06f5..be424dfcf36 100644 --- a/arch/x86/lib/memcpy_32.c +++ b/arch/x86/lib/memcpy_32.c @@ -25,19 +25,35 @@ void *memmove(void *dest, const void *src, size_t n) int d0, d1, d2; if (dest < src) { - memcpy(dest, src, n); + if ((dest + n) < src) + return memcpy(dest, src, n); + else + __asm__ __volatile__( + "rep\n\t" + "movsb\n\t" + : "=&c" (d0), "=&S" (d1), "=&D" (d2) + :"0" (n), + "1" (src), + "2" (dest) + :"memory"); + } else { - __asm__ __volatile__( - "std\n\t" - "rep\n\t" - "movsb\n\t" - "cld" - : "=&c" (d0), "=&S" (d1), "=&D" (d2) - :"0" (n), - "1" (n-1+src), - "2" (n-1+dest) - :"memory"); + + if((src + count) < dest) + return memcpy(dest, src, count); + else + __asm__ __volatile__( + "std\n\t" + "rep\n\t" + "movsb\n\t" + "cld" + : "=&c" (d0), "=&S" (d1), "=&D" (d2) + :"0" (n), + "1" (n-1+src), + "2" (n-1+dest) + :"memory"); } + return dest; } EXPORT_SYMBOL(memmove); diff --git a/arch/x86/lib/memmove_64.c b/arch/x86/lib/memmove_64.c index 0a33909bf12..ecacc4b3d9e 100644 --- a/arch/x86/lib/memmove_64.c +++ b/arch/x86/lib/memmove_64.c @@ -8,13 +8,49 @@ #undef memmove void *memmove(void *dest, const void *src, size_t count) { + unsigned long d0, d1, d2, d3; if (dest < src) { - return memcpy(dest, src, count); + if ((dest + count) < src) + return memcpy(dest, src, count); + else + __asm__ __volatile__( + "movq %0, %3\n\t" + "shr $3, %0\n\t" + "andq $7, %3\n\t" + "rep\n\t" + "movsq\n\t" + "movq %3, %0\n\t" + "rep\n\t" + "movsb" + : "=&c" (d0), "=&S" (d1), "=&D" (d2), "=r" (d3) + :"0" (count), + "1" (src), + "2" (dest) + :"memory"); } else { - char *p = dest + count; - const char *s = src + count; - while (count--) - *--p = *--s; + if((src + count) < dest) + return memcpy(dest, src, count); + else + __asm__ __volatile__( + "movq %0, %3\n\t" + "lea -8(%1, %0), %1\n\t" + "lea -8(%2, %0), %2\n\t" + "shr $3, %0\n\t" + "andq $7, %3\n\t" + "std\n\t" + "rep\n\t" + "movsq\n\t" + "lea 7(%1), %1\n\t" + "lea 7(%2), %2\n\t" + "movq %3, %0\n\t" + "rep\n\t" + "movsb\n\t" + "cld" + : "=&c" (d0), "=&S" (d1), "=&D" (d2), "=r" (d3) + :"0" (count), + "1" (src), + "2" (dest) + :"memory"); } return dest; } -- cgit v1.2.3-18-g5258 From 59daa706fbec745684702741b9f5373142dd9fdc Mon Sep 17 00:00:00 2001 From: Ma Ling Date: Tue, 29 Jun 2010 03:24:25 +0800 Subject: x86, mem: Optimize memcpy by avoiding memory false dependece All read operations after allocation stage can run speculatively, all write operation will run in program order, and if addresses are different read may run before older write operation, otherwise wait until write commit. However CPU don't check each address bit, so read could fail to recognize different address even they are in different page.For example if rsi is 0xf004, rdi is 0xe008, in following operation there will generate big performance latency. 1. movq (%rsi), %rax 2. movq %rax, (%rdi) 3. movq 8(%rsi), %rax 4. movq %rax, 8(%rdi) If %rsi and rdi were in really the same meory page, there are TRUE read-after-write dependence because instruction 2 write 0x008 and instruction 3 read 0x00c, the two address are overlap partially. 
Actually there are in different page and no any issues, but without checking each address bit CPU could think they are in the same page, and instruction 3 have to wait for instruction 2 to write data into cache from write buffer, then load data from cache, the cost time read spent is equal to mfence instruction. We may avoid it by tuning operation sequence as follow. 1. movq 8(%rsi), %rax 2. movq %rax, 8(%rdi) 3. movq (%rsi), %rax 4. movq %rax, (%rdi) Instruction 3 read 0x004, instruction 2 write address 0x010, no any dependence. At last on Core2 we gain 1.83x speedup compared with original instruction sequence. In this patch we first handle small size(less 20bytes), then jump to different copy mode. Based on our micro-benchmark small bytes from 1 to 127 bytes, we got up to 2X improvement, and up to 1.5X improvement for 1024 bytes on Corei7. (We use our micro-benchmark, and will do further test according to your requirment) Signed-off-by: Ma Ling LKML-Reference: <1277753065-18610-1-git-send-email-ling.ma@intel.com> Signed-off-by: H. Peter Anvin --- arch/x86/lib/memcpy_32.c | 6 +- arch/x86/lib/memcpy_64.S | 158 ++++++++++++++++++++++++++++++----------------- 2 files changed, 105 insertions(+), 59 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/memcpy_32.c b/arch/x86/lib/memcpy_32.c index be424dfcf36..81130d477ee 100644 --- a/arch/x86/lib/memcpy_32.c +++ b/arch/x86/lib/memcpy_32.c @@ -36,11 +36,9 @@ void *memmove(void *dest, const void *src, size_t n) "1" (src), "2" (dest) :"memory"); - } else { - - if((src + count) < dest) - return memcpy(dest, src, count); + if((src + n) < dest) + return memcpy(dest, src, n); else __asm__ __volatile__( "std\n\t" diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index bcbcd1e0f7d..75ef61e35e3 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -40,84 +40,132 @@ ENTRY(__memcpy) ENTRY(memcpy) CFI_STARTPROC + movq %rdi, %rax /* - * Put the number of full 64-byte blocks into %ecx. - * Tail portion is handled at the end: + * Use 32bit CMP here to avoid long NOP padding. */ - movq %rdi, %rax - movl %edx, %ecx - shrl $6, %ecx - jz .Lhandle_tail + cmp $0x20, %edx + jb .Lhandle_tail - .p2align 4 -.Lloop_64: /* - * We decrement the loop index here - and the zero-flag is - * checked at the end of the loop (instructions inbetween do - * not change the zero flag): + * We check whether memory false dependece could occur, + * then jump to corresponding copy mode. */ - decl %ecx + cmp %dil, %sil + jl .Lcopy_backward + subl $0x20, %edx +.Lcopy_forward_loop: + subq $0x20, %rdx /* - * Move in blocks of 4x16 bytes: + * Move in blocks of 4x8 bytes: */ - movq 0*8(%rsi), %r11 - movq 1*8(%rsi), %r8 - movq %r11, 0*8(%rdi) - movq %r8, 1*8(%rdi) - - movq 2*8(%rsi), %r9 - movq 3*8(%rsi), %r10 - movq %r9, 2*8(%rdi) - movq %r10, 3*8(%rdi) - - movq 4*8(%rsi), %r11 - movq 5*8(%rsi), %r8 - movq %r11, 4*8(%rdi) - movq %r8, 5*8(%rdi) - - movq 6*8(%rsi), %r9 - movq 7*8(%rsi), %r10 - movq %r9, 6*8(%rdi) - movq %r10, 7*8(%rdi) - - leaq 64(%rsi), %rsi - leaq 64(%rdi), %rdi - - jnz .Lloop_64 + movq 0*8(%rsi), %r8 + movq 1*8(%rsi), %r9 + movq 2*8(%rsi), %r10 + movq 3*8(%rsi), %r11 + leaq 4*8(%rsi), %rsi + + movq %r8, 0*8(%rdi) + movq %r9, 1*8(%rdi) + movq %r10, 2*8(%rdi) + movq %r11, 3*8(%rdi) + leaq 4*8(%rdi), %rdi + jae .Lcopy_forward_loop + addq $0x20, %rdx + jmp .Lhandle_tail + +.Lcopy_backward: + /* + * Calculate copy position to tail. 
+ */ + addq %rdx, %rsi + addq %rdx, %rdi + subq $0x20, %rdx + /* + * At most 3 ALU operations in one cycle, + * so append NOPS in the same 16bytes trunk. + */ + .p2align 4 +.Lcopy_backward_loop: + subq $0x20, %rdx + movq -1*8(%rsi), %r8 + movq -2*8(%rsi), %r9 + movq -3*8(%rsi), %r10 + movq -4*8(%rsi), %r11 + leaq -4*8(%rsi), %rsi + movq %r8, -1*8(%rdi) + movq %r9, -2*8(%rdi) + movq %r10, -3*8(%rdi) + movq %r11, -4*8(%rdi) + leaq -4*8(%rdi), %rdi + jae .Lcopy_backward_loop + /* + * Calculate copy position to head. + */ + addq $0x20, %rdx + subq %rdx, %rsi + subq %rdx, %rdi .Lhandle_tail: - movl %edx, %ecx - andl $63, %ecx - shrl $3, %ecx - jz .Lhandle_7 + cmpq $16, %rdx + jb .Lless_16bytes + /* + * Move data from 16 bytes to 31 bytes. + */ + movq 0*8(%rsi), %r8 + movq 1*8(%rsi), %r9 + movq -2*8(%rsi, %rdx), %r10 + movq -1*8(%rsi, %rdx), %r11 + movq %r8, 0*8(%rdi) + movq %r9, 1*8(%rdi) + movq %r10, -2*8(%rdi, %rdx) + movq %r11, -1*8(%rdi, %rdx) + retq .p2align 4 -.Lloop_8: - decl %ecx - movq (%rsi), %r8 - movq %r8, (%rdi) - leaq 8(%rdi), %rdi - leaq 8(%rsi), %rsi - jnz .Lloop_8 - -.Lhandle_7: - movl %edx, %ecx - andl $7, %ecx - jz .Lend +.Lless_16bytes: + cmpq $8, %rdx + jb .Lless_8bytes + /* + * Move data from 8 bytes to 15 bytes. + */ + movq 0*8(%rsi), %r8 + movq -1*8(%rsi, %rdx), %r9 + movq %r8, 0*8(%rdi) + movq %r9, -1*8(%rdi, %rdx) + retq + .p2align 4 +.Lless_8bytes: + cmpq $4, %rdx + jb .Lless_3bytes + /* + * Move data from 4 bytes to 7 bytes. + */ + movl (%rsi), %ecx + movl -4(%rsi, %rdx), %r8d + movl %ecx, (%rdi) + movl %r8d, -4(%rdi, %rdx) + retq .p2align 4 +.Lless_3bytes: + cmpl $0, %edx + je .Lend + /* + * Move data from 1 bytes to 3 bytes. + */ .Lloop_1: movb (%rsi), %r8b movb %r8b, (%rdi) incq %rdi incq %rsi - decl %ecx + decl %edx jnz .Lloop_1 .Lend: - ret + retq CFI_ENDPROC ENDPROC(memcpy) ENDPROC(__memcpy) -- cgit v1.2.3-18-g5258 From 9863c90f682fba34cdc26c3437e8c00da6c83fa4 Mon Sep 17 00:00:00 2001 From: Alok Kataria Date: Mon, 23 Aug 2010 14:49:11 -0700 Subject: x86, vmware: Remove deprecated VMI kernel support With the recent innovations in CPU hardware acceleration technologies from Intel and AMD, VMware ran a few experiments to compare these techniques to guest paravirtualization technique on VMware's platform. These hardware assisted virtualization techniques have outperformed the performance benefits provided by VMI in most of the workloads. VMware expects that these hardware features will be ubiquitous in a couple of years, as a result, VMware has started a phased retirement of this feature from the hypervisor. Please note that VMI has always been an optimization and non-VMI kernels still work fine on VMware's platform. Latest versions of VMware's product which support VMI are, Workstation 7.0 and VSphere 4.0 on ESX side, future maintainence releases for these products will continue supporting VMI. For more details about VMI retirement take a look at this, http://blogs.vmware.com/guestosguide/2009/09/vmi-retirement.html This feature removal was scheduled for 2.6.37 back in September 2009. Signed-off-by: Alok N Kataria LKML-Reference: <1282600151.19396.22.camel@ank32.eng.vmware.com> Signed-off-by: H. 
Peter Anvin --- arch/x86/Kconfig | 19 - arch/x86/include/asm/vmi.h | 269 ------------ arch/x86/include/asm/vmi_time.h | 98 ----- arch/x86/kernel/Makefile | 1 - arch/x86/kernel/setup.c | 12 +- arch/x86/kernel/smpboot.c | 2 - arch/x86/kernel/vmi_32.c | 893 ---------------------------------------- arch/x86/kernel/vmiclock_32.c | 317 -------------- 8 files changed, 4 insertions(+), 1607 deletions(-) delete mode 100644 arch/x86/include/asm/vmi.h delete mode 100644 arch/x86/include/asm/vmi_time.h delete mode 100644 arch/x86/kernel/vmi_32.c delete mode 100644 arch/x86/kernel/vmiclock_32.c (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cea0cd9a316..f0ee331feea 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -517,25 +517,6 @@ if PARAVIRT_GUEST source "arch/x86/xen/Kconfig" -config VMI - bool "VMI Guest support (DEPRECATED)" - select PARAVIRT - depends on X86_32 - ---help--- - VMI provides a paravirtualized interface to the VMware ESX server - (it could be used by other hypervisors in theory too, but is not - at the moment), by linking the kernel to a GPL-ed ROM module - provided by the hypervisor. - - As of September 2009, VMware has started a phased retirement - of this feature from VMware's products. Please see - feature-removal-schedule.txt for details. If you are - planning to enable this option, please note that you cannot - live migrate a VMI enabled VM to a future VMware product, - which doesn't support VMI. So if you expect your kernel to - seamlessly migrate to newer VMware products, keep this - disabled. - config KVM_CLOCK bool "KVM paravirtualized clock" select PARAVIRT diff --git a/arch/x86/include/asm/vmi.h b/arch/x86/include/asm/vmi.h deleted file mode 100644 index 61e08c0a290..00000000000 --- a/arch/x86/include/asm/vmi.h +++ /dev/null @@ -1,269 +0,0 @@ -/* - * VMI interface definition - * - * Copyright (C) 2005, VMware, Inc. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - * Maintained by: Zachary Amsden zach@vmware.com - * - */ -#include - -/* - *--------------------------------------------------------------------- - * - * VMI Option ROM API - * - *--------------------------------------------------------------------- - */ -#define VMI_SIGNATURE 0x696d5663 /* "cVmi" */ - -#define PCI_VENDOR_ID_VMWARE 0x15AD -#define PCI_DEVICE_ID_VMWARE_VMI 0x0801 - -/* - * We use two version numbers for compatibility, with the major - * number signifying interface breakages, and the minor number - * interface extensions. 
- */ -#define VMI_API_REV_MAJOR 3 -#define VMI_API_REV_MINOR 0 - -#define VMI_CALL_CPUID 0 -#define VMI_CALL_WRMSR 1 -#define VMI_CALL_RDMSR 2 -#define VMI_CALL_SetGDT 3 -#define VMI_CALL_SetLDT 4 -#define VMI_CALL_SetIDT 5 -#define VMI_CALL_SetTR 6 -#define VMI_CALL_GetGDT 7 -#define VMI_CALL_GetLDT 8 -#define VMI_CALL_GetIDT 9 -#define VMI_CALL_GetTR 10 -#define VMI_CALL_WriteGDTEntry 11 -#define VMI_CALL_WriteLDTEntry 12 -#define VMI_CALL_WriteIDTEntry 13 -#define VMI_CALL_UpdateKernelStack 14 -#define VMI_CALL_SetCR0 15 -#define VMI_CALL_SetCR2 16 -#define VMI_CALL_SetCR3 17 -#define VMI_CALL_SetCR4 18 -#define VMI_CALL_GetCR0 19 -#define VMI_CALL_GetCR2 20 -#define VMI_CALL_GetCR3 21 -#define VMI_CALL_GetCR4 22 -#define VMI_CALL_WBINVD 23 -#define VMI_CALL_SetDR 24 -#define VMI_CALL_GetDR 25 -#define VMI_CALL_RDPMC 26 -#define VMI_CALL_RDTSC 27 -#define VMI_CALL_CLTS 28 -#define VMI_CALL_EnableInterrupts 29 -#define VMI_CALL_DisableInterrupts 30 -#define VMI_CALL_GetInterruptMask 31 -#define VMI_CALL_SetInterruptMask 32 -#define VMI_CALL_IRET 33 -#define VMI_CALL_SYSEXIT 34 -#define VMI_CALL_Halt 35 -#define VMI_CALL_Reboot 36 -#define VMI_CALL_Shutdown 37 -#define VMI_CALL_SetPxE 38 -#define VMI_CALL_SetPxELong 39 -#define VMI_CALL_UpdatePxE 40 -#define VMI_CALL_UpdatePxELong 41 -#define VMI_CALL_MachineToPhysical 42 -#define VMI_CALL_PhysicalToMachine 43 -#define VMI_CALL_AllocatePage 44 -#define VMI_CALL_ReleasePage 45 -#define VMI_CALL_InvalPage 46 -#define VMI_CALL_FlushTLB 47 -#define VMI_CALL_SetLinearMapping 48 - -#define VMI_CALL_SetIOPLMask 61 -#define VMI_CALL_SetInitialAPState 62 -#define VMI_CALL_APICWrite 63 -#define VMI_CALL_APICRead 64 -#define VMI_CALL_IODelay 65 -#define VMI_CALL_SetLazyMode 73 - -/* - *--------------------------------------------------------------------- - * - * MMU operation flags - * - *--------------------------------------------------------------------- - */ - -/* Flags used by VMI_{Allocate|Release}Page call */ -#define VMI_PAGE_PAE 0x10 /* Allocate PAE shadow */ -#define VMI_PAGE_CLONE 0x20 /* Clone from another shadow */ -#define VMI_PAGE_ZEROED 0x40 /* Page is pre-zeroed */ - - -/* Flags shared by Allocate|Release Page and PTE updates */ -#define VMI_PAGE_PT 0x01 -#define VMI_PAGE_PD 0x02 -#define VMI_PAGE_PDP 0x04 -#define VMI_PAGE_PML4 0x08 - -#define VMI_PAGE_NORMAL 0x00 /* for debugging */ - -/* Flags used by PTE updates */ -#define VMI_PAGE_CURRENT_AS 0x10 /* implies VMI_PAGE_VA_MASK is valid */ -#define VMI_PAGE_DEFER 0x20 /* may queue update until TLB inval */ -#define VMI_PAGE_VA_MASK 0xfffff000 - -#ifdef CONFIG_X86_PAE -#define VMI_PAGE_L1 (VMI_PAGE_PT | VMI_PAGE_PAE | VMI_PAGE_ZEROED) -#define VMI_PAGE_L2 (VMI_PAGE_PD | VMI_PAGE_PAE | VMI_PAGE_ZEROED) -#else -#define VMI_PAGE_L1 (VMI_PAGE_PT | VMI_PAGE_ZEROED) -#define VMI_PAGE_L2 (VMI_PAGE_PD | VMI_PAGE_ZEROED) -#endif - -/* Flags used by VMI_FlushTLB call */ -#define VMI_FLUSH_TLB 0x01 -#define VMI_FLUSH_GLOBAL 0x02 - -/* - *--------------------------------------------------------------------- - * - * VMI relocation definitions for ROM call get_reloc - * - *--------------------------------------------------------------------- - */ - -/* VMI Relocation types */ -#define VMI_RELOCATION_NONE 0 -#define VMI_RELOCATION_CALL_REL 1 -#define VMI_RELOCATION_JUMP_REL 2 -#define VMI_RELOCATION_NOP 3 - -#ifndef __ASSEMBLY__ -struct vmi_relocation_info { - unsigned char *eip; - unsigned char type; - unsigned char reserved[3]; -}; -#endif - - -/* - 
*--------------------------------------------------------------------- - * - * Generic ROM structures and definitions - * - *--------------------------------------------------------------------- - */ - -#ifndef __ASSEMBLY__ - -struct vrom_header { - u16 rom_signature; /* option ROM signature */ - u8 rom_length; /* ROM length in 512 byte chunks */ - u8 rom_entry[4]; /* 16-bit code entry point */ - u8 rom_pad0; /* 4-byte align pad */ - u32 vrom_signature; /* VROM identification signature */ - u8 api_version_min;/* Minor version of API */ - u8 api_version_maj;/* Major version of API */ - u8 jump_slots; /* Number of jump slots */ - u8 reserved1; /* Reserved for expansion */ - u32 virtual_top; /* Hypervisor virtual address start */ - u16 reserved2; /* Reserved for expansion */ - u16 license_offs; /* Offset to License string */ - u16 pci_header_offs;/* Offset to PCI OPROM header */ - u16 pnp_header_offs;/* Offset to PnP OPROM header */ - u32 rom_pad3; /* PnP reserverd / VMI reserved */ - u8 reserved[96]; /* Reserved for headers */ - char vmi_init[8]; /* VMI_Init jump point */ - char get_reloc[8]; /* VMI_GetRelocationInfo jump point */ -} __attribute__((packed)); - -struct pnp_header { - char sig[4]; - char rev; - char size; - short next; - short res; - long devID; - unsigned short manufacturer_offset; - unsigned short product_offset; -} __attribute__((packed)); - -struct pci_header { - char sig[4]; - short vendorID; - short deviceID; - short vpdData; - short size; - char rev; - char class; - char subclass; - char interface; - short chunks; - char rom_version_min; - char rom_version_maj; - char codetype; - char lastRom; - short reserved; -} __attribute__((packed)); - -/* Function prototypes for bootstrapping */ -#ifdef CONFIG_VMI -extern void vmi_init(void); -extern void vmi_activate(void); -extern void vmi_bringup(void); -#else -static inline void vmi_init(void) {} -static inline void vmi_activate(void) {} -static inline void vmi_bringup(void) {} -#endif - -/* State needed to start an application processor in an SMP system. */ -struct vmi_ap_state { - u32 cr0; - u32 cr2; - u32 cr3; - u32 cr4; - - u64 efer; - - u32 eip; - u32 eflags; - u32 eax; - u32 ebx; - u32 ecx; - u32 edx; - u32 esp; - u32 ebp; - u32 esi; - u32 edi; - u16 cs; - u16 ss; - u16 ds; - u16 es; - u16 fs; - u16 gs; - u16 ldtr; - - u16 gdtr_limit; - u32 gdtr_base; - u32 idtr_base; - u16 idtr_limit; -}; - -#endif diff --git a/arch/x86/include/asm/vmi_time.h b/arch/x86/include/asm/vmi_time.h deleted file mode 100644 index c6e0bee93e3..00000000000 --- a/arch/x86/include/asm/vmi_time.h +++ /dev/null @@ -1,98 +0,0 @@ -/* - * VMI Time wrappers - * - * Copyright (C) 2006, VMware, Inc. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
- * - * Send feedback to dhecht@vmware.com - * - */ - -#ifndef _ASM_X86_VMI_TIME_H -#define _ASM_X86_VMI_TIME_H - -/* - * Raw VMI call indices for timer functions - */ -#define VMI_CALL_GetCycleFrequency 66 -#define VMI_CALL_GetCycleCounter 67 -#define VMI_CALL_SetAlarm 68 -#define VMI_CALL_CancelAlarm 69 -#define VMI_CALL_GetWallclockTime 70 -#define VMI_CALL_WallclockUpdated 71 - -/* Cached VMI timer operations */ -extern struct vmi_timer_ops { - u64 (*get_cycle_frequency)(void); - u64 (*get_cycle_counter)(int); - u64 (*get_wallclock)(void); - int (*wallclock_updated)(void); - void (*set_alarm)(u32 flags, u64 expiry, u64 period); - void (*cancel_alarm)(u32 flags); -} vmi_timer_ops; - -/* Prototypes */ -extern void __init vmi_time_init(void); -extern unsigned long vmi_get_wallclock(void); -extern int vmi_set_wallclock(unsigned long now); -extern unsigned long long vmi_sched_clock(void); -extern unsigned long vmi_tsc_khz(void); - -#ifdef CONFIG_X86_LOCAL_APIC -extern void __devinit vmi_time_bsp_init(void); -extern void __devinit vmi_time_ap_init(void); -#endif - -/* - * When run under a hypervisor, a vcpu is always in one of three states: - * running, halted, or ready. The vcpu is in the 'running' state if it - * is executing. When the vcpu executes the halt interface, the vcpu - * enters the 'halted' state and remains halted until there is some work - * pending for the vcpu (e.g. an alarm expires, host I/O completes on - * behalf of virtual I/O). At this point, the vcpu enters the 'ready' - * state (waiting for the hypervisor to reschedule it). Finally, at any - * time when the vcpu is not in the 'running' state nor the 'halted' - * state, it is in the 'ready' state. - * - * Real time is advances while the vcpu is 'running', 'ready', or - * 'halted'. Stolen time is the time in which the vcpu is in the - * 'ready' state. Available time is the remaining time -- the vcpu is - * either 'running' or 'halted'. - * - * All three views of time are accessible through the VMI cycle - * counters. - */ - -/* The cycle counters. 
*/ -#define VMI_CYCLES_REAL 0 -#define VMI_CYCLES_AVAILABLE 1 -#define VMI_CYCLES_STOLEN 2 - -/* The alarm interface 'flags' bits */ -#define VMI_ALARM_COUNTERS 2 - -#define VMI_ALARM_COUNTER_MASK 0x000000ff - -#define VMI_ALARM_WIRED_IRQ0 0x00000000 -#define VMI_ALARM_WIRED_LVTT 0x00010000 - -#define VMI_ALARM_IS_ONESHOT 0x00000000 -#define VMI_ALARM_IS_PERIODIC 0x00000100 - -#define CONFIG_VMI_ALARM_HZ 100 - -#endif /* _ASM_X86_VMI_TIME_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 0925676266b..801127cd975 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -91,7 +91,6 @@ obj-$(CONFIG_K8_NB) += k8.o obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o -obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o obj-$(CONFIG_KVM_GUEST) += kvm.o obj-$(CONFIG_KVM_CLOCK) += kvmclock.o obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index c3a4fbb2b99..feb4c21e499 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -83,7 +83,6 @@ #include #include #include -#include #include #include #include @@ -734,10 +733,10 @@ void __init setup_arch(char **cmdline_p) printk(KERN_INFO "Command line: %s\n", boot_command_line); #endif - /* VMI may relocate the fixmap; do this before touching ioremap area */ - vmi_init(); - - /* OFW also may relocate the fixmap */ + /* + * If we have OLPC OFW, we might end up relocating the fixmap due to + * reserve_top(), so do this before touching the ioremap area. + */ olpc_ofw_detect(); early_trap_init(); @@ -838,9 +837,6 @@ void __init setup_arch(char **cmdline_p) x86_report_nx(); - /* Must be before kernel pagetables are setup */ - vmi_activate(); - /* after early param, so could get panic from serial */ reserve_early_setup_data(); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 8b3bfc4dd70..63a1a5596ac 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -62,7 +62,6 @@ #include #include #include -#include #include #include #include @@ -311,7 +310,6 @@ notrace static void __cpuinit start_secondary(void *unused) __flush_tlb_all(); #endif - vmi_bringup(); cpu_init(); preempt_disable(); smp_callin(); diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c deleted file mode 100644 index ce9fbacb752..00000000000 --- a/arch/x86/kernel/vmi_32.c +++ /dev/null @@ -1,893 +0,0 @@ -/* - * VMI specific paravirt-ops implementation - * - * Copyright (C) 2005, VMware, Inc. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
- * - * Send feedback to zach@vmware.com - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* Convenient for calling VMI functions indirectly in the ROM */ -typedef u32 __attribute__((regparm(1))) (VROMFUNC)(void); -typedef u64 __attribute__((regparm(2))) (VROMLONGFUNC)(int); - -#define call_vrom_func(rom,func) \ - (((VROMFUNC *)(rom->func))()) - -#define call_vrom_long_func(rom,func,arg) \ - (((VROMLONGFUNC *)(rom->func)) (arg)) - -static struct vrom_header *vmi_rom; -static int disable_pge; -static int disable_pse; -static int disable_sep; -static int disable_tsc; -static int disable_mtrr; -static int disable_noidle; -static int disable_vmi_timer; - -/* Cached VMI operations */ -static struct { - void (*cpuid)(void /* non-c */); - void (*_set_ldt)(u32 selector); - void (*set_tr)(u32 selector); - void (*write_idt_entry)(struct desc_struct *, int, u32, u32); - void (*write_gdt_entry)(struct desc_struct *, int, u32, u32); - void (*write_ldt_entry)(struct desc_struct *, int, u32, u32); - void (*set_kernel_stack)(u32 selector, u32 sp0); - void (*allocate_page)(u32, u32, u32, u32, u32); - void (*release_page)(u32, u32); - void (*set_pte)(pte_t, pte_t *, unsigned); - void (*update_pte)(pte_t *, unsigned); - void (*set_linear_mapping)(int, void *, u32, u32); - void (*_flush_tlb)(int); - void (*set_initial_ap_state)(int, int); - void (*halt)(void); - void (*set_lazy_mode)(int mode); -} vmi_ops; - -/* Cached VMI operations */ -struct vmi_timer_ops vmi_timer_ops; - -/* - * VMI patching routines. - */ -#define MNEM_CALL 0xe8 -#define MNEM_JMP 0xe9 -#define MNEM_RET 0xc3 - -#define IRQ_PATCH_INT_MASK 0 -#define IRQ_PATCH_DISABLE 5 - -static inline void patch_offset(void *insnbuf, - unsigned long ip, unsigned long dest) -{ - *(unsigned long *)(insnbuf+1) = dest-ip-5; -} - -static unsigned patch_internal(int call, unsigned len, void *insnbuf, - unsigned long ip) -{ - u64 reloc; - struct vmi_relocation_info *const rel = (struct vmi_relocation_info *)&reloc; - reloc = call_vrom_long_func(vmi_rom, get_reloc, call); - switch(rel->type) { - case VMI_RELOCATION_CALL_REL: - BUG_ON(len < 5); - *(char *)insnbuf = MNEM_CALL; - patch_offset(insnbuf, ip, (unsigned long)rel->eip); - return 5; - - case VMI_RELOCATION_JUMP_REL: - BUG_ON(len < 5); - *(char *)insnbuf = MNEM_JMP; - patch_offset(insnbuf, ip, (unsigned long)rel->eip); - return 5; - - case VMI_RELOCATION_NOP: - /* obliterate the whole thing */ - return 0; - - case VMI_RELOCATION_NONE: - /* leave native code in place */ - break; - - default: - BUG(); - } - return len; -} - -/* - * Apply patch if appropriate, return length of new instruction - * sequence. The callee does nop padding for us. 
- */ -static unsigned vmi_patch(u8 type, u16 clobbers, void *insns, - unsigned long ip, unsigned len) -{ - switch (type) { - case PARAVIRT_PATCH(pv_irq_ops.irq_disable): - return patch_internal(VMI_CALL_DisableInterrupts, len, - insns, ip); - case PARAVIRT_PATCH(pv_irq_ops.irq_enable): - return patch_internal(VMI_CALL_EnableInterrupts, len, - insns, ip); - case PARAVIRT_PATCH(pv_irq_ops.restore_fl): - return patch_internal(VMI_CALL_SetInterruptMask, len, - insns, ip); - case PARAVIRT_PATCH(pv_irq_ops.save_fl): - return patch_internal(VMI_CALL_GetInterruptMask, len, - insns, ip); - case PARAVIRT_PATCH(pv_cpu_ops.iret): - return patch_internal(VMI_CALL_IRET, len, insns, ip); - case PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit): - return patch_internal(VMI_CALL_SYSEXIT, len, insns, ip); - default: - break; - } - return len; -} - -/* CPUID has non-C semantics, and paravirt-ops API doesn't match hardware ISA */ -static void vmi_cpuid(unsigned int *ax, unsigned int *bx, - unsigned int *cx, unsigned int *dx) -{ - int override = 0; - if (*ax == 1) - override = 1; - asm volatile ("call *%6" - : "=a" (*ax), - "=b" (*bx), - "=c" (*cx), - "=d" (*dx) - : "0" (*ax), "2" (*cx), "r" (vmi_ops.cpuid)); - if (override) { - if (disable_pse) - *dx &= ~X86_FEATURE_PSE; - if (disable_pge) - *dx &= ~X86_FEATURE_PGE; - if (disable_sep) - *dx &= ~X86_FEATURE_SEP; - if (disable_tsc) - *dx &= ~X86_FEATURE_TSC; - if (disable_mtrr) - *dx &= ~X86_FEATURE_MTRR; - } -} - -static inline void vmi_maybe_load_tls(struct desc_struct *gdt, int nr, struct desc_struct *new) -{ - if (gdt[nr].a != new->a || gdt[nr].b != new->b) - write_gdt_entry(gdt, nr, new, 0); -} - -static void vmi_load_tls(struct thread_struct *t, unsigned int cpu) -{ - struct desc_struct *gdt = get_cpu_gdt_table(cpu); - vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 0, &t->tls_array[0]); - vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 1, &t->tls_array[1]); - vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 2, &t->tls_array[2]); -} - -static void vmi_set_ldt(const void *addr, unsigned entries) -{ - unsigned cpu = smp_processor_id(); - struct desc_struct desc; - - pack_descriptor(&desc, (unsigned long)addr, - entries * sizeof(struct desc_struct) - 1, - DESC_LDT, 0); - write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, &desc, DESC_LDT); - vmi_ops._set_ldt(entries ? 
GDT_ENTRY_LDT*sizeof(struct desc_struct) : 0); -} - -static void vmi_set_tr(void) -{ - vmi_ops.set_tr(GDT_ENTRY_TSS*sizeof(struct desc_struct)); -} - -static void vmi_write_idt_entry(gate_desc *dt, int entry, const gate_desc *g) -{ - u32 *idt_entry = (u32 *)g; - vmi_ops.write_idt_entry(dt, entry, idt_entry[0], idt_entry[1]); -} - -static void vmi_write_gdt_entry(struct desc_struct *dt, int entry, - const void *desc, int type) -{ - u32 *gdt_entry = (u32 *)desc; - vmi_ops.write_gdt_entry(dt, entry, gdt_entry[0], gdt_entry[1]); -} - -static void vmi_write_ldt_entry(struct desc_struct *dt, int entry, - const void *desc) -{ - u32 *ldt_entry = (u32 *)desc; - vmi_ops.write_ldt_entry(dt, entry, ldt_entry[0], ldt_entry[1]); -} - -static void vmi_load_sp0(struct tss_struct *tss, - struct thread_struct *thread) -{ - tss->x86_tss.sp0 = thread->sp0; - - /* This can only happen when SEP is enabled, no need to test "SEP"arately */ - if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) { - tss->x86_tss.ss1 = thread->sysenter_cs; - wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0); - } - vmi_ops.set_kernel_stack(__KERNEL_DS, tss->x86_tss.sp0); -} - -static void vmi_flush_tlb_user(void) -{ - vmi_ops._flush_tlb(VMI_FLUSH_TLB); -} - -static void vmi_flush_tlb_kernel(void) -{ - vmi_ops._flush_tlb(VMI_FLUSH_TLB | VMI_FLUSH_GLOBAL); -} - -/* Stub to do nothing at all; used for delays and unimplemented calls */ -static void vmi_nop(void) -{ -} - -static void vmi_allocate_pte(struct mm_struct *mm, unsigned long pfn) -{ - vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0); -} - -static void vmi_allocate_pmd(struct mm_struct *mm, unsigned long pfn) -{ - /* - * This call comes in very early, before mem_map is setup. - * It is called only for swapper_pg_dir, which already has - * data on it. - */ - vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0); -} - -static void vmi_allocate_pmd_clone(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count) -{ - vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count); -} - -static void vmi_release_pte(unsigned long pfn) -{ - vmi_ops.release_page(pfn, VMI_PAGE_L1); -} - -static void vmi_release_pmd(unsigned long pfn) -{ - vmi_ops.release_page(pfn, VMI_PAGE_L2); -} - -/* - * We use the pgd_free hook for releasing the pgd page: - */ -static void vmi_pgd_free(struct mm_struct *mm, pgd_t *pgd) -{ - unsigned long pfn = __pa(pgd) >> PAGE_SHIFT; - - vmi_ops.release_page(pfn, VMI_PAGE_L2); -} - -/* - * Helper macros for MMU update flags. We can defer updates until a flush - * or page invalidation only if the update is to the current address space - * (otherwise, there is no flush). We must check against init_mm, since - * this could be a kernel update, which usually passes init_mm, although - * sometimes this check can be skipped if we know the particular function - * is only called on user mode PTEs. We could change the kernel to pass - * current->active_mm here, but in particular, I was unsure if changing - * mm/highmem.c to do this would still be correct on other architectures. - */ -#define is_current_as(mm, mustbeuser) ((mm) == current->active_mm || \ - (!mustbeuser && (mm) == &init_mm)) -#define vmi_flags_addr(mm, addr, level, user) \ - ((level) | (is_current_as(mm, user) ? \ - (VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0)) -#define vmi_flags_addr_defer(mm, addr, level, user) \ - ((level) | (is_current_as(mm, user) ? 
\ - (VMI_PAGE_DEFER | VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0)) - -static void vmi_update_pte(struct mm_struct *mm, unsigned long addr, pte_t *ptep) -{ - vmi_ops.update_pte(ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0)); -} - -static void vmi_update_pte_defer(struct mm_struct *mm, unsigned long addr, pte_t *ptep) -{ - vmi_ops.update_pte(ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 0)); -} - -static void vmi_set_pte(pte_t *ptep, pte_t pte) -{ - /* XXX because of set_pmd_pte, this can be called on PT or PD layers */ - vmi_ops.set_pte(pte, ptep, VMI_PAGE_PT); -} - -static void vmi_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) -{ - vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0)); -} - -static void vmi_set_pmd(pmd_t *pmdp, pmd_t pmdval) -{ -#ifdef CONFIG_X86_PAE - const pte_t pte = { .pte = pmdval.pmd }; -#else - const pte_t pte = { pmdval.pud.pgd.pgd }; -#endif - vmi_ops.set_pte(pte, (pte_t *)pmdp, VMI_PAGE_PD); -} - -#ifdef CONFIG_X86_PAE - -static void vmi_set_pte_atomic(pte_t *ptep, pte_t pteval) -{ - /* - * XXX This is called from set_pmd_pte, but at both PT - * and PD layers so the VMI_PAGE_PT flag is wrong. But - * it is only called for large page mapping changes, - * the Xen backend, doesn't support large pages, and the - * ESX backend doesn't depend on the flag. - */ - set_64bit((unsigned long long *)ptep,pte_val(pteval)); - vmi_ops.update_pte(ptep, VMI_PAGE_PT); -} - -static void vmi_set_pud(pud_t *pudp, pud_t pudval) -{ - /* Um, eww */ - const pte_t pte = { .pte = pudval.pgd.pgd }; - vmi_ops.set_pte(pte, (pte_t *)pudp, VMI_PAGE_PDP); -} - -static void vmi_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) -{ - const pte_t pte = { .pte = 0 }; - vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0)); -} - -static void vmi_pmd_clear(pmd_t *pmd) -{ - const pte_t pte = { .pte = 0 }; - vmi_ops.set_pte(pte, (pte_t *)pmd, VMI_PAGE_PD); -} -#endif - -#ifdef CONFIG_SMP -static void __devinit -vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip, - unsigned long start_esp) -{ - struct vmi_ap_state ap; - - /* Default everything to zero. This is fine for most GPRs. */ - memset(&ap, 0, sizeof(struct vmi_ap_state)); - - ap.gdtr_limit = GDT_SIZE - 1; - ap.gdtr_base = (unsigned long) get_cpu_gdt_table(phys_apicid); - - ap.idtr_limit = IDT_ENTRIES * 8 - 1; - ap.idtr_base = (unsigned long) idt_table; - - ap.ldtr = 0; - - ap.cs = __KERNEL_CS; - ap.eip = (unsigned long) start_eip; - ap.ss = __KERNEL_DS; - ap.esp = (unsigned long) start_esp; - - ap.ds = __USER_DS; - ap.es = __USER_DS; - ap.fs = __KERNEL_PERCPU; - ap.gs = __KERNEL_STACK_CANARY; - - ap.eflags = 0; - -#ifdef CONFIG_X86_PAE - /* efer should match BSP efer. */ - if (cpu_has_nx) { - unsigned l, h; - rdmsr(MSR_EFER, l, h); - ap.efer = (unsigned long long) h << 32 | l; - } -#endif - - ap.cr3 = __pa(swapper_pg_dir); - /* Protected mode, paging, AM, WP, NE, MP. 
*/ - ap.cr0 = 0x80050023; - ap.cr4 = mmu_cr4_features; - vmi_ops.set_initial_ap_state((u32)&ap, phys_apicid); -} -#endif - -static void vmi_start_context_switch(struct task_struct *prev) -{ - paravirt_start_context_switch(prev); - vmi_ops.set_lazy_mode(2); -} - -static void vmi_end_context_switch(struct task_struct *next) -{ - vmi_ops.set_lazy_mode(0); - paravirt_end_context_switch(next); -} - -static void vmi_enter_lazy_mmu(void) -{ - paravirt_enter_lazy_mmu(); - vmi_ops.set_lazy_mode(1); -} - -static void vmi_leave_lazy_mmu(void) -{ - vmi_ops.set_lazy_mode(0); - paravirt_leave_lazy_mmu(); -} - -static inline int __init check_vmi_rom(struct vrom_header *rom) -{ - struct pci_header *pci; - struct pnp_header *pnp; - const char *manufacturer = "UNKNOWN"; - const char *product = "UNKNOWN"; - const char *license = "unspecified"; - - if (rom->rom_signature != 0xaa55) - return 0; - if (rom->vrom_signature != VMI_SIGNATURE) - return 0; - if (rom->api_version_maj != VMI_API_REV_MAJOR || - rom->api_version_min+1 < VMI_API_REV_MINOR+1) { - printk(KERN_WARNING "VMI: Found mismatched rom version %d.%d\n", - rom->api_version_maj, - rom->api_version_min); - return 0; - } - - /* - * Relying on the VMI_SIGNATURE field is not 100% safe, so check - * the PCI header and device type to make sure this is really a - * VMI device. - */ - if (!rom->pci_header_offs) { - printk(KERN_WARNING "VMI: ROM does not contain PCI header.\n"); - return 0; - } - - pci = (struct pci_header *)((char *)rom+rom->pci_header_offs); - if (pci->vendorID != PCI_VENDOR_ID_VMWARE || - pci->deviceID != PCI_DEVICE_ID_VMWARE_VMI) { - /* Allow it to run... anyways, but warn */ - printk(KERN_WARNING "VMI: ROM from unknown manufacturer\n"); - } - - if (rom->pnp_header_offs) { - pnp = (struct pnp_header *)((char *)rom+rom->pnp_header_offs); - if (pnp->manufacturer_offset) - manufacturer = (const char *)rom+pnp->manufacturer_offset; - if (pnp->product_offset) - product = (const char *)rom+pnp->product_offset; - } - - if (rom->license_offs) - license = (char *)rom+rom->license_offs; - - printk(KERN_INFO "VMI: Found %s %s, API version %d.%d, ROM version %d.%d\n", - manufacturer, product, - rom->api_version_maj, rom->api_version_min, - pci->rom_version_maj, pci->rom_version_min); - - /* Don't allow BSD/MIT here for now because we don't want to end up - with any binary only shim layers */ - if (strcmp(license, "GPL") && strcmp(license, "GPL v2")) { - printk(KERN_WARNING "VMI: Non GPL license `%s' found for ROM. 
Not used.\n", - license); - return 0; - } - - return 1; -} - -/* - * Probe for the VMI option ROM - */ -static inline int __init probe_vmi_rom(void) -{ - unsigned long base; - - /* VMI ROM is in option ROM area, check signature */ - for (base = 0xC0000; base < 0xE0000; base += 2048) { - struct vrom_header *romstart; - romstart = (struct vrom_header *)isa_bus_to_virt(base); - if (check_vmi_rom(romstart)) { - vmi_rom = romstart; - return 1; - } - } - return 0; -} - -/* - * VMI setup common to all processors - */ -void vmi_bringup(void) -{ - /* We must establish the lowmem mapping for MMU ops to work */ - if (vmi_ops.set_linear_mapping) - vmi_ops.set_linear_mapping(0, (void *)__PAGE_OFFSET, MAXMEM_PFN, 0); -} - -/* - * Return a pointer to a VMI function or NULL if unimplemented - */ -static void *vmi_get_function(int vmicall) -{ - u64 reloc; - const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc; - reloc = call_vrom_long_func(vmi_rom, get_reloc, vmicall); - BUG_ON(rel->type == VMI_RELOCATION_JUMP_REL); - if (rel->type == VMI_RELOCATION_CALL_REL) - return (void *)rel->eip; - else - return NULL; -} - -/* - * Helper macro for making the VMI paravirt-ops fill code readable. - * For unimplemented operations, fall back to default, unless nop - * is returned by the ROM. - */ -#define para_fill(opname, vmicall) \ -do { \ - reloc = call_vrom_long_func(vmi_rom, get_reloc, \ - VMI_CALL_##vmicall); \ - if (rel->type == VMI_RELOCATION_CALL_REL) \ - opname = (void *)rel->eip; \ - else if (rel->type == VMI_RELOCATION_NOP) \ - opname = (void *)vmi_nop; \ - else if (rel->type != VMI_RELOCATION_NONE) \ - printk(KERN_WARNING "VMI: Unknown relocation " \ - "type %d for " #vmicall"\n",\ - rel->type); \ -} while (0) - -/* - * Helper macro for making the VMI paravirt-ops fill code readable. - * For cached operations which do not match the VMI ROM ABI and must - * go through a tranlation stub. Ignore NOPs, since it is not clear - * a NOP * VMI function corresponds to a NOP paravirt-op when the - * functions are not in 1-1 correspondence. - */ -#define para_wrap(opname, wrapper, cache, vmicall) \ -do { \ - reloc = call_vrom_long_func(vmi_rom, get_reloc, \ - VMI_CALL_##vmicall); \ - BUG_ON(rel->type == VMI_RELOCATION_JUMP_REL); \ - if (rel->type == VMI_RELOCATION_CALL_REL) { \ - opname = wrapper; \ - vmi_ops.cache = (void *)rel->eip; \ - } \ -} while (0) - -/* - * Activate the VMI interface and switch into paravirtualized mode - */ -static inline int __init activate_vmi(void) -{ - short kernel_cs; - u64 reloc; - const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc; - - /* - * Prevent page tables from being allocated in highmem, even if - * CONFIG_HIGHPTE is enabled. - */ - __userpte_alloc_gfp &= ~__GFP_HIGHMEM; - - if (call_vrom_func(vmi_rom, vmi_init) != 0) { - printk(KERN_ERR "VMI ROM failed to initialize!"); - return 0; - } - savesegment(cs, kernel_cs); - - pv_info.paravirt_enabled = 1; - pv_info.kernel_rpl = kernel_cs & SEGMENT_RPL_MASK; - pv_info.name = "vmi [deprecated]"; - - pv_init_ops.patch = vmi_patch; - - /* - * Many of these operations are ABI compatible with VMI. - * This means we can fill in the paravirt-ops with direct - * pointers into the VMI ROM. If the calling convention for - * these operations changes, this code needs to be updated. - * - * Exceptions - * CPUID paravirt-op uses pointers, not the native ISA - * halt has no VMI equivalent; all VMI halts are "safe" - * no MSR support yet - just trap and emulate. 
VMI uses the - * same ABI as the native ISA, but Linux wants exceptions - * from bogus MSR read / write handled - * rdpmc is not yet used in Linux - */ - - /* CPUID is special, so very special it gets wrapped like a present */ - para_wrap(pv_cpu_ops.cpuid, vmi_cpuid, cpuid, CPUID); - - para_fill(pv_cpu_ops.clts, CLTS); - para_fill(pv_cpu_ops.get_debugreg, GetDR); - para_fill(pv_cpu_ops.set_debugreg, SetDR); - para_fill(pv_cpu_ops.read_cr0, GetCR0); - para_fill(pv_mmu_ops.read_cr2, GetCR2); - para_fill(pv_mmu_ops.read_cr3, GetCR3); - para_fill(pv_cpu_ops.read_cr4, GetCR4); - para_fill(pv_cpu_ops.write_cr0, SetCR0); - para_fill(pv_mmu_ops.write_cr2, SetCR2); - para_fill(pv_mmu_ops.write_cr3, SetCR3); - para_fill(pv_cpu_ops.write_cr4, SetCR4); - - para_fill(pv_irq_ops.save_fl.func, GetInterruptMask); - para_fill(pv_irq_ops.restore_fl.func, SetInterruptMask); - para_fill(pv_irq_ops.irq_disable.func, DisableInterrupts); - para_fill(pv_irq_ops.irq_enable.func, EnableInterrupts); - - para_fill(pv_cpu_ops.wbinvd, WBINVD); - para_fill(pv_cpu_ops.read_tsc, RDTSC); - - /* The following we emulate with trap and emulate for now */ - /* paravirt_ops.read_msr = vmi_rdmsr */ - /* paravirt_ops.write_msr = vmi_wrmsr */ - /* paravirt_ops.rdpmc = vmi_rdpmc */ - - /* TR interface doesn't pass TR value, wrap */ - para_wrap(pv_cpu_ops.load_tr_desc, vmi_set_tr, set_tr, SetTR); - - /* LDT is special, too */ - para_wrap(pv_cpu_ops.set_ldt, vmi_set_ldt, _set_ldt, SetLDT); - - para_fill(pv_cpu_ops.load_gdt, SetGDT); - para_fill(pv_cpu_ops.load_idt, SetIDT); - para_fill(pv_cpu_ops.store_gdt, GetGDT); - para_fill(pv_cpu_ops.store_idt, GetIDT); - para_fill(pv_cpu_ops.store_tr, GetTR); - pv_cpu_ops.load_tls = vmi_load_tls; - para_wrap(pv_cpu_ops.write_ldt_entry, vmi_write_ldt_entry, - write_ldt_entry, WriteLDTEntry); - para_wrap(pv_cpu_ops.write_gdt_entry, vmi_write_gdt_entry, - write_gdt_entry, WriteGDTEntry); - para_wrap(pv_cpu_ops.write_idt_entry, vmi_write_idt_entry, - write_idt_entry, WriteIDTEntry); - para_wrap(pv_cpu_ops.load_sp0, vmi_load_sp0, set_kernel_stack, UpdateKernelStack); - para_fill(pv_cpu_ops.set_iopl_mask, SetIOPLMask); - para_fill(pv_cpu_ops.io_delay, IODelay); - - para_wrap(pv_cpu_ops.start_context_switch, vmi_start_context_switch, - set_lazy_mode, SetLazyMode); - para_wrap(pv_cpu_ops.end_context_switch, vmi_end_context_switch, - set_lazy_mode, SetLazyMode); - - para_wrap(pv_mmu_ops.lazy_mode.enter, vmi_enter_lazy_mmu, - set_lazy_mode, SetLazyMode); - para_wrap(pv_mmu_ops.lazy_mode.leave, vmi_leave_lazy_mmu, - set_lazy_mode, SetLazyMode); - - /* user and kernel flush are just handled with different flags to FlushTLB */ - para_wrap(pv_mmu_ops.flush_tlb_user, vmi_flush_tlb_user, _flush_tlb, FlushTLB); - para_wrap(pv_mmu_ops.flush_tlb_kernel, vmi_flush_tlb_kernel, _flush_tlb, FlushTLB); - para_fill(pv_mmu_ops.flush_tlb_single, InvalPage); - - /* - * Until a standard flag format can be agreed on, we need to - * implement these as wrappers in Linux. Get the VMI ROM - * function pointers for the two backend calls. 
- */ -#ifdef CONFIG_X86_PAE - vmi_ops.set_pte = vmi_get_function(VMI_CALL_SetPxELong); - vmi_ops.update_pte = vmi_get_function(VMI_CALL_UpdatePxELong); -#else - vmi_ops.set_pte = vmi_get_function(VMI_CALL_SetPxE); - vmi_ops.update_pte = vmi_get_function(VMI_CALL_UpdatePxE); -#endif - - if (vmi_ops.set_pte) { - pv_mmu_ops.set_pte = vmi_set_pte; - pv_mmu_ops.set_pte_at = vmi_set_pte_at; - pv_mmu_ops.set_pmd = vmi_set_pmd; -#ifdef CONFIG_X86_PAE - pv_mmu_ops.set_pte_atomic = vmi_set_pte_atomic; - pv_mmu_ops.set_pud = vmi_set_pud; - pv_mmu_ops.pte_clear = vmi_pte_clear; - pv_mmu_ops.pmd_clear = vmi_pmd_clear; -#endif - } - - if (vmi_ops.update_pte) { - pv_mmu_ops.pte_update = vmi_update_pte; - pv_mmu_ops.pte_update_defer = vmi_update_pte_defer; - } - - vmi_ops.allocate_page = vmi_get_function(VMI_CALL_AllocatePage); - if (vmi_ops.allocate_page) { - pv_mmu_ops.alloc_pte = vmi_allocate_pte; - pv_mmu_ops.alloc_pmd = vmi_allocate_pmd; - pv_mmu_ops.alloc_pmd_clone = vmi_allocate_pmd_clone; - } - - vmi_ops.release_page = vmi_get_function(VMI_CALL_ReleasePage); - if (vmi_ops.release_page) { - pv_mmu_ops.release_pte = vmi_release_pte; - pv_mmu_ops.release_pmd = vmi_release_pmd; - pv_mmu_ops.pgd_free = vmi_pgd_free; - } - - /* Set linear is needed in all cases */ - vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping); - - /* - * These MUST always be patched. Don't support indirect jumps - * through these operations, as the VMI interface may use either - * a jump or a call to get to these operations, depending on - * the backend. They are performance critical anyway, so requiring - * a patch is not a big problem. - */ - pv_cpu_ops.irq_enable_sysexit = (void *)0xfeedbab0; - pv_cpu_ops.iret = (void *)0xbadbab0; - -#ifdef CONFIG_SMP - para_wrap(pv_apic_ops.startup_ipi_hook, vmi_startup_ipi_hook, set_initial_ap_state, SetInitialAPState); -#endif - -#ifdef CONFIG_X86_LOCAL_APIC - para_fill(apic->read, APICRead); - para_fill(apic->write, APICWrite); -#endif - - /* - * Check for VMI timer functionality by probing for a cycle frequency method - */ - reloc = call_vrom_long_func(vmi_rom, get_reloc, VMI_CALL_GetCycleFrequency); - if (!disable_vmi_timer && rel->type != VMI_RELOCATION_NONE) { - vmi_timer_ops.get_cycle_frequency = (void *)rel->eip; - vmi_timer_ops.get_cycle_counter = - vmi_get_function(VMI_CALL_GetCycleCounter); - vmi_timer_ops.get_wallclock = - vmi_get_function(VMI_CALL_GetWallclockTime); - vmi_timer_ops.wallclock_updated = - vmi_get_function(VMI_CALL_WallclockUpdated); - vmi_timer_ops.set_alarm = vmi_get_function(VMI_CALL_SetAlarm); - vmi_timer_ops.cancel_alarm = - vmi_get_function(VMI_CALL_CancelAlarm); - x86_init.timers.timer_init = vmi_time_init; -#ifdef CONFIG_X86_LOCAL_APIC - x86_init.timers.setup_percpu_clockev = vmi_time_bsp_init; - x86_cpuinit.setup_percpu_clockev = vmi_time_ap_init; -#endif - pv_time_ops.sched_clock = vmi_sched_clock; - x86_platform.calibrate_tsc = vmi_tsc_khz; - x86_platform.get_wallclock = vmi_get_wallclock; - x86_platform.set_wallclock = vmi_set_wallclock; - - /* We have true wallclock functions; disable CMOS clock sync */ - no_sync_cmos_clock = 1; - } else { - disable_noidle = 1; - disable_vmi_timer = 1; - } - - para_fill(pv_irq_ops.safe_halt, Halt); - - /* - * Alternative instruction rewriting doesn't happen soon enough - * to convert VMI_IRET to a call instead of a jump; so we have - * to do this before IRQs get reenabled. Fortunately, it is - * idempotent. 
- */ - apply_paravirt(__parainstructions, __parainstructions_end); - - vmi_bringup(); - - return 1; -} - -#undef para_fill - -void __init vmi_init(void) -{ - if (!vmi_rom) - probe_vmi_rom(); - else - check_vmi_rom(vmi_rom); - - /* In case probing for or validating the ROM failed, basil */ - if (!vmi_rom) - return; - - reserve_top_address(-vmi_rom->virtual_top); - -#ifdef CONFIG_X86_IO_APIC - /* This is virtual hardware; timer routing is wired correctly */ - no_timer_check = 1; -#endif -} - -void __init vmi_activate(void) -{ - unsigned long flags; - - if (!vmi_rom) - return; - - local_irq_save(flags); - activate_vmi(); - local_irq_restore(flags & X86_EFLAGS_IF); -} - -static int __init parse_vmi(char *arg) -{ - if (!arg) - return -EINVAL; - - if (!strcmp(arg, "disable_pge")) { - clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE); - disable_pge = 1; - } else if (!strcmp(arg, "disable_pse")) { - clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PSE); - disable_pse = 1; - } else if (!strcmp(arg, "disable_sep")) { - clear_cpu_cap(&boot_cpu_data, X86_FEATURE_SEP); - disable_sep = 1; - } else if (!strcmp(arg, "disable_tsc")) { - clear_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC); - disable_tsc = 1; - } else if (!strcmp(arg, "disable_mtrr")) { - clear_cpu_cap(&boot_cpu_data, X86_FEATURE_MTRR); - disable_mtrr = 1; - } else if (!strcmp(arg, "disable_timer")) { - disable_vmi_timer = 1; - disable_noidle = 1; - } else if (!strcmp(arg, "disable_noidle")) - disable_noidle = 1; - return 0; -} - -early_param("vmi", parse_vmi); diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c deleted file mode 100644 index 5e1ff66ecd7..00000000000 --- a/arch/x86/kernel/vmiclock_32.c +++ /dev/null @@ -1,317 +0,0 @@ -/* - * VMI paravirtual timer support routines. - * - * Copyright (C) 2007, VMware, Inc. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - */ - -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#define VMI_ONESHOT (VMI_ALARM_IS_ONESHOT | VMI_CYCLES_REAL | vmi_get_alarm_wiring()) -#define VMI_PERIODIC (VMI_ALARM_IS_PERIODIC | VMI_CYCLES_REAL | vmi_get_alarm_wiring()) - -static DEFINE_PER_CPU(struct clock_event_device, local_events); - -static inline u32 vmi_counter(u32 flags) -{ - /* Given VMI_ONESHOT or VMI_PERIODIC, return the corresponding - * cycle counter. 
*/ - return flags & VMI_ALARM_COUNTER_MASK; -} - -/* paravirt_ops.get_wallclock = vmi_get_wallclock */ -unsigned long vmi_get_wallclock(void) -{ - unsigned long long wallclock; - wallclock = vmi_timer_ops.get_wallclock(); // nsec - (void)do_div(wallclock, 1000000000); // sec - - return wallclock; -} - -/* paravirt_ops.set_wallclock = vmi_set_wallclock */ -int vmi_set_wallclock(unsigned long now) -{ - return 0; -} - -/* paravirt_ops.sched_clock = vmi_sched_clock */ -unsigned long long vmi_sched_clock(void) -{ - return cycles_2_ns(vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE)); -} - -/* x86_platform.calibrate_tsc = vmi_tsc_khz */ -unsigned long vmi_tsc_khz(void) -{ - unsigned long long khz; - khz = vmi_timer_ops.get_cycle_frequency(); - (void)do_div(khz, 1000); - return khz; -} - -static inline unsigned int vmi_get_timer_vector(void) -{ - return IRQ0_VECTOR; -} - -/** vmi clockchip */ -#ifdef CONFIG_X86_LOCAL_APIC -static unsigned int startup_timer_irq(unsigned int irq) -{ - unsigned long val = apic_read(APIC_LVTT); - apic_write(APIC_LVTT, vmi_get_timer_vector()); - - return (val & APIC_SEND_PENDING); -} - -static void mask_timer_irq(unsigned int irq) -{ - unsigned long val = apic_read(APIC_LVTT); - apic_write(APIC_LVTT, val | APIC_LVT_MASKED); -} - -static void unmask_timer_irq(unsigned int irq) -{ - unsigned long val = apic_read(APIC_LVTT); - apic_write(APIC_LVTT, val & ~APIC_LVT_MASKED); -} - -static void ack_timer_irq(unsigned int irq) -{ - ack_APIC_irq(); -} - -static struct irq_chip vmi_chip __read_mostly = { - .name = "VMI-LOCAL", - .startup = startup_timer_irq, - .mask = mask_timer_irq, - .unmask = unmask_timer_irq, - .ack = ack_timer_irq -}; -#endif - -/** vmi clockevent */ -#define VMI_ALARM_WIRED_IRQ0 0x00000000 -#define VMI_ALARM_WIRED_LVTT 0x00010000 -static int vmi_wiring = VMI_ALARM_WIRED_IRQ0; - -static inline int vmi_get_alarm_wiring(void) -{ - return vmi_wiring; -} - -static void vmi_timer_set_mode(enum clock_event_mode mode, - struct clock_event_device *evt) -{ - cycle_t now, cycles_per_hz; - BUG_ON(!irqs_disabled()); - - switch (mode) { - case CLOCK_EVT_MODE_ONESHOT: - case CLOCK_EVT_MODE_RESUME: - break; - case CLOCK_EVT_MODE_PERIODIC: - cycles_per_hz = vmi_timer_ops.get_cycle_frequency(); - (void)do_div(cycles_per_hz, HZ); - now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_PERIODIC)); - vmi_timer_ops.set_alarm(VMI_PERIODIC, now, cycles_per_hz); - break; - case CLOCK_EVT_MODE_UNUSED: - case CLOCK_EVT_MODE_SHUTDOWN: - switch (evt->mode) { - case CLOCK_EVT_MODE_ONESHOT: - vmi_timer_ops.cancel_alarm(VMI_ONESHOT); - break; - case CLOCK_EVT_MODE_PERIODIC: - vmi_timer_ops.cancel_alarm(VMI_PERIODIC); - break; - default: - break; - } - break; - default: - break; - } -} - -static int vmi_timer_next_event(unsigned long delta, - struct clock_event_device *evt) -{ - /* Unfortunately, set_next_event interface only passes relative - * expiry, but we want absolute expiry. It'd be better if were - * were passed an absolute expiry, since a bunch of time may - * have been stolen between the time the delta is computed and - * when we set the alarm below. 
*/ - cycle_t now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_ONESHOT)); - - BUG_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT); - vmi_timer_ops.set_alarm(VMI_ONESHOT, now + delta, 0); - return 0; -} - -static struct clock_event_device vmi_clockevent = { - .name = "vmi-timer", - .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, - .shift = 22, - .set_mode = vmi_timer_set_mode, - .set_next_event = vmi_timer_next_event, - .rating = 1000, - .irq = 0, -}; - -static irqreturn_t vmi_timer_interrupt(int irq, void *dev_id) -{ - struct clock_event_device *evt = &__get_cpu_var(local_events); - evt->event_handler(evt); - return IRQ_HANDLED; -} - -static struct irqaction vmi_clock_action = { - .name = "vmi-timer", - .handler = vmi_timer_interrupt, - .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_TIMER, -}; - -static void __devinit vmi_time_init_clockevent(void) -{ - cycle_t cycles_per_msec; - struct clock_event_device *evt; - - int cpu = smp_processor_id(); - evt = &__get_cpu_var(local_events); - - /* Use cycles_per_msec since div_sc params are 32-bits. */ - cycles_per_msec = vmi_timer_ops.get_cycle_frequency(); - (void)do_div(cycles_per_msec, 1000); - - memcpy(evt, &vmi_clockevent, sizeof(*evt)); - /* Must pick .shift such that .mult fits in 32-bits. Choosing - * .shift to be 22 allows 2^(32-22) cycles per nano-seconds - * before overflow. */ - evt->mult = div_sc(cycles_per_msec, NSEC_PER_MSEC, evt->shift); - /* Upper bound is clockevent's use of ulong for cycle deltas. */ - evt->max_delta_ns = clockevent_delta2ns(ULONG_MAX, evt); - evt->min_delta_ns = clockevent_delta2ns(1, evt); - evt->cpumask = cpumask_of(cpu); - - printk(KERN_WARNING "vmi: registering clock event %s. mult=%u shift=%u\n", - evt->name, evt->mult, evt->shift); - clockevents_register_device(evt); -} - -void __init vmi_time_init(void) -{ - unsigned int cpu; - /* Disable PIT: BIOSes start PIT CH0 with 18.2hz peridic. */ - outb_pit(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */ - - vmi_time_init_clockevent(); - setup_irq(0, &vmi_clock_action); - for_each_possible_cpu(cpu) - per_cpu(vector_irq, cpu)[vmi_get_timer_vector()] = 0; -} - -#ifdef CONFIG_X86_LOCAL_APIC -void __devinit vmi_time_bsp_init(void) -{ - /* - * On APIC systems, we want local timers to fire on each cpu. We do - * this by programming LVTT to deliver timer events to the IRQ handler - * for IRQ-0, since we can't re-use the APIC local timer handler - * without interfering with that code. - */ - clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); - local_irq_disable(); -#ifdef CONFIG_SMP - /* - * XXX handle_percpu_irq only defined for SMP; we need to switch over - * to using it, since this is a local interrupt, which each CPU must - * handle individually without locking out or dropping simultaneous - * local timers on other CPUs. We also don't want to trigger the - * quirk workaround code for interrupts which gets invoked from - * handle_percpu_irq via eoi, so we use our own IRQ chip. 
- */ - set_irq_chip_and_handler_name(0, &vmi_chip, handle_percpu_irq, "lvtt"); -#else - set_irq_chip_and_handler_name(0, &vmi_chip, handle_edge_irq, "lvtt"); -#endif - vmi_wiring = VMI_ALARM_WIRED_LVTT; - apic_write(APIC_LVTT, vmi_get_timer_vector()); - local_irq_enable(); - clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL); -} - -void __devinit vmi_time_ap_init(void) -{ - vmi_time_init_clockevent(); - apic_write(APIC_LVTT, vmi_get_timer_vector()); -} -#endif - -/** vmi clocksource */ -static struct clocksource clocksource_vmi; - -static cycle_t read_real_cycles(struct clocksource *cs) -{ - cycle_t ret = (cycle_t)vmi_timer_ops.get_cycle_counter(VMI_CYCLES_REAL); - return max(ret, clocksource_vmi.cycle_last); -} - -static struct clocksource clocksource_vmi = { - .name = "vmi-timer", - .rating = 450, - .read = read_real_cycles, - .mask = CLOCKSOURCE_MASK(64), - .mult = 0, /* to be set */ - .shift = 22, - .flags = CLOCK_SOURCE_IS_CONTINUOUS, -}; - -static int __init init_vmi_clocksource(void) -{ - cycle_t cycles_per_msec; - - if (!vmi_timer_ops.get_cycle_frequency) - return 0; - /* Use khz2mult rather than hz2mult since hz arg is only 32-bits. */ - cycles_per_msec = vmi_timer_ops.get_cycle_frequency(); - (void)do_div(cycles_per_msec, 1000); - - /* Note that clocksource.{mult, shift} converts in the opposite direction - * as clockevents. */ - clocksource_vmi.mult = clocksource_khz2mult(cycles_per_msec, - clocksource_vmi.shift); - - printk(KERN_WARNING "vmi: registering clock source khz=%lld\n", cycles_per_msec); - return clocksource_register(&clocksource_vmi); - -} -module_init(init_vmi_clocksource); -- cgit v1.2.3-18-g5258 From b0f4c062fb6dd4c02b1fe6de73319ed50a09b27d Mon Sep 17 00:00:00 2001 From: Alok Kataria Date: Mon, 23 Aug 2010 17:05:57 -0700 Subject: x86, paravirt: Remove alloc_pmd_clone hook, only used by VMI VMI was the only user of the alloc_pmd_clone hook, given that VMI is now removed we can also remove this hook. Signed-off-by: Alok N Kataria LKML-Reference: <1282608357.19396.36.camel@ank32.eng.vmware.com> Cc: Jeremy Fitzhardinge Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/paravirt.h | 5 ----- arch/x86/include/asm/paravirt_types.h | 1 - arch/x86/kernel/paravirt.c | 1 - arch/x86/mm/pgtable.c | 4 ---- arch/x86/xen/mmu.c | 1 - 5 files changed, 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 5653f43d90e..edecb4ed221 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -416,11 +416,6 @@ static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn) PVOP_VCALL2(pv_mmu_ops.alloc_pmd, mm, pfn); } -static inline void paravirt_alloc_pmd_clone(unsigned long pfn, unsigned long clonepfn, - unsigned long start, unsigned long count) -{ - PVOP_VCALL4(pv_mmu_ops.alloc_pmd_clone, pfn, clonepfn, start, count); -} static inline void paravirt_release_pmd(unsigned long pfn) { PVOP_VCALL1(pv_mmu_ops.release_pmd, pfn); diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index db9ef553234..b82bac97525 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -255,7 +255,6 @@ struct pv_mmu_ops { */ void (*alloc_pte)(struct mm_struct *mm, unsigned long pfn); void (*alloc_pmd)(struct mm_struct *mm, unsigned long pfn); - void (*alloc_pmd_clone)(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count); void (*alloc_pud)(struct mm_struct *mm, unsigned long pfn); void (*release_pte)(unsigned long pfn); void (*release_pmd)(unsigned long pfn); diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 1db183ed7c0..c5b250011fd 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -413,7 +413,6 @@ struct pv_mmu_ops pv_mmu_ops = { .alloc_pte = paravirt_nop, .alloc_pmd = paravirt_nop, - .alloc_pmd_clone = paravirt_nop, .alloc_pud = paravirt_nop, .release_pte = paravirt_nop, .release_pmd = paravirt_nop, diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 5c4ee422590..a96023e872a 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -98,10 +98,6 @@ static void pgd_ctor(pgd_t *pgd) clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY, swapper_pg_dir + KERNEL_PGD_BOUNDARY, KERNEL_PGD_PTRS); - paravirt_alloc_pmd_clone(__pa(pgd) >> PAGE_SHIFT, - __pa(swapper_pg_dir) >> PAGE_SHIFT, - KERNEL_PGD_BOUNDARY, - KERNEL_PGD_PTRS); } /* list required to sync kernel mapping updates */ diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 42086ac406a..b2363fcbcd0 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1969,7 +1969,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { .alloc_pte = xen_alloc_pte_init, .release_pte = xen_release_pte_init, .alloc_pmd = xen_alloc_pmd_init, - .alloc_pmd_clone = paravirt_nop, .release_pmd = xen_release_pmd_init, #ifdef CONFIG_X86_64 -- cgit v1.2.3-18-g5258 From d0cd7425fab774a480cce17c2f649984312d0b55 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 24 Aug 2010 17:32:04 -0700 Subject: x86, bios: By default, reserve the low 64K for all BIOSes The laundry list of BIOSes that need the low 64K reserved is getting very long, so make it the default across all BIOSes. This also allows the code to be simplified and unified with the reservation code for the first 4K. This resolves kernel bugzilla 16661 and who knows what else... Signed-off-by: H. 
Peter Anvin LKML-Reference: --- arch/x86/Kconfig | 47 ++++++++++++++++----------- arch/x86/kernel/setup.c | 84 +++++-------------------------------------------- 2 files changed, 35 insertions(+), 96 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cea0cd9a316..683ae8f9bd0 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1326,25 +1326,34 @@ config X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK Set whether the default state of memory_corruption_check is on or off. -config X86_RESERVE_LOW_64K - bool "Reserve low 64K of RAM on AMI/Phoenix BIOSen" - default y - ---help--- - Reserve the first 64K of physical RAM on BIOSes that are known - to potentially corrupt that memory range. A numbers of BIOSes are - known to utilize this area during suspend/resume, so it must not - be used by the kernel. - - Set this to N if you are absolutely sure that you trust the BIOS - to get all its memory reservations and usages right. - - If you have doubts about the BIOS (e.g. suspend/resume does not - work or there's kernel crashes after certain hardware hotplug - events) and it's not AMI or Phoenix, then you might want to enable - X86_CHECK_BIOS_CORRUPTION=y to allow the kernel to check typical - corruption patterns. - - Say Y if unsure. +config X86_LOW_RESERVE + int "Amount of low memory, in kilobytes, to reserve for the BIOS" + default 64 + range 4 640 + ---help--- + Specify the amount of low memory to reserve for the BIOS. + + The first page contains BIOS data structures that the kernel + must not use, so that page must always be reserved. + + By default we reserve the first 64K of physical RAM, as a + number of BIOSes are known to corrupt that memory range + during events such as suspend/resume or monitor cable + insertion, so it must not be used by the kernel. + + You can set this to 4 if you are absolutely sure that you + trust the BIOS to get all its memory reservations and usages + right. If you know your BIOS have problems beyond the + default 64K area, you can set this to 640 to avoid using the + entire low memory range. + + If you have doubts about the BIOS (e.g. suspend/resume does + not work or there's kernel crashes after certain hardware + hotplug events) then you might want to enable + X86_CHECK_BIOS_CORRUPTION=y to allow the kernel to check + typical corruption patterns. + + Leave this to the default value of 64 if you are unsure. 
config MATH_EMULATION bool diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index c3a4fbb2b99..eb87f1c83f9 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -618,88 +618,20 @@ static __init void reserve_ibft_region(void) reserve_early_overlap_ok(addr, addr + size, "ibft"); } -#ifdef CONFIG_X86_RESERVE_LOW_64K -static int __init dmi_low_memory_corruption(const struct dmi_system_id *d) -{ - printk(KERN_NOTICE - "%s detected: BIOS may corrupt low RAM, working around it.\n", - d->ident); - - e820_update_range(0, 0x10000, E820_RAM, E820_RESERVED); - sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); - - return 0; -} -#endif - -/* List of systems that have known low memory corruption BIOS problems */ -static struct dmi_system_id __initdata bad_bios_dmi_table[] = { -#ifdef CONFIG_X86_RESERVE_LOW_64K - { - .callback = dmi_low_memory_corruption, - .ident = "AMI BIOS", - .matches = { - DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."), - }, - }, - { - .callback = dmi_low_memory_corruption, - .ident = "Phoenix BIOS", - .matches = { - DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies"), - }, - }, - { - .callback = dmi_low_memory_corruption, - .ident = "Phoenix/MSC BIOS", - .matches = { - DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix/MSC"), - }, - }, - /* - * AMI BIOS with low memory corruption was found on Intel DG45ID and - * DG45FC boards. - * It has a different DMI_BIOS_VENDOR = "Intel Corp.", for now we will - * match only DMI_BOARD_NAME and see if there is more bad products - * with this vendor. - */ - { - .callback = dmi_low_memory_corruption, - .ident = "AMI BIOS", - .matches = { - DMI_MATCH(DMI_BOARD_NAME, "DG45ID"), - }, - }, - { - .callback = dmi_low_memory_corruption, - .ident = "AMI BIOS", - .matches = { - DMI_MATCH(DMI_BOARD_NAME, "DG45FC"), - }, - }, - /* - * The Dell Inspiron Mini 1012 has DMI_BIOS_VENDOR = "Dell Inc.", so - * match on the product name. - */ - { - .callback = dmi_low_memory_corruption, - .ident = "Phoenix BIOS", - .matches = { - DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 1012"), - }, - }, -#endif - {} -}; - static void __init trim_bios_range(void) { /* * A special case is the first 4Kb of memory; * This is a BIOS owned area, not kernel ram, but generally * not listed as such in the E820 table. + * + * This typically reserves additional memory (64KiB by default) + * since some BIOSes are known to corrupt low memory. See the + * Kconfig help text for X86_LOW_RESERVE. */ - e820_update_range(0, PAGE_SIZE, E820_RAM, E820_RESERVED); + e820_update_range(0, ALIGN(CONFIG_X86_LOW_RESERVE << 10, PAGE_SIZE), + E820_RAM, E820_RESERVED); + /* * special case: Some BIOSen report the PC BIOS * area (640->1Mb) as ram even though it is not. @@ -863,8 +795,6 @@ void __init setup_arch(char **cmdline_p) dmi_scan_machine(); - dmi_check_system(bad_bios_dmi_table); - /* * VMware detection requires dmi to be available, so this * needs to be done after dmi_scan_machine, for the BP. 
-- cgit v1.2.3-18-g5258 From 04fba67163a9e6132614b72b33bb2743bd33ffb3 Mon Sep 17 00:00:00 2001 From: Lin Ming Date: Wed, 25 Aug 2010 14:49:09 +0800 Subject: perf: Remove unused variable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This fixes the following build warning introduced by the callchain rework: arch/x86/kernel/cpu/perf_event.c:1574: warning: ‘perf_callchain_entry_nmi’ defined but not used Signed-off-by: Lin Ming Cc: Peter Zijlstra Cc: Frederic Weisbecker LKML-Reference: <1282718949.16443.75.camel@minggr.sh.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index acc52afaf7b..bcddac61422 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1571,9 +1571,6 @@ const struct pmu *hw_perf_event_init(struct perf_event *event) * callchain support */ -static DEFINE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry_nmi); - - static void backtrace_warning_symbol(void *data, char *msg, unsigned long symbol) { -- cgit v1.2.3-18-g5258 From acf01734b1747b1ec4be6f159aff579ea5f7f8e2 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 25 Aug 2010 18:28:23 +0200 Subject: x86, tsc: Remove CPU frequency calibration on AMD 6b37f5a20c0e5c334c010a587058354215433e92 introduced the CPU frequency calibration code for AMD CPUs whose TSCs didn't increment with the core's P0 frequency. From F10h, revB onward, however, the TSC increment rate is denoted by MSRC001_0015[24] and when this bit is set (which should be done by the BIOS) the TSC increments with the P0 frequency so the calibration is not needed and booting can be a couple of mcecs faster on those machines. Besides, there should be virtually no machines out there which don't have this bit set, therefore this calibration can be safely removed. It is a shaky hack anyway since it assumes implicitly that the core is in P0 when BIOS hands off to the OS, which might not always be the case. Signed-off-by: Borislav Petkov LKML-Reference: <20100825162823.GE26438@aftab> Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/cpu/amd.c | 17 ++++++++++++++ arch/x86/kernel/tsc.c | 58 ----------------------------------------------- 2 files changed, 17 insertions(+), 58 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index ba5f62f45f0..fc563fabde6 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -412,6 +412,23 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_EXTD_APICID); } #endif + + /* We need to do the following only once */ + if (c != &boot_cpu_data) + return; + + if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) { + + if (c->x86 > 0x10 || + (c->x86 == 0x10 && c->x86_model >= 0x2)) { + u64 val; + + rdmsrl(MSR_K7_HWCR, val); + if (!(val & BIT(24))) + printk(KERN_WARNING FW_BUG "TSC doesn't count " + "with P0 frequency!\n"); + } + } } static void __cpuinit init_amd(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index ce8e5023933..13b6a6cc77f 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -854,60 +854,6 @@ static void __init init_tsc_clocksource(void) clocksource_register_khz(&clocksource_tsc, tsc_khz); } -#ifdef CONFIG_X86_64 -/* - * calibrate_cpu is used on systems with fixed rate TSCs to determine - * processor frequency - */ -#define TICK_COUNT 100000000 -static unsigned long __init calibrate_cpu(void) -{ - int tsc_start, tsc_now; - int i, no_ctr_free; - unsigned long evntsel3 = 0, pmc3 = 0, pmc_now = 0; - unsigned long flags; - - for (i = 0; i < 4; i++) - if (avail_to_resrv_perfctr_nmi_bit(i)) - break; - no_ctr_free = (i == 4); - if (no_ctr_free) { - WARN(1, KERN_WARNING "Warning: AMD perfctrs busy ... " - "cpu_khz value may be incorrect.\n"); - i = 3; - rdmsrl(MSR_K7_EVNTSEL3, evntsel3); - wrmsrl(MSR_K7_EVNTSEL3, 0); - rdmsrl(MSR_K7_PERFCTR3, pmc3); - } else { - reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i); - reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i); - } - local_irq_save(flags); - /* start measuring cycles, incrementing from 0 */ - wrmsrl(MSR_K7_PERFCTR0 + i, 0); - wrmsrl(MSR_K7_EVNTSEL0 + i, 1 << 22 | 3 << 16 | 0x76); - rdtscl(tsc_start); - do { - rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now); - tsc_now = get_cycles(); - } while ((tsc_now - tsc_start) < TICK_COUNT); - - local_irq_restore(flags); - if (no_ctr_free) { - wrmsrl(MSR_K7_EVNTSEL3, 0); - wrmsrl(MSR_K7_PERFCTR3, pmc3); - wrmsrl(MSR_K7_EVNTSEL3, evntsel3); - } else { - release_perfctr_nmi(MSR_K7_PERFCTR0 + i); - release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); - } - - return pmc_now * tsc_khz / (tsc_now - tsc_start); -} -#else -static inline unsigned long calibrate_cpu(void) { return cpu_khz; } -#endif - void __init tsc_init(void) { u64 lpj; @@ -926,10 +872,6 @@ void __init tsc_init(void) return; } - if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) && - (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)) - cpu_khz = calibrate_cpu(); - printk("Detected %lu.%03lu MHz processor.\n", (unsigned long)cpu_khz / 1000, (unsigned long)cpu_khz % 1000); -- cgit v1.2.3-18-g5258 From 9ea77bdb39b62c9bf9fd3cdd1c25a9420bccd380 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 25 Aug 2010 16:38:20 -0700 Subject: x86, bios: Make the x86 early memory reservation a kernel option Add a kernel command-line option so the x86 early memory reservation size can be adjusted at runtime instead of only at compile time. Suggested-by: Andrew Morton LKML-Reference: Signed-off-by: H. 
Peter Anvin --- arch/x86/Kconfig | 2 +- arch/x86/kernel/setup.c | 28 ++++++++++++++++++++++++++-- 2 files changed, 27 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 683ae8f9bd0..d3590008c5d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1326,7 +1326,7 @@ config X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK Set whether the default state of memory_corruption_check is on or off. -config X86_LOW_RESERVE +config X86_RESERVE_LOW int "Amount of low memory, in kilobytes, to reserve for the BIOS" default 64 range 4 640 diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index eb87f1c83f9..af277e369de 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -618,6 +618,8 @@ static __init void reserve_ibft_region(void) reserve_early_overlap_ok(addr, addr + size, "ibft"); } +static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10; + static void __init trim_bios_range(void) { /* @@ -627,9 +629,9 @@ static void __init trim_bios_range(void) * * This typically reserves additional memory (64KiB by default) * since some BIOSes are known to corrupt low memory. See the - * Kconfig help text for X86_LOW_RESERVE. + * Kconfig help text for X86_RESERVE_LOW. */ - e820_update_range(0, ALIGN(CONFIG_X86_LOW_RESERVE << 10, PAGE_SIZE), + e820_update_range(0, ALIGN(reserve_low, PAGE_SIZE), E820_RAM, E820_RESERVED); /* @@ -641,6 +643,28 @@ static void __init trim_bios_range(void) sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); } +static int __init parse_reservelow(char *p) +{ + unsigned long long size; + + if (!p) + return -EINVAL; + + size = memparse(p, &p); + + if (size < 4096) + size = 4096; + + if (size > 640*1024) + size = 640*1024; + + reserve_low = size; + + return 0; +} + +early_param("reservelow", parse_reservelow); + /* * Determine if we were loaded by an EFI loader. If so, then we have also been * passed the efi memmap, systab, etc., so we should use these data structures -- cgit v1.2.3-18-g5258 From 6afb5157b9eba4092e2f0f54d24a3806409bdde5 Mon Sep 17 00:00:00 2001 From: Haicheng Li Date: Wed, 19 May 2010 17:42:14 +0800 Subject: x86, mm: Separate x86_64 vmalloc_sync_all() into separate functions No behavior change. Move some of vmalloc_sync_all() code into a new function sync_global_pgds() that will be useful for memory hotplug. Signed-off-by: Haicheng Li LKML-Reference: <4C6E4ECD.1090607@linux.intel.com> Reviewed-by: Wu Fengguang Reviewed-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/pgtable_64.h | 2 ++ arch/x86/mm/fault.c | 24 +----------------------- arch/x86/mm/init_64.c | 30 ++++++++++++++++++++++++++++++ 3 files changed, 33 insertions(+), 23 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 076052cd62b..f96ac9bedf7 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -102,6 +102,8 @@ static inline void native_pgd_clear(pgd_t *pgd) native_set_pgd(pgd, native_make_pgd(0)); } +extern void sync_global_pgds(unsigned long start, unsigned long end); + /* * Conversion functions: convert a page and protection to a page entry, * and a page entry and page directory to the page they refer to. 
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 4c4508e8a20..51f7ee71d6c 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -326,29 +326,7 @@ out: void vmalloc_sync_all(void) { - unsigned long address; - - for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END; - address += PGDIR_SIZE) { - - const pgd_t *pgd_ref = pgd_offset_k(address); - unsigned long flags; - struct page *page; - - if (pgd_none(*pgd_ref)) - continue; - - spin_lock_irqsave(&pgd_lock, flags); - list_for_each_entry(page, &pgd_list, lru) { - pgd_t *pgd; - pgd = (pgd_t *)page_address(page) + pgd_index(address); - if (pgd_none(*pgd)) - set_pgd(pgd, *pgd_ref); - else - BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); - } - spin_unlock_irqrestore(&pgd_lock, flags); - } + sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END); } /* diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 9a6674689a2..61a1b4fdecb 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -97,6 +97,36 @@ static int __init nonx32_setup(char *str) } __setup("noexec32=", nonx32_setup); +/* + * When memory was added/removed make sure all the processes MM have + * suitable PGD entries in the local PGD level page. + */ +void sync_global_pgds(unsigned long start, unsigned long end) +{ + unsigned long address; + + for (address = start; address <= end; address += PGDIR_SIZE) { + const pgd_t *pgd_ref = pgd_offset_k(address); + unsigned long flags; + struct page *page; + + if (pgd_none(*pgd_ref)) + continue; + + spin_lock_irqsave(&pgd_lock, flags); + list_for_each_entry(page, &pgd_list, lru) { + pgd_t *pgd; + pgd = (pgd_t *)page_address(page) + pgd_index(address); + if (pgd_none(*pgd)) + set_pgd(pgd, *pgd_ref); + else + BUG_ON(pgd_page_vaddr(*pgd) + != pgd_page_vaddr(*pgd_ref)); + } + spin_unlock_irqrestore(&pgd_lock, flags); + } +} + /* * NOTE: This function is marked __ref because it calls __init function * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0. -- cgit v1.2.3-18-g5258 From 9b861528a8012e7bc4d1f7bae07395b225331477 Mon Sep 17 00:00:00 2001 From: Haicheng Li Date: Fri, 20 Aug 2010 17:50:16 +0800 Subject: x86-64, mem: Update all PGDs for direct mapping and vmemmap mapping changes When memory hotplug-adding happens for a large enough area that a new PGD entry is needed for the direct mapping, the PGDs of other processes would not get updated. This leads to some CPUs oopsing like below when they have to access the unmapped areas. [ 1139.243192] BUG: soft lockup - CPU#0 stuck for 61s! 
[bash:6534] [ 1139.243195] Modules linked in: ipv6 autofs4 rfcomm l2cap crc16 bluetooth rfkill binfmt_misc dm_mirror dm_region_hash dm_log dm_multipath dm_mod video output sbs sbshc fan battery ac parport_pc lp parport joydev usbhid processor thermal thermal_sys container button rtc_cmos rtc_core rtc_lib i2c_i801 i2c_core pcspkr uhci_hcd ohci_hcd ehci_hcd usbcore [ 1139.243229] irq event stamp: 8538759 [ 1139.243230] hardirqs last enabled at (8538759): [] restore_args+0x0/0x30 [ 1139.243236] hardirqs last disabled at (8538757): [] __do_softirq+0x106/0x146 [ 1139.243240] softirqs last enabled at (8538758): [] __do_softirq+0x137/0x146 [ 1139.243245] softirqs last disabled at (8538743): [] call_softirq+0x1c/0x34 [ 1139.243249] CPU 0: [ 1139.243250] Modules linked in: ipv6 autofs4 rfcomm l2cap crc16 bluetooth rfkill binfmt_misc dm_mirror dm_region_hash dm_log dm_multipath dm_mod video output sbs sbshc fan battery ac parport_pc lp parport joydev usbhid processor thermal thermal_sys container button rtc_cmos rtc_core rtc_lib i2c_i801 i2c_core pcspkr uhci_hcd ohci_hcd ehci_hcd usbcore [ 1139.243284] Pid: 6534, comm: bash Tainted: G M 2.6.32-haicheng-cpuhp #7 QSSC-S4R [ 1139.243287] RIP: 0010:[] [] alloc_arraycache+0x35/0x69 [ 1139.243292] RSP: 0018:ffff8802799f9d78 EFLAGS: 00010286 [ 1139.243295] RAX: ffff8884ffc00000 RBX: ffff8802799f9d98 RCX: 0000000000000000 [ 1139.243297] RDX: 0000000000190018 RSI: 0000000000000001 RDI: ffff8884ffc00010 [ 1139.243300] RBP: ffffffff8100c34e R08: 0000000000000002 R09: 0000000000000000 [ 1139.243303] R10: ffffffff8246dda0 R11: 000000d08246dda0 R12: ffff8802599bfff0 [ 1139.243305] R13: ffff88027904c040 R14: ffff8802799f8000 R15: 0000000000000001 [ 1139.243308] FS: 00007fe81bfe86e0(0000) GS:ffff88000d800000(0000) knlGS:0000000000000000 [ 1139.243311] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1139.243313] CR2: ffff8884ffc00000 CR3: 000000026cf2d000 CR4: 00000000000006f0 [ 1139.243316] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 1139.243318] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 [ 1139.243321] Call Trace: [ 1139.243324] [] ? alloc_arraycache+0x29/0x69 [ 1139.243328] [] ? cpuup_callback+0x1b0/0x32a [ 1139.243333] [] ? notifier_call_chain+0x33/0x5b [ 1139.243337] [] ? __raw_notifier_call_chain+0x9/0xb [ 1139.243340] [] ? cpu_up+0xb3/0x152 [ 1139.243344] [] ? store_online+0x4d/0x75 [ 1139.243348] [] ? sysdev_store+0x1b/0x1d [ 1139.243351] [] ? sysfs_write_file+0xe5/0x121 [ 1139.243355] [] ? vfs_write+0xae/0x14a [ 1139.243358] [] ? sys_write+0x47/0x6f [ 1139.243362] [] ? system_call_fastpath+0x16/0x1b This patch makes sure to always replicate new direct mapping PGD entries to the PGDs of all processes, as well as ensures corresponding vmemmap mapping gets synced. V1: initial code by Andi Kleen. V2: fix several issues found in testing. V3: as suggested by Wu Fengguang, reuse common code of vmalloc_sync_all(). [ hpa: changed pgd_change from int to bool ] Originally-by: Andi Kleen Signed-off-by: Haicheng Li LKML-Reference: <4C6E4FD8.6080100@linux.intel.com> Reviewed-by: Wu Fengguang Reviewed-by: Andi Kleen Signed-off-by: H. 
Peter Anvin --- arch/x86/mm/init_64.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 61a1b4fdecb..64e7bc2ded9 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -564,8 +564,9 @@ kernel_physical_mapping_init(unsigned long start, unsigned long end, unsigned long page_size_mask) { - + bool pgd_changed = false; unsigned long next, last_map_addr = end; + unsigned long addr; start = (unsigned long)__va(start); end = (unsigned long)__va(end); @@ -593,7 +594,12 @@ kernel_physical_mapping_init(unsigned long start, spin_lock(&init_mm.page_table_lock); pgd_populate(&init_mm, pgd, __va(pud_phys)); spin_unlock(&init_mm.page_table_lock); + pgd_changed = true; } + + if (pgd_changed) + sync_global_pgds(addr, end); + __flush_tlb_all(); return last_map_addr; @@ -1033,6 +1039,7 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node) } } + sync_global_pgds((unsigned long)start_page, end); return 0; } -- cgit v1.2.3-18-g5258 From 0444ad93ea2449963132d68753020a6a24d69895 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 26 Aug 2010 13:57:56 -0400 Subject: x86, iommu: Add IOMMU_INIT macros, .iommu_table section, and iommu_table_entry structure This patch set adds a mechanism to "modularize" the IOMMUs we have on X86. Currently the count of IOMMUs is up to six and they have a complex relationship that requires careful execution order. 'pci_iommu_alloc' does that today, but most folks are unhappy with how it does it. This patch set addresses this and also paves a mechanism to jettison unused IOMMUs during run-time. For details that sparked this, please refer to: http://lkml.org/lkml/2010/8/2/282 The first solution that comes to mind is to convert wholesale the IOMMU detection routines to be called during initcall time frame. Unfortunately that misses the dependency relationship that some of the IOMMUs have (for example: for AMD-Vi IOMMU to work, GART detection MUST run first, and before all of that SWIOTLB MUST run). The second solution would be to introduce a registration call wherein the IOMMU would provide its detection/init routines and as well on what MUST run before it. That would work, except that the 'pci_iommu_alloc' which would run through this list, is called during mem_init. This means we don't have any memory allocator, and it is so early that we haven't yet started running through the initcall_t list. This solution borrows concepts from the 2nd idea and from how MODULE_INIT works. A macro is provided that each IOMMU uses to define it's detect function and early_init (before the memory allocate is active), and as well what other IOMMU MUST run before us. Since most IOMMUs depend on having SWIOTLB run first ("pci_swiotlb_detect") a convenience macro to depends on that is also provided. This macro is similar in design to MODULE_PARAM macro wherein we setup a .iommu_table section in which we populate it with the values that match a struct iommu_table_entry. During bootup we will sort through the array so that the IOMMUs that MUST run before us are first elements in the array. And then we just iterate through them calling the detection routine and if appropiate, the init routines. Signed-off-by: Konrad Rzeszutek Wilk LKML-Reference: <1282845485-8991-2-git-send-email-konrad.wilk@oracle.com> CC: H. Peter Anvin CC: Fujita Tomonori CC: Thomas Gleixner CC: Ingo Molnar Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/iommu_table.h | 95 ++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/vmlinux.lds.S | 7 +++ 2 files changed, 102 insertions(+) create mode 100644 arch/x86/include/asm/iommu_table.h (limited to 'arch/x86') diff --git a/arch/x86/include/asm/iommu_table.h b/arch/x86/include/asm/iommu_table.h new file mode 100644 index 00000000000..435176f96a5 --- /dev/null +++ b/arch/x86/include/asm/iommu_table.h @@ -0,0 +1,95 @@ + +#ifndef _ASM_X86_IOMMU_TABLE_H +#define _ASM_X86_IOMMU_TABLE_H + +#include + +/* + * History lesson: + * The execution chain of IOMMUs in 2.6.36 looks as so: + * + * [xen-swiotlb] + * | + * +----[swiotlb *]--+ + * / | \ + * / | \ + * [GART] [Calgary] [Intel VT-d] + * / + * / + * [AMD-Vi] + * + * *: if SWIOTLB detected 'iommu=soft'/'swiotlb=force' it would skip + * over the rest of IOMMUs and unconditionally initialize the SWIOTLB. + * Also it would surreptitiously initialize set the swiotlb=1 if there were + * more than 4GB and if the user did not pass in 'iommu=off'. The swiotlb + * flag would be turned off by all IOMMUs except the Calgary one. + * + * The IOMMU_INIT* macros allow a similar tree (or more complex if desired) + * to be built by defining who we depend on. + * + * And all that needs to be done is to use one of the macros in the IOMMU + * and the pci-dma.c will take care of the rest. + */ + +struct iommu_table_entry { + initcall_t detect; + initcall_t depend; + void (*early_init)(void); /* No memory allocate available. */ + void (*late_init)(void); /* Yes, can allocate memory. */ +#define IOMMU_FINISH_IF_DETECTED (1<<0) +#define IOMMU_DETECTED (1<<1) + int flags; +}; +/* + * Macro fills out an entry in the .iommu_table that is equivalent + * to the fields that 'struct iommu_table_entry' has. The entries + * that are put in the .iommu_table section are not put in any order + * hence during boot-time we will have to resort them based on + * dependency. */ + + +#define __IOMMU_INIT(_detect, _depend, _early_init, _late_init, _finish)\ + static const struct iommu_table_entry const \ + __iommu_entry_##_detect __used \ + __attribute__ ((unused, __section__(".iommu_table"), \ + aligned((sizeof(void *))))) \ + = {_detect, _depend, _early_init, _late_init, \ + _finish ? IOMMU_FINISH_IF_DETECTED : 0} +/* + * The simplest IOMMU definition. Provide the detection routine + * and it will be run after the SWIOTLB and the other IOMMUs + * that utilize this macro. If the IOMMU is detected (ie, the + * detect routine returns a positive value), the other IOMMUs + * are also checked. You can use IOMMU_INIT_FINISH if you prefer + * to stop detecting the other IOMMUs after yours has been detected. + */ +#define IOMMU_INIT_POST(_detect) \ + __IOMMU_INIT(_detect, pci_swiotlb_detect, 0, 0, 0) + +#define IOMMU_INIT_POST_FINISH(detect) \ + __IOMMU_INIT(_detect, pci_swiotlb_detect, 0, 0, 1) + +/* + * A more sophisticated version of IOMMU_INIT. This variant requires: + * a). A detection routine function. + * b). The name of the detection routine we depend on to get called + * before us. + * c). The init routine which gets called if the detection routine + * returns a positive value from the pci_iommu_alloc. This means + * no presence of a memory allocator. + * d). Similar to the 'init', except that this gets called from pci_iommu_init + * where we do have a memory allocator. + * + * The _CONT vs the _EXIT differs in that the _CONT variant will + * continue detecting other IOMMUs in the call list after the + * the detection routine returns a positive number. 
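/*
 * Illustrative sketch, not part of this header: how a hypothetical
 * IOMMU driver would hook itself into the .iommu_table.  The names
 * my_iommu_* are made up, and the usual asm/iommu.h and asm/swiotlb.h
 * declarations (no_iommu, iommu_detected, pci_swiotlb_detect) are
 * assumed to be in scope.
 */
static int __init my_iommu_detect(void)
{
        if (no_iommu || iommu_detected)
                return -ENODEV;
        /* ... probe the hardware here ... */
        iommu_detected = 1;
        return 1;               /* positive value: run the init hooks */
}

static void __init my_iommu_early_init(void)
{
        /* called from pci_iommu_alloc(), no memory allocator yet */
}

static void __init my_iommu_late_init(void)
{
        /* called from pci_iommu_init(), memory allocator available */
}

/* Run after the SWIOTLB detection routine; keep probing other IOMMUs. */
IOMMU_INIT(my_iommu_detect, pci_swiotlb_detect,
           my_iommu_early_init, my_iommu_late_init);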
The _EXIT will + * stop the execution chain. Both will still call the 'init' and + * 'late_init' functions if they are set. + */ +#define IOMMU_INIT_FINISH(_detect, _depend, _init, _late_init) \ + __IOMMU_INIT(_detect, _depend, _init, _late_init, 1) + +#define IOMMU_INIT(_detect, _depend, _init, _late_init) \ + __IOMMU_INIT(_detect, _depend, _init, _late_init, 0) + +#endif /* _ASM_X86_IOMMU_TABLE_H */ diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index d0bb52296fa..b92e040466c 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -260,6 +260,13 @@ SECTIONS *(.altinstr_replacement) } + .iommu_table : AT(ADDR(.iommu_table) - LOAD_OFFSET) { + __iommu_table = .; + *(.iommu_table) + . = ALIGN(8); + __iommu_table_end = .; + } + /* * .exit.text is discard at runtime, not link time, to deal with * references from .altinstructions and .eh_frame -- cgit v1.2.3-18-g5258 From 480125ba49ba62be93beea37770f266846e077ab Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 26 Aug 2010 13:57:57 -0400 Subject: x86, iommu: Make all IOMMU's detection routines return a value. We return 1 if the IOMMU has been detected. Zero or an error number if we failed to find it. This is in preperation of using the IOMMU_INIT so that we can detect whether an IOMMU is present. I have not tested this for regression on Calgary, nor on AMD Vi chipsets as I don't have that hardware. CC: Muli Ben-Yehuda CC: "Jon D. Mason" CC: "Darrick J. Wong" CC: Jesse Barnes CC: David Woodhouse CC: Chris Wright CC: Yinghai Lu CC: Joerg Roedel CC: H. Peter Anvin CC: Fujita Tomonori Signed-off-by: Konrad Rzeszutek Wilk LKML-Reference: <1282845485-8991-3-git-send-email-konrad.wilk@oracle.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/amd_iommu.h | 4 ++-- arch/x86/include/asm/calgary.h | 4 ++-- arch/x86/include/asm/gart.h | 5 +++-- arch/x86/kernel/amd_iommu_init.c | 8 +++++--- arch/x86/kernel/aperture_64.c | 11 +++++++---- arch/x86/kernel/pci-calgary_64.c | 15 ++++++++------- 6 files changed, 27 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h index 5af2982133b..2798142cdb4 100644 --- a/arch/x86/include/asm/amd_iommu.h +++ b/arch/x86/include/asm/amd_iommu.h @@ -24,11 +24,11 @@ #ifdef CONFIG_AMD_IOMMU -extern void amd_iommu_detect(void); +extern int amd_iommu_detect(void); #else -static inline void amd_iommu_detect(void) { } +static inline int amd_iommu_detect(void) { return -ENODEV; } #endif diff --git a/arch/x86/include/asm/calgary.h b/arch/x86/include/asm/calgary.h index 0918654305a..0d467b33883 100644 --- a/arch/x86/include/asm/calgary.h +++ b/arch/x86/include/asm/calgary.h @@ -62,9 +62,9 @@ struct cal_chipset_ops { extern int use_calgary; #ifdef CONFIG_CALGARY_IOMMU -extern void detect_calgary(void); +extern int detect_calgary(void); #else -static inline void detect_calgary(void) { return; } +static inline int detect_calgary(void) { return -ENODEV; } #endif #endif /* _ASM_X86_CALGARY_H */ diff --git a/arch/x86/include/asm/gart.h b/arch/x86/include/asm/gart.h index 4ac5b0f33fc..d7d1d4c438a 100644 --- a/arch/x86/include/asm/gart.h +++ b/arch/x86/include/asm/gart.h @@ -37,7 +37,7 @@ extern int gart_iommu_aperture_disabled; extern void early_gart_iommu_check(void); extern int gart_iommu_init(void); extern void __init gart_parse_options(char *); -extern void gart_iommu_hole_init(void); +extern int gart_iommu_hole_init(void); #else #define gart_iommu_aperture 0 @@ -50,8 +50,9 @@ static 
inline void early_gart_iommu_check(void) static inline void gart_parse_options(char *options) { } -static inline void gart_iommu_hole_init(void) +static inline int gart_iommu_hole_init(void) { + return -ENODEV; } #endif diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 3cc63e2b8dd..0b9e2dc4fc9 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -1382,13 +1382,13 @@ static int __init early_amd_iommu_detect(struct acpi_table_header *table) return 0; } -void __init amd_iommu_detect(void) +int __init amd_iommu_detect(void) { if (no_iommu || (iommu_detected && !gart_iommu_aperture)) - return; + return -ENODEV; if (amd_iommu_disabled) - return; + return -ENODEV; if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) { iommu_detected = 1; @@ -1397,7 +1397,9 @@ void __init amd_iommu_detect(void) /* Make sure ACS will be enabled */ pci_request_acs(); + return 1; } + return -ENODEV; } /**************************************************************************** diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index a2e0caf26e1..afa0dab3302 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -371,7 +371,7 @@ void __init early_gart_iommu_check(void) static int __initdata printed_gart_size_msg; -void __init gart_iommu_hole_init(void) +int __init gart_iommu_hole_init(void) { u32 agp_aper_base = 0, agp_aper_order = 0; u32 aper_size, aper_alloc = 0, aper_order = 0, last_aper_order = 0; @@ -381,7 +381,7 @@ void __init gart_iommu_hole_init(void) if (gart_iommu_aperture_disabled || !fix_aperture || !early_pci_allowed()) - return; + return -ENODEV; printk(KERN_INFO "Checking aperture...\n"); @@ -463,8 +463,9 @@ out: unsigned long n = (32 * 1024 * 1024) << last_aper_order; insert_aperture_resource((u32)last_aper_base, n); + return 1; } - return; + return 0; } if (!fallback_aper_force) { @@ -500,7 +501,7 @@ out: panic("Not enough memory for aperture"); } } else { - return; + return 0; } /* Fix up the north bridges */ @@ -524,4 +525,6 @@ out: } set_up_gart_resume(aper_order, aper_alloc); + + return 1; } diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 078d4ec1a9d..28c6b389fee 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -1364,7 +1364,7 @@ static int __init calgary_iommu_init(void) return 0; } -void __init detect_calgary(void) +int __init detect_calgary(void) { int bus; void *tbl; @@ -1378,13 +1378,13 @@ void __init detect_calgary(void) * another HW IOMMU already, bail out. */ if (no_iommu || iommu_detected) - return; + return -ENODEV; if (!use_calgary) - return; + return -ENODEV; if (!early_pci_allowed()) - return; + return -ENODEV; printk(KERN_DEBUG "Calgary: detecting Calgary via BIOS EBDA area\n"); @@ -1410,13 +1410,13 @@ void __init detect_calgary(void) if (!rio_table_hdr) { printk(KERN_DEBUG "Calgary: Unable to locate Rio Grande table " "in EBDA - bailing!\n"); - return; + return -ENODEV; } ret = build_detail_arrays(); if (ret) { printk(KERN_DEBUG "Calgary: build_detail_arrays ret %d\n", ret); - return; + return -ENOMEM; } specified_table_size = determine_tce_table_size((is_kdump_kernel() ? 
@@ -1464,7 +1464,7 @@ void __init detect_calgary(void) x86_init.iommu.iommu_init = calgary_iommu_init; } - return; + return calgary_found; cleanup: for (--bus; bus >= 0; --bus) { @@ -1473,6 +1473,7 @@ cleanup: if (info->tce_space) free_tce_table(info->tce_space); } + return -ENOMEM; } static int __init calgary_parse_options(char *p) -- cgit v1.2.3-18-g5258 From 5bef80a4b826b9cee1c6aec7ecc371ec395260cc Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 26 Aug 2010 13:57:58 -0400 Subject: x86, iommu: Add proper dependency sort routine (and sanity check). We are using a very simple sort routine which sorts the .iommu_table array in the order of dependencies. Specifically, each iommu_table_entry structure has a 'depend' field which contains the function pointer to the IOMMU that MUST be run before us. We sort the array of structures so that the iommu_table_entry entries with no 'depend' field come first, and the subsequent ones are those for which the 'depend' function has already been invoked (in other words, their dependencies precede them). Using the kernel's version of 'sort', which is a heap sort, is feasible, but it would require making the comparison operator scan the array recursively to satisfy the "heapify" process: setting the levels properly. The end result would be much more complex than it should be, and it is just much simpler to utilize this simple sort routine. Signed-off-by: Konrad Rzeszutek Wilk LKML-Reference: <1282845485-8991-4-git-send-email-konrad.wilk@oracle.com> CC: H. Peter Anvin CC: Fujita Tomonori Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/iommu_table.h | 6 +++ arch/x86/kernel/Makefile | 1 + arch/x86/kernel/pci-iommu_table.c | 89 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 96 insertions(+) create mode 100644 arch/x86/kernel/pci-iommu_table.c (limited to 'arch/x86') diff --git a/arch/x86/include/asm/iommu_table.h b/arch/x86/include/asm/iommu_table.h index 435176f96a5..2124e3ef6f9 100644 --- a/arch/x86/include/asm/iommu_table.h +++ b/arch/x86/include/asm/iommu_table.h @@ -92,4 +92,10 @@ struct iommu_table_entry { #define IOMMU_INIT(_detect, _depend, _init, _late_init) \ __IOMMU_INIT(_detect, _depend, _init, _late_init, 0) +void sort_iommu_table(struct iommu_table_entry *start, + struct iommu_table_entry *finish); + +void check_iommu_entries(struct iommu_table_entry *start, + struct iommu_table_entry *finish); + #endif /* _ASM_X86_IOMMU_TABLE_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 0925676266b..6817546595e 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -42,6 +42,7 @@ obj-y += bootflag.o e820.o obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o obj-y += tsc.o io_delay.o rtc.o +obj-y += pci-iommu_table.o obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o obj-y += process.o diff --git a/arch/x86/kernel/pci-iommu_table.c b/arch/x86/kernel/pci-iommu_table.c new file mode 100644 index 00000000000..55d745ec118 --- /dev/null +++ b/arch/x86/kernel/pci-iommu_table.c @@ -0,0 +1,89 @@ +#include +#include +#include +#include + + +#define DEBUG 1 + +static struct iommu_table_entry * __init +find_dependents_of(struct iommu_table_entry *start, + struct iommu_table_entry *finish, + struct iommu_table_entry *q) +{ + struct iommu_table_entry *p; + + if (!q) + return NULL; + + for (p = start; p < finish; p++) + if (p->detect == q->depend) + return p; + + return NULL; +} + + +void __init sort_iommu_table(struct iommu_table_entry *start, + struct
iommu_table_entry *finish) { + + struct iommu_table_entry *p, *q, tmp; + + for (p = start; p < finish; p++) { +again: + q = find_dependents_of(start, finish, p); + /* We are bit sneaky here. We use the memory address to figure + * out if the node we depend on is past our point, if so, swap. + */ + if (q > p) { + tmp = *p; + memmove(p, q, sizeof(*p)); + *q = tmp; + goto again; + } + } + +} + +#ifdef DEBUG +void __init check_iommu_entries(struct iommu_table_entry *start, + struct iommu_table_entry *finish) +{ + struct iommu_table_entry *p, *q, *x; + char sym_p[KSYM_SYMBOL_LEN]; + char sym_q[KSYM_SYMBOL_LEN]; + + /* Simple cyclic dependency checker. */ + for (p = start; p < finish; p++) { + q = find_dependents_of(start, finish, p); + x = find_dependents_of(start, finish, q); + if (p == x) { + sprint_symbol(sym_p, (unsigned long)p->detect); + sprint_symbol(sym_q, (unsigned long)q->detect); + + printk(KERN_ERR "CYCLIC DEPENDENCY FOUND! %s depends" \ + " on %s and vice-versa. BREAKING IT.\n", + sym_p, sym_q); + /* Heavy handed way..*/ + x->depend = 0; + } + } + + for (p = start; p < finish; p++) { + q = find_dependents_of(p, finish, p); + if (q && q > p) { + sprint_symbol(sym_p, (unsigned long)p->detect); + sprint_symbol(sym_q, (unsigned long)q->detect); + + printk(KERN_ERR "EXECUTION ORDER INVALID! %s "\ + "should be called before %s!\n", + sym_p, sym_q); + } + } +} +#else +inline void check_iommu_entries(struct iommu_table_entry *start, + struct iommu_table_entry *finish) +{ +} +#endif -- cgit v1.2.3-18-g5258 From efa631c26d3bb1162b8f95008801db602217f52b Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 26 Aug 2010 13:57:59 -0400 Subject: x86, swiotlb: Simplify SWIOTLB pci_swiotlb_detect routine. In 'pci_swiotlb_detect' we used to do two different things: a). If user provided 'iommu=soft' or 'swiotlb=force' we would set swiotlb=1 and return 1 (and forcing pci-dma.c to call pci_swiotlb_init() immediately). b). If 4GB or more would be detected and if user did not specify iommu=off, we would set 'swiotlb=1' and return whatever 'a)' figured out. We simplify this by splitting a) and b) in two different routines. CC: Fujita Tomonori Signed-off-by: Konrad Rzeszutek Wilk LKML-Reference: <1282845485-8991-5-git-send-email-konrad.wilk@oracle.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/iommu_table.h | 4 ++-- arch/x86/include/asm/swiotlb.h | 13 +++++++++++-- arch/x86/kernel/pci-dma.c | 4 +++- arch/x86/kernel/pci-swiotlb.c | 33 +++++++++++++++++++++++++++------ 4 files changed, 43 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/iommu_table.h b/arch/x86/include/asm/iommu_table.h index 2124e3ef6f9..df55a78888e 100644 --- a/arch/x86/include/asm/iommu_table.h +++ b/arch/x86/include/asm/iommu_table.h @@ -64,10 +64,10 @@ struct iommu_table_entry { * to stop detecting the other IOMMUs after yours has been detected. */ #define IOMMU_INIT_POST(_detect) \ - __IOMMU_INIT(_detect, pci_swiotlb_detect, 0, 0, 0) + __IOMMU_INIT(_detect, pci_swiotlb_detect_4gb, 0, 0, 0) #define IOMMU_INIT_POST_FINISH(detect) \ - __IOMMU_INIT(_detect, pci_swiotlb_detect, 0, 0, 1) + __IOMMU_INIT(_detect, pci_swiotlb_detect_4gb, 0, 0, 1) /* * A more sophisticated version of IOMMU_INIT. 
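/*
 * Usage sketch for the sort/check helpers added above; the real call
 * sites only appear in a later patch in this series, from
 * pci_iommu_alloc(), using the section markers laid down in
 * vmlinux.lds.S:
 *
 *      extern struct iommu_table_entry __iommu_table[], __iommu_table_end[];
 *
 *      sort_iommu_table(__iommu_table, __iommu_table_end);
 *      check_iommu_entries(__iommu_table, __iommu_table_end);
 *
 * After sorting, entries without a 'depend' routine come first and
 * every other entry is preceded by the entry whose ->detect routine
 * it depends on.
 */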
This variant requires: diff --git a/arch/x86/include/asm/swiotlb.h b/arch/x86/include/asm/swiotlb.h index 8085277e1b8..977f1761a25 100644 --- a/arch/x86/include/asm/swiotlb.h +++ b/arch/x86/include/asm/swiotlb.h @@ -5,17 +5,26 @@ #ifdef CONFIG_SWIOTLB extern int swiotlb; -extern int __init pci_swiotlb_detect(void); +extern int __init pci_swiotlb_detect_override(void); +extern int __init pci_swiotlb_detect_4gb(void); extern void __init pci_swiotlb_init(void); +extern void __init pci_swiotlb_late_init(void); #else #define swiotlb 0 -static inline int pci_swiotlb_detect(void) +static inline int pci_swiotlb_detect_override(void) +{ + return 0; +} +static inline int pci_swiotlb_detect_4gb(void) { return 0; } static inline void pci_swiotlb_init(void) { } +static inline void pci_swiotlb_late_init(void) +{ +} #endif static inline void dma_mark_clean(void *addr, size_t size) {} diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 9f07cfcbd3a..1b3beb5075e 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -133,9 +133,11 @@ void __init pci_iommu_alloc(void) /* free the range so iommu could get some range less than 4G */ dma32_free_bootmem(); - if (pci_xen_swiotlb_detect() || pci_swiotlb_detect()) + if (pci_xen_swiotlb_detect() || pci_swiotlb_detect_override()) goto out; + pci_swiotlb_detect_4gb(); + gart_iommu_hole_init(); detect_calgary(); diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index a5bc528d432..c7a72faeb14 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -41,24 +41,33 @@ static struct dma_map_ops swiotlb_dma_ops = { }; /* - * pci_swiotlb_detect - set swiotlb to 1 if necessary + * pci_swiotlb_detect_override - set swiotlb to 1 if necessary * * This returns non-zero if we are forced to use swiotlb (by the boot * option). */ -int __init pci_swiotlb_detect(void) +int __init pci_swiotlb_detect_override(void) { int use_swiotlb = swiotlb | swiotlb_force; + if (swiotlb_force) + swiotlb = 1; + + return use_swiotlb; +} + +/* + * if 4GB or more detected (and iommu=off not set) return 1 + * and set swiotlb to 1. + */ +int __init pci_swiotlb_detect_4gb(void) +{ /* don't initialize swiotlb if iommu=off (no_iommu=1) */ #ifdef CONFIG_X86_64 if (!no_iommu && max_pfn > MAX_DMA32_PFN) swiotlb = 1; #endif - if (swiotlb_force) - swiotlb = 1; - - return use_swiotlb; + return swiotlb; } void __init pci_swiotlb_init(void) @@ -68,3 +77,15 @@ void __init pci_swiotlb_init(void) dma_ops = &swiotlb_dma_ops; } } + +void __init pci_swiotlb_late_init(void) +{ + /* An IOMMU turned us off. */ + if (!swiotlb) + swiotlb_free(); + else { + printk(KERN_INFO "PCI-DMA: " + "Using software bounce buffering for IO (SWIOTLB)\n"); + swiotlb_print_info(); + } +} -- cgit v1.2.3-18-g5258 From c116c5457c46edb767df6f4e36d4905e3514ad37 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 26 Aug 2010 13:58:00 -0400 Subject: x86, swiotlb: Make SWIOTLB use IOMMU_INIT_* macros. We utilize the IOMMU_INIT macros to create this dependency: [pci_xen_swiotlb_detect] | [pci_swiotlb_detect_override] | [pci_swiotlb_detect_4gb] And set the SWIOTLB IOMMU_INIT to utilize 'pci_swiotlb_init' for .init and 'pci_swiotlb_late_init' for .late_init. Signed-off-by: Konrad Rzeszutek Wilk LKML-Reference: <1282845485-8991-6-git-send-email-konrad.wilk@oracle.com> CC: Fujita Tomonori Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/pci-swiotlb.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index c7a72faeb14..8f972cbddef 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -10,7 +10,8 @@ #include #include #include - +#include +#include int swiotlb __read_mostly; static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size, @@ -55,6 +56,10 @@ int __init pci_swiotlb_detect_override(void) return use_swiotlb; } +IOMMU_INIT_FINISH(pci_swiotlb_detect_override, + pci_xen_swiotlb_detect, + pci_swiotlb_init, + pci_swiotlb_late_init); /* * if 4GB or more detected (and iommu=off not set) return 1 @@ -69,6 +74,10 @@ int __init pci_swiotlb_detect_4gb(void) #endif return swiotlb; } +IOMMU_INIT(pci_swiotlb_detect_4gb, + pci_swiotlb_detect_override, + pci_swiotlb_init, + pci_swiotlb_late_init); void __init pci_swiotlb_init(void) { -- cgit v1.2.3-18-g5258 From 5cb3a267939a223eb84692d229569d2ef493d7ca Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 26 Aug 2010 13:58:01 -0400 Subject: x86, xen-swiotlb: Make Xen-SWIOTLB use IOMMU_INIT_* macros. We utilize the IOMMU_INIT macros to create this dependency: [null] | [pci_xen_swiotlb_detect] | [pci_swiotlb_detect_override] | [pci_swiotlb_detect_4gb] In other words, we set 'pci_xen_swiotlb_detect' to be the first detection to be run during start. CC: Fujita Tomonori Cc: Jeremy Fitzhardinge Signed-off-by: Konrad Rzeszutek Wilk LKML-Reference: <1282845485-8991-7-git-send-email-konrad.wilk@oracle.com> Signed-off-by: H. Peter Anvin --- arch/x86/xen/pci-swiotlb-xen.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c index a013ec9d0c5..22471001b74 100644 --- a/arch/x86/xen/pci-swiotlb-xen.c +++ b/arch/x86/xen/pci-swiotlb-xen.c @@ -5,6 +5,7 @@ #include #include +#include int xen_swiotlb __read_mostly; @@ -56,3 +57,7 @@ void __init pci_xen_swiotlb_init(void) dma_ops = &xen_swiotlb_dma_ops; } } +IOMMU_INIT_FINISH(pci_xen_swiotlb_detect, + 0, + pci_xen_swiotlb_init, + 0); -- cgit v1.2.3-18-g5258 From d2aa232f3d0b5a3e22f91b736fe68eddcf0d5ea3 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 26 Aug 2010 13:58:02 -0400 Subject: x86, calgary: Make Calgary IOMMU use IOMMU_INIT_* macros. We utilize the IOMMU_INIT macros to create this dependency: [pci_xen_swiotlb_detect] | [pci_swiotlb_detect_override] | [pci_swiotlb_detect_4gb] | [detect_calgary] Meaning that 'detect_calgary' is going to be called after 'pci_swiotlb_detect'. Signed-off-by: Konrad Rzeszutek Wilk LKML-Reference: <1282845485-8991-8-git-send-email-konrad.wilk@oracle.com> CC: Muli Ben-Yehuda CC: "Jon D. Mason" CC: "Darrick J. Wong" CC: Fujita Tomonori Signed-off-by: H. Peter Anvin --- arch/x86/kernel/pci-calgary_64.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 28c6b389fee..f56a117cef6 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -47,6 +47,7 @@ #include #include #include +#include #ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT int use_calgary __read_mostly = 1; @@ -1595,3 +1596,5 @@ static int __init calgary_fixup_tce_spaces(void) * and before device_initcall. 
*/ rootfs_initcall(calgary_fixup_tce_spaces); + +IOMMU_INIT_POST(detect_calgary); -- cgit v1.2.3-18-g5258 From 22e6daf41ba28ddc06295e42859b266f737b3e99 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 26 Aug 2010 13:58:03 -0400 Subject: x86, GART/AMD-VI: Make AMD GART and IOMMU use IOMMU_INIT_* macros. We utilize the IOMMU_INIT macros to create this dependency: [null] | [pci_xen_swiotlb_detect] | [pci_swiotlb_detect_override] | [pci_swiotlb_detect_4gb] | +-------+--------+ / \ [detect_calgary] [gart_iommu_hole_init] | [amd_iommu_detect] Meaning that 'amd_iommu_detect' will be called after 'gart_iommu_hole_init'. Signed-off-by: Konrad Rzeszutek Wilk LKML-Reference: <1282845485-8991-9-git-send-email-konrad.wilk@oracle.com> CC: Fujita Tomonori CC: Joerg Roedel CC: Thomas Gleixner CC: Ingo Molnar Signed-off-by: H. Peter Anvin --- arch/x86/kernel/amd_iommu_init.c | 7 ++++++- arch/x86/kernel/pci-gart_64.c | 2 ++ 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 0b9e2dc4fc9..26a5e438521 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -31,7 +31,7 @@ #include #include #include - +#include /* * definitions for the ACPI scanning code */ @@ -1430,3 +1430,8 @@ static int __init parse_amd_iommu_options(char *str) __setup("amd_iommu_dump", parse_amd_iommu_dump); __setup("amd_iommu=", parse_amd_iommu_options); + +IOMMU_INIT_FINISH(amd_iommu_detect, + gart_iommu_hole_init, + 0, + 0); diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 0f7f130caa6..de9734b100a 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -41,6 +41,7 @@ #include #include #include +#include static unsigned long iommu_bus_base; /* GART remapping area (physical) */ static unsigned long iommu_size; /* size of remapping area bytes */ @@ -896,3 +897,4 @@ void __init gart_parse_options(char *p) } } } +IOMMU_INIT_POST(gart_iommu_hole_init); -- cgit v1.2.3-18-g5258 From ee1f284f38c8dfcbc7b656915a039dde016de7d3 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 26 Aug 2010 13:58:05 -0400 Subject: x86, iommu: Utilize the IOMMU_INIT macros functionality. We remove all of the sub-platform detection/init routines and instead use on the .iommu_table array of structs to call the .early_init if .detect returned a positive value. Also we can stop detecting other IOMMUs if the IOMMU used the _FINISH type macro. During the 'pci_iommu_init' stage, we call .init for the second-stage initialization if it was defined. Currently only SWIOTLB has this defined and it used to de-allocate the SWIOTLB if the other detected IOMMUs have deemed it unnecessary to use SWIOTLB. Signed-off-by: Konrad Rzeszutek Wilk LKML-Reference: <1282845485-8991-11-git-send-email-konrad.wilk@oracle.com> CC: Fujita Tomonori CC: Thomas Gleixner CC: Ingo Molnar Signed-off-by: Konrad Rzeszutek Wilk Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/pci-dma.c | 46 +++++++++++++++++++++------------------------- 1 file changed, 21 insertions(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 1b3beb5075e..9ea999a4dcc 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -11,9 +11,8 @@ #include #include #include -#include #include -#include +#include static int forbid_dac __read_mostly; @@ -45,6 +44,8 @@ int iommu_detected __read_mostly = 0; */ int iommu_pass_through __read_mostly; +extern struct iommu_table_entry __iommu_table[], __iommu_table_end[]; + /* Dummy device used for NULL arguments (normally ISA). */ struct device x86_dma_fallback_dev = { .init_name = "fallback device", @@ -130,28 +131,24 @@ static void __init dma32_free_bootmem(void) void __init pci_iommu_alloc(void) { + struct iommu_table_entry *p; + /* free the range so iommu could get some range less than 4G */ dma32_free_bootmem(); - if (pci_xen_swiotlb_detect() || pci_swiotlb_detect_override()) - goto out; - - pci_swiotlb_detect_4gb(); - - gart_iommu_hole_init(); - - detect_calgary(); - - detect_intel_iommu(); + sort_iommu_table(__iommu_table, __iommu_table_end); + check_iommu_entries(__iommu_table, __iommu_table_end); - /* needs to be called after gart_iommu_hole_init */ - amd_iommu_detect(); -out: - pci_xen_swiotlb_init(); - - pci_swiotlb_init(); + for (p = __iommu_table; p < __iommu_table_end; p++) { + if (p && p->detect && p->detect() > 0) { + p->flags |= IOMMU_DETECTED; + if (p->early_init) + p->early_init(); + if (p->flags & IOMMU_FINISH_IF_DETECTED) + break; + } + } } - void *dma_generic_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, gfp_t flag) { @@ -294,6 +291,7 @@ EXPORT_SYMBOL(dma_supported); static int __init pci_iommu_init(void) { + struct iommu_table_entry *p; dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES); #ifdef CONFIG_PCI @@ -301,12 +299,10 @@ static int __init pci_iommu_init(void) #endif x86_init.iommu.iommu_init(); - if (swiotlb || xen_swiotlb) { - printk(KERN_INFO "PCI-DMA: " - "Using software bounce buffering for IO (SWIOTLB)\n"); - swiotlb_print_info(); - } else - swiotlb_free(); + for (p = __iommu_table; p < __iommu_table_end; p++) { + if (p && (p->flags & IOMMU_DETECTED) && p->late_init) + p->late_init(); + } return 0; } -- cgit v1.2.3-18-g5258 From 660a293ea9be709b893d371fbc0328fcca33c33a Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Tue, 27 Jul 2010 16:06:28 +0800 Subject: x86, mm: Make spurious_fault check explicitly check the PRESENT bit pte_present() returns true even present bit isn't set but _PAGE_PROTNONE (global bit) bit is set. While with CONFIG_DEBUG_PAGEALLOC, free pages have global bit set but present bit clear. This patch makes we could catch free pages access with CONFIG_DEBUG_PAGEALLOC enabled. [ hpa: added a comment in the code as a warning to janitors ] Signed-off-by: Shaohua Li LKML-Reference: <1280217988.32400.75.camel@sli10-desk.sh.intel.com> Signed-off-by: H. Peter Anvin --- arch/x86/mm/fault.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 51f7ee71d6c..caec22906d7 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -872,8 +872,14 @@ spurious_fault(unsigned long error_code, unsigned long address) if (pmd_large(*pmd)) return spurious_fault_check(error_code, (pte_t *) pmd); + /* + * Note: don't use pte_present() here, since it returns true + * if the _PAGE_PROTNONE bit is set. 
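/*
 * For illustration (not part of the patch): pte_present() is defined
 * in arch/x86/include/asm/pgtable.h roughly as
 *
 *      static inline int pte_present(pte_t a)
 *      {
 *              return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE);
 *      }
 *
 * so a pte whose Present bit is clear but whose Global/PROTNONE bit is
 * still set is reported as present.  That is exactly the state of a
 * kernel page unmapped by DEBUG_PAGEALLOC, hence the explicit
 * _PAGE_PRESENT test below.
 */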
However, this aliases the + * _PAGE_GLOBAL bit, which for kernel pages give false positives + * when CONFIG_DEBUG_PAGEALLOC is used. + */ pte = pte_offset_kernel(pmd, address); - if (!pte_present(*pte)) + if (!(pte_flags(*pte) & _PAGE_PRESENT)) return 0; ret = spurious_fault_check(error_code, pte); -- cgit v1.2.3-18-g5258 From fb74fb6db91abc3c1ceeb9d2c17b44866a12c63e Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 25 Aug 2010 13:39:15 -0700 Subject: x86, memblock: Add memblock_x86_find_in_range_size() size is returned according free range. Will be used to find free ranges for early_memtest and memory corruption check Do not mess it up with lib/memblock.c yet. Signed-off-by: Yinghai Lu Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/memblock.h | 8 ++++ arch/x86/mm/Makefile | 2 + arch/x86/mm/memblock.c | 87 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 97 insertions(+) create mode 100644 arch/x86/include/asm/memblock.h create mode 100644 arch/x86/mm/memblock.c (limited to 'arch/x86') diff --git a/arch/x86/include/asm/memblock.h b/arch/x86/include/asm/memblock.h new file mode 100644 index 00000000000..c14219a2616 --- /dev/null +++ b/arch/x86/include/asm/memblock.h @@ -0,0 +1,8 @@ +#ifndef _X86_MEMBLOCK_H +#define _X86_MEMBLOCK_H + +#define ARCH_DISCARD_MEMBLOCK + +u64 memblock_x86_find_in_range_size(u64 start, u64 *sizep, u64 align); + +#endif diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index a4c768397ba..55543397a8a 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -26,4 +26,6 @@ obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o obj-$(CONFIG_K8_NUMA) += k8topology_64.o obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o +obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o + obj-$(CONFIG_MEMTEST) += memtest.o diff --git a/arch/x86/mm/memblock.c b/arch/x86/mm/memblock.c new file mode 100644 index 00000000000..26ba46234cb --- /dev/null +++ b/arch/x86/mm/memblock.c @@ -0,0 +1,87 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +/* Check for already reserved areas */ +static inline bool __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align) +{ + struct memblock_region *r; + u64 addr = *addrp, last; + u64 size = *sizep; + bool changed = false; + +again: + last = addr + size; + for_each_memblock(reserved, r) { + if (last > r->base && addr < r->base) { + size = r->base - addr; + changed = true; + goto again; + } + if (last > (r->base + r->size) && addr < (r->base + r->size)) { + addr = round_up(r->base + r->size, align); + size = last - addr; + changed = true; + goto again; + } + if (last <= (r->base + r->size) && addr >= r->base) { + (*sizep)++; + return false; + } + } + if (changed) { + *addrp = addr; + *sizep = size; + } + return changed; +} + +static u64 __init __memblock_x86_find_in_range_size(u64 ei_start, u64 ei_last, u64 start, + u64 *sizep, u64 align) +{ + u64 addr, last; + + addr = round_up(ei_start, align); + if (addr < start) + addr = round_up(start, align); + if (addr >= ei_last) + goto out; + *sizep = ei_last - addr; + while (bad_addr_size(&addr, sizep, align) && addr + *sizep <= ei_last) + ; + last = addr + *sizep; + if (last > ei_last) + goto out; + + return addr; + +out: + return MEMBLOCK_ERROR; +} + +/* + * Find next free range after start, and size is returned in *sizep + */ +u64 __init memblock_x86_find_in_range_size(u64 start, u64 *sizep, u64 align) +{ + struct memblock_region *r; + + for_each_memblock(memory, r) { + u64 ei_start = r->base; + u64 ei_last = ei_start + r->size; + u64 addr; + + addr = 
__memblock_x86_find_in_range_size(ei_start, ei_last, start, + sizep, align); + + if (addr != MEMBLOCK_ERROR) + return addr; + } + + return MEMBLOCK_ERROR; +} -- cgit v1.2.3-18-g5258 From f88eff74aa848e58b1ea49768c0bbb874b31357f Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 25 Aug 2010 13:39:15 -0700 Subject: bootmem, x86: Add weak version of reserve_bootmem_generic It will be used by memblock_x86_to_bootmem() for converting. It is a wrapper for reserve_bootmem, and x86 64-bit uses a special one. Also clean up that version for x86_64. We don't need to take care of the NUMA path for that; bootmem can handle it anyhow. Signed-off-by: Yinghai Lu Signed-off-by: H. Peter Anvin --- arch/x86/mm/init_32.c | 6 ------ arch/x86/mm/init_64.c | 20 ++------------------ 2 files changed, 2 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index bca79091b9d..90e054589aa 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -1069,9 +1069,3 @@ void mark_rodata_ro(void) #endif } #endif - -int __init reserve_bootmem_generic(unsigned long phys, unsigned long len, - int flags) -{ - return reserve_bootmem(phys, len, flags); -} diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index ee41bba315d..634fa0884a4 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -799,13 +799,10 @@ void mark_rodata_ro(void) #endif +#ifndef CONFIG_NO_BOOTMEM int __init reserve_bootmem_generic(unsigned long phys, unsigned long len, int flags) { -#ifdef CONFIG_NUMA - int nid, next_nid; - int ret; -#endif unsigned long pfn = phys >> PAGE_SHIFT; if (pfn >= max_pfn) { @@ -821,21 +818,7 @@ int __init reserve_bootmem_generic(unsigned long phys, unsigned long len, return -EFAULT; } - /* Should check here against the e820 map to avoid double free */ -#ifdef CONFIG_NUMA - nid = phys_to_nid(phys); - next_nid = phys_to_nid(phys + len - 1); - if (nid == next_nid) - ret = reserve_bootmem_node(NODE_DATA(nid), phys, len, flags); - else - ret = reserve_bootmem(phys, len, flags); - - if (ret != 0) - return ret; - -#else reserve_bootmem(phys, len, flags); -#endif if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) { dma_reserve += len / PAGE_SIZE; @@ -844,6 +827,7 @@ int __init reserve_bootmem_generic(unsigned long phys, unsigned long len, return 0; } +#endif int kern_addr_valid(unsigned long addr) { -- cgit v1.2.3-18-g5258 From 27de794365786b4cdc3461ed4e23af2a33f40612 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 25 Aug 2010 13:39:15 -0700 Subject: x86, memblock: Add memblock_x86_to_bootmem() memblock_x86_to_bootmem() will reserve memblock.reserved.region in bootmem after bootmem is set up. We can use it with all arches that support memblock later. Signed-off-by: Yinghai Lu Signed-off-by: H.
Peter Anvin --- arch/x86/include/asm/memblock.h | 1 + arch/x86/mm/memblock.c | 29 +++++++++++++++++++++++++++++ 2 files changed, 30 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/memblock.h b/arch/x86/include/asm/memblock.h index c14219a2616..69cf853e931 100644 --- a/arch/x86/include/asm/memblock.h +++ b/arch/x86/include/asm/memblock.h @@ -4,5 +4,6 @@ #define ARCH_DISCARD_MEMBLOCK u64 memblock_x86_find_in_range_size(u64 start, u64 *sizep, u64 align); +void memblock_x86_to_bootmem(u64 start, u64 end); #endif diff --git a/arch/x86/mm/memblock.c b/arch/x86/mm/memblock.c index 26ba46234cb..8101084d452 100644 --- a/arch/x86/mm/memblock.c +++ b/arch/x86/mm/memblock.c @@ -85,3 +85,32 @@ u64 __init memblock_x86_find_in_range_size(u64 start, u64 *sizep, u64 align) return MEMBLOCK_ERROR; } + +#ifndef CONFIG_NO_BOOTMEM +void __init memblock_x86_to_bootmem(u64 start, u64 end) +{ + int count; + u64 final_start, final_end; + struct memblock_region *r; + + /* Take out region array itself */ + memblock_free_reserved_regions(); + + count = memblock.reserved.cnt; + pr_info("(%d early reservations) ==> bootmem [%010llx-%010llx]\n", count, start, end - 1); + for_each_memblock(reserved, r) { + pr_info(" [%010llx-%010llx] ", (u64)r->base, (u64)r->base + r->size - 1); + final_start = max(start, r->base); + final_end = min(end, r->base + r->size); + if (final_start >= final_end) { + pr_cont("\n"); + continue; + } + pr_cont(" ==> [%010llx-%010llx]\n", final_start, final_end - 1); + reserve_bootmem_generic(final_start, final_end - final_start, BOOTMEM_DEFAULT); + } + + /* Put region array back ? */ + memblock_reserve_reserved_regions(); +} +#endif -- cgit v1.2.3-18-g5258 From 9dc5d569c133819c1ce069ebb1d771c62de32580 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 25 Aug 2010 13:39:15 -0700 Subject: x86, memblock: Add memblock_x86_reserve_range/memblock_x86_free_range They are wrappers for core versions, which take start/end/name instead of base/size. This will make x86 conversion eaasier. could add more debug print out -v2: change get_max_mapped() to memblock.default_alloc_limit according to Michael Ellerman and Ben change to memblock_x86_reserve_range and memblock_x86_free_range according to Michael Ellerman -v3: call check_and_double after reserve/free, so could avoid to use find_memblock_area. Suggested by Michael Ellerman Signed-off-by: Yinghai Lu Cc: Benjamin Herrenschmidt Cc: Michael Ellerman Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/memblock.h | 3 +++ arch/x86/mm/memblock.c | 22 ++++++++++++++++++++++ 2 files changed, 25 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/memblock.h b/arch/x86/include/asm/memblock.h index 69cf853e931..e11ddf059fa 100644 --- a/arch/x86/include/asm/memblock.h +++ b/arch/x86/include/asm/memblock.h @@ -6,4 +6,7 @@ u64 memblock_x86_find_in_range_size(u64 start, u64 *sizep, u64 align); void memblock_x86_to_bootmem(u64 start, u64 end); +void memblock_x86_reserve_range(u64 start, u64 end, char *name); +void memblock_x86_free_range(u64 start, u64 end); + #endif diff --git a/arch/x86/mm/memblock.c b/arch/x86/mm/memblock.c index 8101084d452..9829eaf1dbd 100644 --- a/arch/x86/mm/memblock.c +++ b/arch/x86/mm/memblock.c @@ -114,3 +114,25 @@ void __init memblock_x86_to_bootmem(u64 start, u64 end) memblock_reserve_reserved_regions(); } #endif + +void __init memblock_x86_reserve_range(u64 start, u64 end, char *name) +{ + if (start == end) + return; + + if (WARN_ONCE(start > end, "memblock_x86_reserve_range: wrong range [%#llx, %#llx]\n", start, end)) + return; + + memblock_reserve(start, end - start); +} + +void __init memblock_x86_free_range(u64 start, u64 end) +{ + if (start == end) + return; + + if (WARN_ONCE(start > end, "memblock_x86_free_range: wrong range [%#llx, %#llx]\n", start, end)) + return; + + memblock_free(start, end - start); +} -- cgit v1.2.3-18-g5258 From 4d5cf86ce187c0d3a4cdf233ab0cc6526ccbe01f Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 25 Aug 2010 13:39:16 -0700 Subject: x86, memblock: Add get_free_all_memory_range() get_free_all_memory_range is for CONFIG_NO_BOOTMEM=y, and will be called by free_all_memory_core_early(). It will use early_node_map aka active ranges subtract memblock.reserved to get all free range, and those ranges will convert to slab pages. -v4: increase range size Signed-off-by: Yinghai Lu Cc: Jan Beulich Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/memblock.h | 2 + arch/x86/mm/memblock.c | 98 ++++++++++++++++++++++++++++++++++++++++- 2 files changed, 99 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/memblock.h b/arch/x86/include/asm/memblock.h index e11ddf059fa..72639ce65e8 100644 --- a/arch/x86/include/asm/memblock.h +++ b/arch/x86/include/asm/memblock.h @@ -8,5 +8,7 @@ void memblock_x86_to_bootmem(u64 start, u64 end); void memblock_x86_reserve_range(u64 start, u64 end, char *name); void memblock_x86_free_range(u64 start, u64 end); +struct range; +int get_free_all_memory_range(struct range **rangep, int nodeid); #endif diff --git a/arch/x86/mm/memblock.c b/arch/x86/mm/memblock.c index 9829eaf1dbd..b4500604ab3 100644 --- a/arch/x86/mm/memblock.c +++ b/arch/x86/mm/memblock.c @@ -86,7 +86,103 @@ u64 __init memblock_x86_find_in_range_size(u64 start, u64 *sizep, u64 align) return MEMBLOCK_ERROR; } -#ifndef CONFIG_NO_BOOTMEM +static __init struct range *find_range_array(int count) +{ + u64 end, size, mem; + struct range *range; + + size = sizeof(struct range) * count; + end = memblock.current_limit; + + mem = memblock_find_in_range(0, end, size, sizeof(struct range)); + if (mem == MEMBLOCK_ERROR) + panic("can not find more space for range array"); + + /* + * This range is tempoaray, so don't reserve it, it will not be + * overlapped because We will not alloccate new buffer before + * We discard this one + */ + range = __va(mem); + memset(range, 0, size); + + return range; +} + +#ifdef CONFIG_NO_BOOTMEM +static void __init memblock_x86_subtract_reserved(struct range *range, int az) +{ + u64 final_start, final_end; + struct memblock_region *r; + + /* Take out region array itself at first*/ + memblock_free_reserved_regions(); + + pr_info("Subtract (%ld early reservations)\n", memblock.reserved.cnt); + + for_each_memblock(reserved, r) { + pr_info(" [%010llx-%010llx]\n", (u64)r->base, (u64)r->base + r->size - 1); + final_start = PFN_DOWN(r->base); + final_end = PFN_UP(r->base + r->size); + if (final_start >= final_end) + continue; + subtract_range(range, az, final_start, final_end); + } + + /* Put region array back ? 
*/ + memblock_reserve_reserved_regions(); +} + +struct count_data { + int nr; +}; + +static int __init count_work_fn(unsigned long start_pfn, + unsigned long end_pfn, void *datax) +{ + struct count_data *data = datax; + + data->nr++; + + return 0; +} + +static int __init count_early_node_map(int nodeid) +{ + struct count_data data; + + data.nr = 0; + work_with_active_regions(nodeid, count_work_fn, &data); + + return data.nr; +} + +int __init get_free_all_memory_range(struct range **rangep, int nodeid) +{ + int count; + struct range *range; + int nr_range; + + count = (memblock.reserved.cnt + count_early_node_map(nodeid)) * 2; + + range = find_range_array(count); + nr_range = 0; + + /* + * Use early_node_map[] and memblock.reserved.region to get range array + * at first + */ + nr_range = add_from_early_node_map(range, count, nr_range, nodeid); +#ifdef CONFIG_X86_32 + subtract_range(range, count, max_low_pfn, -1ULL); +#endif + memblock_x86_subtract_reserved(range, count); + nr_range = clean_sort_range(range, count); + + *rangep = range; + return nr_range; +} +#else void __init memblock_x86_to_bootmem(u64 start, u64 end) { int count; -- cgit v1.2.3-18-g5258 From 88ba088c18457caaf8d2e5f8d36becc731a3d4f6 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 25 Aug 2010 13:39:16 -0700 Subject: x86, memblock: Add memblock_x86_register_active_regions() and memblock_x86_hole_size() memblock_x86_register_active_regions() will be used to fill early_node_map, the result will be memblock.memory.region AND numa data memblock_x86_hole_size will be used to find hole size on memblock.memory.region with specified range. Signed-off-by: Yinghai Lu Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/memblock.h | 4 +++ arch/x86/mm/memblock.c | 66 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/memblock.h b/arch/x86/include/asm/memblock.h index 72639ce65e8..16af28d3607 100644 --- a/arch/x86/include/asm/memblock.h +++ b/arch/x86/include/asm/memblock.h @@ -11,4 +11,8 @@ void memblock_x86_free_range(u64 start, u64 end); struct range; int get_free_all_memory_range(struct range **rangep, int nodeid); +void memblock_x86_register_active_regions(int nid, unsigned long start_pfn, + unsigned long last_pfn); +u64 memblock_x86_hole_size(u64 start, u64 end); + #endif diff --git a/arch/x86/mm/memblock.c b/arch/x86/mm/memblock.c index b4500604ab3..53a7a5aebd6 100644 --- a/arch/x86/mm/memblock.c +++ b/arch/x86/mm/memblock.c @@ -232,3 +232,69 @@ void __init memblock_x86_free_range(u64 start, u64 end) memblock_free(start, end - start); } + +/* + * Finds an active region in the address range from start_pfn to last_pfn and + * returns its range in ei_startpfn and ei_endpfn for the memblock entry. 
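/*
 * Hedged usage sketch (the caller name and the node range are made
 * up; the intended users are the NUMA/SRAT setup paths): register a
 * node's RAM with the active-range map, then measure how much of the
 * range is not covered by memory.
 *
 *      static void __init example_setup_node(int nid, u64 start, u64 end)
 *      {
 *              memblock_x86_register_active_regions(nid,
 *                              start >> PAGE_SHIFT, end >> PAGE_SHIFT);
 *
 *              if (memblock_x86_hole_size(start, end) > (end - start) / 2)
 *                      printk(KERN_WARNING
 *                             "node %d: over half the range is a hole\n",
 *                             nid);
 *      }
 */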
+ */ +static int __init memblock_x86_find_active_region(const struct memblock_region *ei, + unsigned long start_pfn, + unsigned long last_pfn, + unsigned long *ei_startpfn, + unsigned long *ei_endpfn) +{ + u64 align = PAGE_SIZE; + + *ei_startpfn = round_up(ei->base, align) >> PAGE_SHIFT; + *ei_endpfn = round_down(ei->base + ei->size, align) >> PAGE_SHIFT; + + /* Skip map entries smaller than a page */ + if (*ei_startpfn >= *ei_endpfn) + return 0; + + /* Skip if map is outside the node */ + if (*ei_endpfn <= start_pfn || *ei_startpfn >= last_pfn) + return 0; + + /* Check for overlaps */ + if (*ei_startpfn < start_pfn) + *ei_startpfn = start_pfn; + if (*ei_endpfn > last_pfn) + *ei_endpfn = last_pfn; + + return 1; +} + +/* Walk the memblock.memory map and register active regions within a node */ +void __init memblock_x86_register_active_regions(int nid, unsigned long start_pfn, + unsigned long last_pfn) +{ + unsigned long ei_startpfn; + unsigned long ei_endpfn; + struct memblock_region *r; + + for_each_memblock(memory, r) + if (memblock_x86_find_active_region(r, start_pfn, last_pfn, + &ei_startpfn, &ei_endpfn)) + add_active_range(nid, ei_startpfn, ei_endpfn); +} + +/* + * Find the hole size (in bytes) in the memory range. + * @start: starting address of the memory range to scan + * @end: ending address of the memory range to scan + */ +u64 __init memblock_x86_hole_size(u64 start, u64 end) +{ + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long last_pfn = end >> PAGE_SHIFT; + unsigned long ei_startpfn, ei_endpfn, ram = 0; + struct memblock_region *r; + + for_each_memblock(memory, r) + if (memblock_x86_find_active_region(r, start_pfn, last_pfn, + &ei_startpfn, &ei_endpfn)) + ram += ei_endpfn - ei_startpfn; + + return end - start - ((u64)ram << PAGE_SHIFT); +} -- cgit v1.2.3-18-g5258 From 6bcc8176d07f108da3b1af17fb2c0e82c80e948e Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 25 Aug 2010 13:39:16 -0700 Subject: x86, memblock: Add memblock_x86_find_in_range_node() It can be used to find NODE_DATA for numa. Need to make sure early_node_map[] is filled before it is called, otherwise it will fallback to memblock_find_in_range(), with node range. Signed-off-by: Yinghai Lu Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/memblock.h | 1 + arch/x86/mm/memblock.c | 15 +++++++++++++++ 2 files changed, 16 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/memblock.h b/arch/x86/include/asm/memblock.h index 16af28d3607..3a86b10380f 100644 --- a/arch/x86/include/asm/memblock.h +++ b/arch/x86/include/asm/memblock.h @@ -14,5 +14,6 @@ int get_free_all_memory_range(struct range **rangep, int nodeid); void memblock_x86_register_active_regions(int nid, unsigned long start_pfn, unsigned long last_pfn); u64 memblock_x86_hole_size(u64 start, u64 end); +u64 memblock_x86_find_in_range_node(int nid, u64 start, u64 end, u64 size, u64 align); #endif diff --git a/arch/x86/mm/memblock.c b/arch/x86/mm/memblock.c index 53a7a5aebd6..22ff0a39b22 100644 --- a/arch/x86/mm/memblock.c +++ b/arch/x86/mm/memblock.c @@ -233,6 +233,21 @@ void __init memblock_x86_free_range(u64 start, u64 end) memblock_free(start, end - start); } +/* + * Need to call this function after memblock_x86_register_active_regions, + * so early_node_map[] is filled already. 
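/*
 * Hedged sketch of the intended caller (simplified; the 64-bit NUMA
 * code uses this to place per-node data): try to allocate inside the
 * node's own range, then mark the result reserved.
 *
 *      u64 mem = memblock_x86_find_in_range_node(nid, start, end,
 *                                                size, PAGE_SIZE);
 *      if (mem == MEMBLOCK_ERROR)
 *              panic("cannot allocate node data for node %d\n", nid);
 *      memblock_x86_reserve_range(mem, mem + size, "NODE_DATA");
 */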
+ */ +u64 __init memblock_x86_find_in_range_node(int nid, u64 start, u64 end, u64 size, u64 align) +{ + u64 addr; + addr = find_memory_core_early(nid, size, align, start, end); + if (addr != MEMBLOCK_ERROR) + return addr; + + /* Fallback, should already have start end within node range */ + return memblock_find_in_range(start, end, size, align); +} + /* * Finds an active region in the address range from start_pfn to last_pfn and * returns its range in ei_startpfn and ei_endpfn for the memblock entry. -- cgit v1.2.3-18-g5258 From b52c17ce854125700c4e19d4427d39bf2504ff63 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 25 Aug 2010 13:39:16 -0700 Subject: x86, memblock: Add memblock_x86_free_memory_in_range() It will return free memory size in specified range. We can not use memory_size - reserved_size here, because some reserved area may not be in the scope of memblock.memory.region. Use memblock.memory.region subtracting memblock.reserved.region to get free range array. then count size of all free ranges. -v2: Ben insist on using _in_range Signed-off-by: Yinghai Lu Cc: Benjamin Herrenschmidt Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/memblock.h | 1 + arch/x86/mm/memblock.c | 48 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/memblock.h b/arch/x86/include/asm/memblock.h index 3a86b10380f..fc3c230812e 100644 --- a/arch/x86/include/asm/memblock.h +++ b/arch/x86/include/asm/memblock.h @@ -15,5 +15,6 @@ void memblock_x86_register_active_regions(int nid, unsigned long start_pfn, unsigned long last_pfn); u64 memblock_x86_hole_size(u64 start, u64 end); u64 memblock_x86_find_in_range_node(int nid, u64 start, u64 end, u64 size, u64 align); +u64 memblock_x86_free_memory_in_range(u64 addr, u64 limit); #endif diff --git a/arch/x86/mm/memblock.c b/arch/x86/mm/memblock.c index 22ff0a39b22..30d60cf29ce 100644 --- a/arch/x86/mm/memblock.c +++ b/arch/x86/mm/memblock.c @@ -211,6 +211,54 @@ void __init memblock_x86_to_bootmem(u64 start, u64 end) } #endif +u64 __init memblock_x86_free_memory_in_range(u64 addr, u64 limit) +{ + int i, count; + struct range *range; + int nr_range; + u64 final_start, final_end; + u64 free_size; + struct memblock_region *r; + + count = (memblock.reserved.cnt + memblock.memory.cnt) * 2; + + range = find_range_array(count); + nr_range = 0; + + addr = PFN_UP(addr); + limit = PFN_DOWN(limit); + + for_each_memblock(memory, r) { + final_start = PFN_UP(r->base); + final_end = PFN_DOWN(r->base + r->size); + if (final_start >= final_end) + continue; + if (final_start >= limit || final_end <= addr) + continue; + + nr_range = add_range(range, count, nr_range, final_start, final_end); + } + subtract_range(range, count, 0, addr); + subtract_range(range, count, limit, -1ULL); + for_each_memblock(reserved, r) { + final_start = PFN_DOWN(r->base); + final_end = PFN_UP(r->base + r->size); + if (final_start >= final_end) + continue; + if (final_start >= limit || final_end <= addr) + continue; + + subtract_range(range, count, final_start, final_end); + } + nr_range = clean_sort_range(range, count); + + free_size = 0; + for (i = 0; i < nr_range; i++) + free_size += range[i].end - range[i].start; + + return free_size << PAGE_SHIFT; +} + void __init memblock_x86_reserve_range(u64 start, u64 end, char *name) { if (start == end) -- cgit v1.2.3-18-g5258 From e82d42be24bd5d75bf6f81045636e6ca95ab55f2 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 25 Aug 2010 13:39:17 -0700 Subject: x86, memblock: Add 
memblock_x86_memory_in_range() It will return memory size in specified range according to memblock.memory.region Try to share some code with memblock_x86_free_memory_in_range() by passing get_free to __memblock_x86_memory_in_range(). -v2: Ben want _in_range in the name instead of size Signed-off-by: Yinghai Lu Cc: Benjamin Herrenschmidt Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/memblock.h | 1 + arch/x86/mm/memblock.c | 18 +++++++++++++++++- 2 files changed, 18 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/memblock.h b/arch/x86/include/asm/memblock.h index fc3c230812e..2c304bb6e07 100644 --- a/arch/x86/include/asm/memblock.h +++ b/arch/x86/include/asm/memblock.h @@ -16,5 +16,6 @@ void memblock_x86_register_active_regions(int nid, unsigned long start_pfn, u64 memblock_x86_hole_size(u64 start, u64 end); u64 memblock_x86_find_in_range_node(int nid, u64 start, u64 end, u64 size, u64 align); u64 memblock_x86_free_memory_in_range(u64 addr, u64 limit); +u64 memblock_x86_memory_in_range(u64 addr, u64 limit); #endif diff --git a/arch/x86/mm/memblock.c b/arch/x86/mm/memblock.c index 30d60cf29ce..32ddad5dc93 100644 --- a/arch/x86/mm/memblock.c +++ b/arch/x86/mm/memblock.c @@ -211,7 +211,7 @@ void __init memblock_x86_to_bootmem(u64 start, u64 end) } #endif -u64 __init memblock_x86_free_memory_in_range(u64 addr, u64 limit) +static u64 __init __memblock_x86_memory_in_range(u64 addr, u64 limit, bool get_free) { int i, count; struct range *range; @@ -240,6 +240,10 @@ u64 __init memblock_x86_free_memory_in_range(u64 addr, u64 limit) } subtract_range(range, count, 0, addr); subtract_range(range, count, limit, -1ULL); + + /* Subtract memblock.reserved.region in range ? */ + if (!get_free) + goto sort_and_count_them; for_each_memblock(reserved, r) { final_start = PFN_DOWN(r->base); final_end = PFN_UP(r->base + r->size); @@ -250,6 +254,8 @@ u64 __init memblock_x86_free_memory_in_range(u64 addr, u64 limit) subtract_range(range, count, final_start, final_end); } + +sort_and_count_them: nr_range = clean_sort_range(range, count); free_size = 0; @@ -259,6 +265,16 @@ u64 __init memblock_x86_free_memory_in_range(u64 addr, u64 limit) return free_size << PAGE_SHIFT; } +u64 __init memblock_x86_free_memory_in_range(u64 addr, u64 limit) +{ + return __memblock_x86_memory_in_range(addr, limit, true); +} + +u64 __init memblock_x86_memory_in_range(u64 addr, u64 limit) +{ + return __memblock_x86_memory_in_range(addr, limit, false); +} + void __init memblock_x86_reserve_range(u64 start, u64 end, char *name) { if (start == end) -- cgit v1.2.3-18-g5258 From 301ff3e88ef9ff4bdb92f36a3e6170fce4c9dd34 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 25 Aug 2010 13:39:17 -0700 Subject: x86, memblock: Use memblock_debug to control debug message print out Also let memblock_x86_reserve_range/memblock_x86_free_range could print out name if memblock=debug is specified will also print ther name when reserve_memblock_area/free_memblock_area are called. -v2: according to Ingo, put " if (memblock_debug) " in one place Signed-off-by: Yinghai Lu Signed-off-by: H. 
Peter Anvin --- arch/x86/mm/memblock.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/memblock.c b/arch/x86/mm/memblock.c index 32ddad5dc93..aaff3932588 100644 --- a/arch/x86/mm/memblock.c +++ b/arch/x86/mm/memblock.c @@ -118,10 +118,10 @@ static void __init memblock_x86_subtract_reserved(struct range *range, int az) /* Take out region array itself at first*/ memblock_free_reserved_regions(); - pr_info("Subtract (%ld early reservations)\n", memblock.reserved.cnt); + memblock_dbg("Subtract (%ld early reservations)\n", memblock.reserved.cnt); for_each_memblock(reserved, r) { - pr_info(" [%010llx-%010llx]\n", (u64)r->base, (u64)r->base + r->size - 1); + memblock_dbg(" [%010llx-%010llx]\n", (u64)r->base, (u64)r->base + r->size - 1); final_start = PFN_DOWN(r->base); final_end = PFN_UP(r->base + r->size); if (final_start >= final_end) @@ -193,16 +193,16 @@ void __init memblock_x86_to_bootmem(u64 start, u64 end) memblock_free_reserved_regions(); count = memblock.reserved.cnt; - pr_info("(%d early reservations) ==> bootmem [%010llx-%010llx]\n", count, start, end - 1); + memblock_dbg("(%d early reservations) ==> bootmem [%#010llx-%#010llx]\n", count, start, end - 1); for_each_memblock(reserved, r) { - pr_info(" [%010llx-%010llx] ", (u64)r->base, (u64)r->base + r->size - 1); + memblock_dbg(" [%#010llx-%#010llx] ", (u64)r->base, (u64)r->base + r->size - 1); final_start = max(start, r->base); final_end = min(end, r->base + r->size); if (final_start >= final_end) { - pr_cont("\n"); + memblock_dbg("\n"); continue; } - pr_cont(" ==> [%010llx-%010llx]\n", final_start, final_end - 1); + memblock_dbg(" ==> [%#010llx-%#010llx]\n", final_start, final_end - 1); reserve_bootmem_generic(final_start, final_end - final_start, BOOTMEM_DEFAULT); } @@ -280,9 +280,11 @@ void __init memblock_x86_reserve_range(u64 start, u64 end, char *name) if (start == end) return; - if (WARN_ONCE(start > end, "memblock_x86_reserve_range: wrong range [%#llx, %#llx]\n", start, end)) + if (WARN_ONCE(start > end, "memblock_x86_reserve_range: wrong range [%#llx, %#llx)\n", start, end)) return; + memblock_dbg(" memblock_x86_reserve_range: [%#010llx-%#010llx] %16s\n", start, end - 1, name); + memblock_reserve(start, end - start); } @@ -291,9 +293,11 @@ void __init memblock_x86_free_range(u64 start, u64 end) if (start == end) return; - if (WARN_ONCE(start > end, "memblock_x86_free_range: wrong range [%#llx, %#llx]\n", start, end)) + if (WARN_ONCE(start > end, "memblock_x86_free_range: wrong range [%#llx, %#llx)\n", start, end)) return; + memblock_dbg(" memblock_x86_free_range: [%#010llx-%#010llx]\n", start, end - 1); + memblock_free(start, end - start); } -- cgit v1.2.3-18-g5258 From 72d7c3b33c980843e756681fb4867dc1efd62a76 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 25 Aug 2010 13:39:17 -0700 Subject: x86: Use memblock to replace early_res 1. replace find_e820_area with memblock_find_in_range 2. replace reserve_early with memblock_x86_reserve_range 3. replace free_early with memblock_x86_free_range. 4. NO_BOOTMEM will switch to use memblock too. 5. use _e820, _early wrap in the patch, in following patch, will replace them all 6. because memblock_x86_free_range support partial free, we can remove some special care 7. 
Need to make sure that memblock_find_in_range() is called after memblock_x86_fill(), so adjust some callers later in setup.c::setup_arch() -- corruption_check and mptable_update. -v2: Move reserve_brk() early, before fill_memblock_area(), to avoid an overlap between brk and memblock_find_in_range(): when there are more than 128 RAM entries in the E820 table, memblock_x86_fill() could use memblock_find_in_range() to find a new place for the memblock.memory.region array, and we don't need to use extend_brk() after fill_memblock_area(), so move reserve_brk() early, before fill_memblock_area(). -v3: Move find_smp_config() early, to make sure memblock_find_in_range() does not find a wrong place if the BIOS doesn't put the mptable in the right place. -v4: Treat RESERVED_KERN as RAM in memblock.memory; those ranges are already in memblock.reserved anyway. Use __NOT_KEEP_MEMBLOCK to make sure memblock-related code can be freed later. -v5: The generic __memblock_find_in_range() goes from high to low, and the active_region for 32-bit does include high pages, so replace the limit with memblock.default_alloc_limit, aka get_max_mapped(). -v6: Use current_limit instead. -v7: Check with MEMBLOCK_ERROR instead of -1ULL or -1L. -v8: Set memblock_can_resize early to handle EFI with more RAM entries. -v9: Update after kmemleak changes in mainline. Suggested-by: David S. Miller Suggested-by: Benjamin Herrenschmidt Suggested-by: Thomas Gleixner Signed-off-by: Yinghai Lu Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 9 +-- arch/x86/include/asm/e820.h | 14 ++-- arch/x86/kernel/check.c | 16 +++-- arch/x86/kernel/e820.c | 159 ++++++++++++++--------------------------- arch/x86/kernel/head.c | 3 +- arch/x86/kernel/head32.c | 6 +- arch/x86/kernel/head64.c | 3 + arch/x86/kernel/mpparse.c | 5 +- arch/x86/kernel/setup.c | 46 ++++++++---- arch/x86/kernel/setup_percpu.c | 6 -- arch/x86/mm/numa_64.c | 9 +-- 11 files changed, 123 insertions(+), 153 deletions(-) (limited to 'arch/x86')
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index dcb0593b4a6..542bb2610cb 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -27,6 +27,7 @@ config X86 select HAVE_PERF_EVENTS if (!M386 && !M486) select HAVE_IOREMAP_PROT select HAVE_KPROBES + select HAVE_MEMBLOCK select ARCH_WANT_OPTIONAL_GPIOLIB select ARCH_WANT_FRAME_POINTERS select HAVE_DMA_ATTRS @@ -195,9 +196,6 @@ config ARCH_SUPPORTS_OPTIMIZED_INLINING config ARCH_SUPPORTS_DEBUG_PAGEALLOC def_bool y -config HAVE_EARLY_RES - def_bool y - config HAVE_INTEL_TXT def_bool y depends on EXPERIMENTAL && DMAR && ACPI @@ -590,14 +588,13 @@ config NO_BOOTMEM default y bool "Disable Bootmem code" ---help--- - Use early_res directly instead of bootmem before slab is ready. + Use memblock directly instead of bootmem before slab is ready.
- allocator (buddy) [generic] - early allocator (bootmem) [generic] - - very early allocator (reserve_early*()) [x86] + - very early allocator (memblock) [some generic] - very very early allocator (early brk model) [x86] So reduce one layer between early allocator to final allocator - config MEMTEST bool "Memtest" ---help--- diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h index ec8a52d14ab..388fed29146 100644 --- a/arch/x86/include/asm/e820.h +++ b/arch/x86/include/asm/e820.h @@ -117,24 +117,26 @@ extern unsigned long end_user_pfn; extern u64 find_e820_area(u64 start, u64 end, u64 size, u64 align); extern u64 find_e820_area_size(u64 start, u64 *sizep, u64 align); extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align); -#include extern unsigned long e820_end_of_ram_pfn(void); extern unsigned long e820_end_of_low_ram_pfn(void); -extern int e820_find_active_region(const struct e820entry *ei, - unsigned long start_pfn, - unsigned long last_pfn, - unsigned long *ei_startpfn, - unsigned long *ei_endpfn); extern void e820_register_active_regions(int nid, unsigned long start_pfn, unsigned long end_pfn); extern u64 e820_hole_size(u64 start, u64 end); + +extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align); + +void memblock_x86_fill(void); + extern void finish_e820_parsing(void); extern void e820_reserve_resources(void); extern void e820_reserve_resources_late(void); extern void setup_memory_map(void); extern char *default_machine_specific_memory_setup(void); +void reserve_early(u64 start, u64 end, char *name); +void free_early(u64 start, u64 end); + /* * Returns true iff the specified range [s,e) is completely contained inside * the ISA region. diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c index fc999e6fc46..13a38917951 100644 --- a/arch/x86/kernel/check.c +++ b/arch/x86/kernel/check.c @@ -2,7 +2,8 @@ #include #include #include -#include +#include + #include /* @@ -18,10 +19,12 @@ static int __read_mostly memory_corruption_check = -1; static unsigned __read_mostly corruption_check_size = 64*1024; static unsigned __read_mostly corruption_check_period = 60; /* seconds */ -static struct e820entry scan_areas[MAX_SCAN_AREAS]; +static struct scan_area { + u64 addr; + u64 size; +} scan_areas[MAX_SCAN_AREAS]; static int num_scan_areas; - static __init int set_corruption_check(char *arg) { char *end; @@ -81,9 +84,9 @@ void __init setup_bios_corruption_check(void) while (addr < corruption_check_size && num_scan_areas < MAX_SCAN_AREAS) { u64 size; - addr = find_e820_area_size(addr, &size, PAGE_SIZE); + addr = memblock_x86_find_in_range_size(addr, &size, PAGE_SIZE); - if (!(addr + 1)) + if (addr == MEMBLOCK_ERROR) break; if (addr >= corruption_check_size) @@ -92,7 +95,7 @@ void __init setup_bios_corruption_check(void) if ((addr + size) > corruption_check_size) size = corruption_check_size - addr; - e820_update_range(addr, size, E820_RAM, E820_RESERVED); + memblock_x86_reserve_range(addr, addr + size, "SCAN RAM"); scan_areas[num_scan_areas].addr = addr; scan_areas[num_scan_areas].size = size; num_scan_areas++; @@ -105,7 +108,6 @@ void __init setup_bios_corruption_check(void) printk(KERN_INFO "Scanning %d areas for low memory corruption\n", num_scan_areas); - update_e820(); } diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 0d6fc71bedb..a9221d18a5e 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -742,69 +743,29 @@ 
core_initcall(e820_mark_nvs_memory); */ u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align) { - int i; - - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - u64 addr; - u64 ei_start, ei_last; + u64 mem = memblock_find_in_range(start, end, size, align); - if (ei->type != E820_RAM) - continue; - - ei_last = ei->addr + ei->size; - ei_start = ei->addr; - addr = find_early_area(ei_start, ei_last, start, end, - size, align); - - if (addr != -1ULL) - return addr; - } - return -1ULL; -} + if (mem == MEMBLOCK_ERROR) + return -1ULL; -u64 __init find_fw_memmap_area(u64 start, u64 end, u64 size, u64 align) -{ - return find_e820_area(start, end, size, align); + return mem; } -u64 __init get_max_mapped(void) -{ - u64 end = max_pfn_mapped; - - end <<= PAGE_SHIFT; - - return end; -} /* * Find next free range after *start */ u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align) { - int i; + u64 mem = memblock_x86_find_in_range_size(start, sizep, align); - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - u64 addr; - u64 ei_start, ei_last; - - if (ei->type != E820_RAM) - continue; - - ei_last = ei->addr + ei->size; - ei_start = ei->addr; - addr = find_early_area_size(ei_start, ei_last, start, - sizep, align); + if (mem == MEMBLOCK_ERROR) + return -1ULL - if (addr != -1ULL) - return addr; - } - - return -1ULL; + return mem; } /* - * pre allocated 4k and reserved it in e820 + * pre allocated 4k and reserved it in memblock and e820_saved */ u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align) { @@ -813,8 +774,8 @@ u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align) u64 start; for (start = startt; ; start += size) { - start = find_e820_area_size(start, &size, align); - if (!(start + 1)) + start = memblock_x86_find_in_range_size(start, &size, align); + if (start == MEMBLOCK_ERROR) return 0; if (size >= sizet) break; @@ -830,10 +791,9 @@ u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align) addr = round_down(start + size - sizet, align); if (addr < start) return 0; - e820_update_range(addr, sizet, E820_RAM, E820_RESERVED); + memblock_x86_reserve_range(addr, addr + sizet, "new next"); e820_update_range_saved(addr, sizet, E820_RAM, E820_RESERVED); - printk(KERN_INFO "update e820 for early_reserve_e820\n"); - update_e820(); + printk(KERN_INFO "update e820_saved for early_reserve_e820\n"); update_e820_saved(); return addr; @@ -895,52 +855,12 @@ unsigned long __init e820_end_of_low_ram_pfn(void) { return e820_end_pfn(1UL<<(32 - PAGE_SHIFT), E820_RAM); } -/* - * Finds an active region in the address range from start_pfn to last_pfn and - * returns its range in ei_startpfn and ei_endpfn for the e820 entry. 
- */ -int __init e820_find_active_region(const struct e820entry *ei, - unsigned long start_pfn, - unsigned long last_pfn, - unsigned long *ei_startpfn, - unsigned long *ei_endpfn) -{ - u64 align = PAGE_SIZE; - - *ei_startpfn = round_up(ei->addr, align) >> PAGE_SHIFT; - *ei_endpfn = round_down(ei->addr + ei->size, align) >> PAGE_SHIFT; - - /* Skip map entries smaller than a page */ - if (*ei_startpfn >= *ei_endpfn) - return 0; - - /* Skip if map is outside the node */ - if (ei->type != E820_RAM || *ei_endpfn <= start_pfn || - *ei_startpfn >= last_pfn) - return 0; - - /* Check for overlaps */ - if (*ei_startpfn < start_pfn) - *ei_startpfn = start_pfn; - if (*ei_endpfn > last_pfn) - *ei_endpfn = last_pfn; - - return 1; -} /* Walk the e820 map and register active regions within a node */ void __init e820_register_active_regions(int nid, unsigned long start_pfn, unsigned long last_pfn) { - unsigned long ei_startpfn; - unsigned long ei_endpfn; - int i; - - for (i = 0; i < e820.nr_map; i++) - if (e820_find_active_region(&e820.map[i], - start_pfn, last_pfn, - &ei_startpfn, &ei_endpfn)) - add_active_range(nid, ei_startpfn, ei_endpfn); + memblock_x86_register_active_regions(nid, start_pfn, last_pfn); } /* @@ -950,18 +870,16 @@ void __init e820_register_active_regions(int nid, unsigned long start_pfn, */ u64 __init e820_hole_size(u64 start, u64 end) { - unsigned long start_pfn = start >> PAGE_SHIFT; - unsigned long last_pfn = end >> PAGE_SHIFT; - unsigned long ei_startpfn, ei_endpfn, ram = 0; - int i; + return memblock_x86_hole_size(start, end); +} - for (i = 0; i < e820.nr_map; i++) { - if (e820_find_active_region(&e820.map[i], - start_pfn, last_pfn, - &ei_startpfn, &ei_endpfn)) - ram += ei_endpfn - ei_startpfn; - } - return end - start - ((u64)ram << PAGE_SHIFT); +void reserve_early(u64 start, u64 end, char *name) +{ + memblock_x86_reserve_range(start, end, name); +} +void free_early(u64 start, u64 end) +{ + memblock_x86_free_range(start, end); } static void early_panic(char *msg) @@ -1210,3 +1128,32 @@ void __init setup_memory_map(void) printk(KERN_INFO "BIOS-provided physical RAM map:\n"); e820_print_map(who); } + +void __init memblock_x86_fill(void) +{ + int i; + u64 end; + + /* + * EFI may have more than 128 entries + * We are safe to enable resizing, beause memblock_x86_fill() + * is rather later for x86 + */ + memblock_can_resize = 1; + + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + + end = ei->addr + ei->size; + if (end != (resource_size_t)end) + continue; + + if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN) + continue; + + memblock_add(ei->addr, ei->size); + } + + memblock_analyze(); + memblock_dump_all(); +} diff --git a/arch/x86/kernel/head.c b/arch/x86/kernel/head.c index 3e66bd364a9..af0699ba48c 100644 --- a/arch/x86/kernel/head.c +++ b/arch/x86/kernel/head.c @@ -1,5 +1,6 @@ #include #include +#include #include #include @@ -51,5 +52,5 @@ void __init reserve_ebda_region(void) lowmem = 0x9f000; /* reserve all memory between lowmem and the 1MB mark */ - reserve_early_overlap_ok(lowmem, 0x100000, "BIOS reserved"); + memblock_x86_reserve_range(lowmem, 0x100000, "* BIOS reserved"); } diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index b2e24603739..da60aa8a850 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -30,14 +31,15 @@ static void __init i386_default_early_setup(void) void __init i386_start_kernel(void) { + memblock_init(); + #ifdef 
CONFIG_X86_TRAMPOLINE /* * But first pinch a few for the stack/trampoline stuff * FIXME: Don't need the extra page at 4K, but need to fix * trampoline before removing it. (see the GDT stuff) */ - reserve_early_overlap_ok(PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, - "EX TRAMPOLINE"); + memblock_x86_reserve_range(PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE"); #endif reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 7147143fd61..8ee930fdeeb 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -98,6 +99,8 @@ void __init x86_64_start_reservations(char *real_mode_data) { copy_bootdata(__va(real_mode_data)); + memblock_init(); + reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); #ifdef CONFIG_BLK_DEV_INITRD diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index d86dbf7e54b..8252545ae6f 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -641,7 +642,7 @@ static void __init smp_reserve_memory(struct mpf_intel *mpf) { unsigned long size = get_mpc_size(mpf->physptr); - reserve_early_overlap_ok(mpf->physptr, mpf->physptr+size, "MP-table mpc"); + memblock_x86_reserve_range(mpf->physptr, mpf->physptr+size, "* MP-table mpc"); } static int __init smp_scan_config(unsigned long base, unsigned long length) @@ -670,7 +671,7 @@ static int __init smp_scan_config(unsigned long base, unsigned long length) mpf, (u64)virt_to_phys(mpf)); mem = virt_to_phys(mpf); - reserve_early_overlap_ok(mem, mem + sizeof(*mpf), "MP-table mpf"); + memblock_x86_reserve_range(mem, mem + sizeof(*mpf), "* MP-table mpf"); if (mpf->physptr) smp_reserve_memory(mpf); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index b4ae4acbd03..bbe0aaf7749 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -614,7 +615,7 @@ static __init void reserve_ibft_region(void) addr = find_ibft_region(&size); if (size) - reserve_early_overlap_ok(addr, addr + size, "ibft"); + memblock_x86_reserve_range(addr, addr + size, "* ibft"); } #ifdef CONFIG_X86_RESERVE_LOW_64K @@ -708,6 +709,15 @@ static void __init trim_bios_range(void) sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); } +static u64 __init get_max_mapped(void) +{ + u64 end = max_pfn_mapped; + + end <<= PAGE_SHIFT; + + return end; +} + /* * Determine if we were loaded by an EFI loader. 
If so, then we have also been * passed the efi memmap, systab, etc., so we should use these data structures @@ -891,8 +901,6 @@ void __init setup_arch(char **cmdline_p) */ max_pfn = e820_end_of_ram_pfn(); - /* preallocate 4k for mptable mpc */ - early_reserve_e820_mpc_new(); /* update e820 for memory not covered by WB MTRRs */ mtrr_bp_init(); if (mtrr_trim_uncached_memory(max_pfn)) @@ -917,15 +925,6 @@ void __init setup_arch(char **cmdline_p) max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT; #endif -#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION - setup_bios_corruption_check(); -#endif - - printk(KERN_DEBUG "initial memory mapped : 0 - %08lx\n", - max_pfn_mapped< #include #include +#include #include #include #include @@ -171,8 +172,8 @@ static void * __init early_node_mem(int nodeid, unsigned long start, if (start < (MAX_DMA32_PFN< (MAX_DMA32_PFN< Date: Wed, 25 Aug 2010 13:39:17 -0700 Subject: x86, memblock: Replace e820_/_early string with memblock_ 1.include linux/memblock.h directly. so later could reduce e820.h reference. 2 this patch is done by sed scripts mainly -v2: use MEMBLOCK_ERROR instead of -1ULL or -1UL Signed-off-by: Yinghai Lu Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/efi.h | 2 +- arch/x86/kernel/acpi/sleep.c | 9 +++++---- arch/x86/kernel/apic/numaq_32.c | 3 ++- arch/x86/kernel/efi.c | 5 +++-- arch/x86/kernel/head32.c | 4 ++-- arch/x86/kernel/head64.c | 4 ++-- arch/x86/kernel/setup.c | 29 ++++++++++++++--------------- arch/x86/kernel/trampoline.c | 10 +++++----- arch/x86/mm/init.c | 10 ++++++---- arch/x86/mm/init_32.c | 14 ++++++++------ arch/x86/mm/init_64.c | 11 ++++++----- arch/x86/mm/k8topology_64.c | 4 +++- arch/x86/mm/memtest.c | 7 +++---- arch/x86/mm/numa_32.c | 25 +++++++++++++------------ arch/x86/mm/numa_64.c | 34 +++++++++++++++++----------------- arch/x86/mm/srat_32.c | 3 ++- arch/x86/mm/srat_64.c | 11 ++++++----- arch/x86/xen/mmu.c | 5 +++-- arch/x86/xen/setup.c | 3 ++- 19 files changed, 103 insertions(+), 90 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 8406ed7f992..8e4a16508d4 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -90,7 +90,7 @@ extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size, #endif /* CONFIG_X86_32 */ extern int add_efi_memmap; -extern void efi_reserve_early(void); +extern void efi_memblock_x86_reserve_range(void); extern void efi_call_phys_prelog(void); extern void efi_call_phys_epilog(void); diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index fcc3c61fdec..d829e75f968 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -125,7 +126,7 @@ void acpi_restore_state_mem(void) */ void __init acpi_reserve_wakeup_memory(void) { - unsigned long mem; + phys_addr_t mem; if ((&wakeup_code_end - &wakeup_code_start) > WAKEUP_SIZE) { printk(KERN_ERR @@ -133,15 +134,15 @@ void __init acpi_reserve_wakeup_memory(void) return; } - mem = find_e820_area(0, 1<<20, WAKEUP_SIZE, PAGE_SIZE); + mem = memblock_find_in_range(0, 1<<20, WAKEUP_SIZE, PAGE_SIZE); - if (mem == -1L) { + if (mem == MEMBLOCK_ERROR) { printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n"); return; } acpi_realmode = (unsigned long) phys_to_virt(mem); acpi_wakeup_address = mem; - reserve_early(mem, mem + WAKEUP_SIZE, "ACPI WAKEUP"); + memblock_x86_reserve_range(mem, mem + WAKEUP_SIZE, "ACPI WAKEUP"); } diff --git a/arch/x86/kernel/apic/numaq_32.c 
b/arch/x86/kernel/apic/numaq_32.c index 3e28401f161..960f26ab5c9 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -88,7 +89,7 @@ static inline void numaq_register_node(int node, struct sys_cfg_data *scd) node_end_pfn[node] = MB_TO_PAGES(eq->hi_shrd_mem_start + eq->hi_shrd_mem_size); - e820_register_active_regions(node, node_start_pfn[node], + memblock_x86_register_active_regions(node, node_start_pfn[node], node_end_pfn[node]); memory_present(node, node_start_pfn[node], node_end_pfn[node]); diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c index c2fa9b8b497..0fe27d7c625 100644 --- a/arch/x86/kernel/efi.c +++ b/arch/x86/kernel/efi.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -275,7 +276,7 @@ static void __init do_add_efi_memmap(void) sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); } -void __init efi_reserve_early(void) +void __init efi_memblock_x86_reserve_range(void) { unsigned long pmap; @@ -290,7 +291,7 @@ void __init efi_reserve_early(void) boot_params.efi_info.efi_memdesc_size; memmap.desc_version = boot_params.efi_info.efi_memdesc_version; memmap.desc_size = boot_params.efi_info.efi_memdesc_size; - reserve_early(pmap, pmap + memmap.nr_map * memmap.desc_size, + memblock_x86_reserve_range(pmap, pmap + memmap.nr_map * memmap.desc_size, "EFI memmap"); } diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index da60aa8a850..74e4cf65043 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -42,7 +42,7 @@ void __init i386_start_kernel(void) memblock_x86_reserve_range(PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE"); #endif - reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); + memblock_x86_reserve_range(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); #ifdef CONFIG_BLK_DEV_INITRD /* Reserve INITRD */ @@ -51,7 +51,7 @@ void __init i386_start_kernel(void) u64 ramdisk_image = boot_params.hdr.ramdisk_image; u64 ramdisk_size = boot_params.hdr.ramdisk_size; u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); - reserve_early(ramdisk_image, ramdisk_end, "RAMDISK"); + memblock_x86_reserve_range(ramdisk_image, ramdisk_end, "RAMDISK"); } #endif diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 8ee930fdeeb..97adf9828b9 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -101,7 +101,7 @@ void __init x86_64_start_reservations(char *real_mode_data) memblock_init(); - reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); + memblock_x86_reserve_range(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); #ifdef CONFIG_BLK_DEV_INITRD /* Reserve INITRD */ @@ -110,7 +110,7 @@ void __init x86_64_start_reservations(char *real_mode_data) unsigned long ramdisk_image = boot_params.hdr.ramdisk_image; unsigned long ramdisk_size = boot_params.hdr.ramdisk_size; unsigned long ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); - reserve_early(ramdisk_image, ramdisk_end, "RAMDISK"); + memblock_x86_reserve_range(ramdisk_image, ramdisk_end, "RAMDISK"); } #endif diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index bbe0aaf7749..a4f01733e87 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -302,7 +302,7 @@ static inline void init_gbpages(void) static void __init reserve_brk(void) { if (_brk_end > _brk_start) - reserve_early(__pa(_brk_start), 
__pa(_brk_end), "BRK"); + memblock_x86_reserve_range(__pa(_brk_start), __pa(_brk_end), "BRK"); /* Mark brk area as locked down and no longer taking any new allocations */ @@ -324,17 +324,16 @@ static void __init relocate_initrd(void) char *p, *q; /* We need to move the initrd down into lowmem */ - ramdisk_here = find_e820_area(0, end_of_lowmem, area_size, + ramdisk_here = memblock_find_in_range(0, end_of_lowmem, area_size, PAGE_SIZE); - if (ramdisk_here == -1ULL) + if (ramdisk_here == MEMBLOCK_ERROR) panic("Cannot find place for new RAMDISK of size %lld\n", ramdisk_size); /* Note: this includes all the lowmem currently occupied by the initrd, we rely on that fact to keep the data intact. */ - reserve_early(ramdisk_here, ramdisk_here + area_size, - "NEW RAMDISK"); + memblock_x86_reserve_range(ramdisk_here, ramdisk_here + area_size, "NEW RAMDISK"); initrd_start = ramdisk_here + PAGE_OFFSET; initrd_end = initrd_start + ramdisk_size; printk(KERN_INFO "Allocated new RAMDISK: %08llx - %08llx\n", @@ -390,7 +389,7 @@ static void __init reserve_initrd(void) initrd_start = 0; if (ramdisk_size >= (end_of_lowmem>>1)) { - free_early(ramdisk_image, ramdisk_end); + memblock_x86_free_range(ramdisk_image, ramdisk_end); printk(KERN_ERR "initrd too large to handle, " "disabling initrd\n"); return; @@ -413,7 +412,7 @@ static void __init reserve_initrd(void) relocate_initrd(); - free_early(ramdisk_image, ramdisk_end); + memblock_x86_free_range(ramdisk_image, ramdisk_end); } #else static void __init reserve_initrd(void) @@ -469,7 +468,7 @@ static void __init e820_reserve_setup_data(void) e820_print_map("reserve setup_data"); } -static void __init reserve_early_setup_data(void) +static void __init memblock_x86_reserve_range_setup_data(void) { struct setup_data *data; u64 pa_data; @@ -481,7 +480,7 @@ static void __init reserve_early_setup_data(void) while (pa_data) { data = early_memremap(pa_data, sizeof(*data)); sprintf(buf, "setup data %x", data->type); - reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf); + memblock_x86_reserve_range(pa_data, pa_data+sizeof(*data)+data->len, buf); pa_data = data->next; early_iounmap(data, sizeof(*data)); } @@ -519,23 +518,23 @@ static void __init reserve_crashkernel(void) if (crash_base <= 0) { const unsigned long long alignment = 16<<20; /* 16M */ - crash_base = find_e820_area(alignment, ULONG_MAX, crash_size, + crash_base = memblock_find_in_range(alignment, ULONG_MAX, crash_size, alignment); - if (crash_base == -1ULL) { + if (crash_base == MEMBLOCK_ERROR) { pr_info("crashkernel reservation failed - No suitable area found.\n"); return; } } else { unsigned long long start; - start = find_e820_area(crash_base, ULONG_MAX, crash_size, + start = memblock_find_in_range(crash_base, ULONG_MAX, crash_size, 1<<20); if (start != crash_base) { pr_info("crashkernel reservation failed - memory is in use.\n"); return; } } - reserve_early(crash_base, crash_base + crash_size, "CRASH KERNEL"); + memblock_x86_reserve_range(crash_base, crash_base + crash_size, "CRASH KERNEL"); printk(KERN_INFO "Reserving %ldMB of memory at %ldMB " "for crashkernel (System RAM: %ldMB)\n", @@ -786,7 +785,7 @@ void __init setup_arch(char **cmdline_p) #endif 4)) { efi_enabled = 1; - efi_reserve_early(); + efi_memblock_x86_reserve_range(); } #endif @@ -846,7 +845,7 @@ void __init setup_arch(char **cmdline_p) vmi_activate(); /* after early param, so could get panic from serial */ - reserve_early_setup_data(); + memblock_x86_reserve_range_setup_data(); if (acpi_mps_check()) { #ifdef CONFIG_X86_LOCAL_APIC diff 
--git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c index c652ef62742..7c2102c2aad 100644 --- a/arch/x86/kernel/trampoline.c +++ b/arch/x86/kernel/trampoline.c @@ -1,7 +1,7 @@ #include +#include #include -#include #if defined(CONFIG_X86_64) && defined(CONFIG_ACPI_SLEEP) #define __trampinit @@ -16,15 +16,15 @@ unsigned char *__trampinitdata trampoline_base; void __init reserve_trampoline_memory(void) { - unsigned long mem; + phys_addr_t mem; /* Has to be in very low memory so we can execute real-mode AP code. */ - mem = find_e820_area(0, 1<<20, TRAMPOLINE_SIZE, PAGE_SIZE); - if (mem == -1L) + mem = memblock_find_in_range(0, 1<<20, TRAMPOLINE_SIZE, PAGE_SIZE); + if (mem == MEMBLOCK_ERROR) panic("Cannot allocate trampoline\n"); trampoline_base = __va(mem); - reserve_early(mem, mem + TRAMPOLINE_SIZE, "TRAMPOLINE"); + memblock_x86_reserve_range(mem, mem + TRAMPOLINE_SIZE, "TRAMPOLINE"); } /* diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index b278535b14a..c0e28a13de7 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -2,6 +2,7 @@ #include #include #include +#include #include #include @@ -33,6 +34,7 @@ static void __init find_early_table_space(unsigned long end, int use_pse, int use_gbpages) { unsigned long puds, pmds, ptes, tables, start; + phys_addr_t base; puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; tables = roundup(puds * sizeof(pud_t), PAGE_SIZE); @@ -75,12 +77,12 @@ static void __init find_early_table_space(unsigned long end, int use_pse, #else start = 0x8000; #endif - e820_table_start = find_e820_area(start, max_pfn_mapped<>= PAGE_SHIFT; + e820_table_start = base >> PAGE_SHIFT; e820_table_end = e820_table_start; e820_table_top = e820_table_start + (tables >> PAGE_SHIFT); @@ -299,7 +301,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, __flush_tlb_all(); if (!after_bootmem && e820_table_end > e820_table_start) - reserve_early(e820_table_start << PAGE_SHIFT, + memblock_x86_reserve_range(e820_table_start << PAGE_SHIFT, e820_table_end << PAGE_SHIFT, "PGTABLE"); if (!after_bootmem) diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 90e054589aa..63b09bae250 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -712,14 +713,14 @@ void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, highstart_pfn = highend_pfn = max_pfn; if (max_pfn > max_low_pfn) highstart_pfn = max_low_pfn; - e820_register_active_regions(0, 0, highend_pfn); + memblock_x86_register_active_regions(0, 0, highend_pfn); sparse_memory_present_with_active_regions(0); printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", pages_to_mb(highend_pfn - highstart_pfn)); num_physpages = highend_pfn; high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; #else - e820_register_active_regions(0, 0, max_low_pfn); + memblock_x86_register_active_regions(0, 0, max_low_pfn); sparse_memory_present_with_active_regions(0); num_physpages = max_low_pfn; high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; @@ -776,16 +777,16 @@ void __init setup_bootmem_allocator(void) { #ifndef CONFIG_NO_BOOTMEM int nodeid; - unsigned long bootmap_size, bootmap; + phys_addr_t bootmap_size, bootmap; /* * Initialize the boot-time allocator (with low memory only): */ bootmap_size = bootmem_bootmap_pages(max_low_pfn)< #include #include +#include #include #include #include @@ -577,18 +578,18 @@ void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, unsigned long 
bootmap_size, bootmap; bootmap_size = bootmem_bootmap_pages(end_pfn)<> PAGE_SHIFT, 0, end_pfn); - e820_register_active_regions(0, start_pfn, end_pfn); + memblock_x86_register_active_regions(0, start_pfn, end_pfn); free_bootmem_with_active_regions(0, end_pfn); #else - e820_register_active_regions(0, start_pfn, end_pfn); + memblock_x86_register_active_regions(0, start_pfn, end_pfn); #endif } #endif diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c index 970ed579d4e..966de9372e8 100644 --- a/arch/x86/mm/k8topology_64.c +++ b/arch/x86/mm/k8topology_64.c @@ -11,6 +11,8 @@ #include #include #include +#include + #include #include #include @@ -222,7 +224,7 @@ int __init k8_scan_nodes(void) for_each_node_mask(i, node_possible_map) { int j; - e820_register_active_regions(i, + memblock_x86_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, nodes[i].end >> PAGE_SHIFT); for (j = apicid_base; j < cores + apicid_base; j++) diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c index 18d244f7020..92faf3a1c53 100644 --- a/arch/x86/mm/memtest.c +++ b/arch/x86/mm/memtest.c @@ -6,8 +6,7 @@ #include #include #include - -#include +#include static u64 patterns[] __initdata = { 0, @@ -35,7 +34,7 @@ static void __init reserve_bad_mem(u64 pattern, u64 start_bad, u64 end_bad) (unsigned long long) pattern, (unsigned long long) start_bad, (unsigned long long) end_bad); - reserve_early(start_bad, end_bad, "BAD RAM"); + memblock_x86_reserve_range(start_bad, end_bad, "BAD RAM"); } static void __init memtest(u64 pattern, u64 start_phys, u64 size) @@ -74,7 +73,7 @@ static void __init do_one_pass(u64 pattern, u64 start, u64 end) u64 size = 0; while (start < end) { - start = find_e820_area_size(start, &size, 1); + start = memblock_x86_find_in_range_size(start, &size, 1); /* done ? */ if (start >= end) diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 809baaaf48b..ddf9730b206 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -24,6 +24,7 @@ #include #include +#include #include #include #include @@ -120,7 +121,7 @@ int __init get_memcfg_numa_flat(void) node_start_pfn[0] = 0; node_end_pfn[0] = max_pfn; - e820_register_active_regions(0, 0, max_pfn); + memblock_x86_register_active_regions(0, 0, max_pfn); memory_present(0, 0, max_pfn); node_remap_size[0] = node_memmap_size_bytes(0, 0, max_pfn); @@ -161,14 +162,14 @@ static void __init allocate_pgdat(int nid) NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid]; else { unsigned long pgdat_phys; - pgdat_phys = find_e820_area(min_low_pfn<>PAGE_SHIFT)); memset(buf, 0, sizeof(buf)); sprintf(buf, "NODE_DATA %d", nid); - reserve_early(pgdat_phys, pgdat_phys + sizeof(pg_data_t), buf); + memblock_x86_reserve_range(pgdat_phys, pgdat_phys + sizeof(pg_data_t), buf); } printk(KERN_DEBUG "allocate_pgdat: node %d NODE_DATA %08lx\n", nid, (unsigned long)NODE_DATA(nid)); @@ -291,15 +292,15 @@ static __init unsigned long calculate_numa_remap_pages(void) PTRS_PER_PTE); node_kva_target <<= PAGE_SHIFT; do { - node_kva_final = find_e820_area(node_kva_target, + node_kva_final = memblock_find_in_range(node_kva_target, ((u64)node_end_pfn[nid])<>PAGE_SHIFT) > (node_start_pfn[nid])); - if (node_kva_final == -1ULL) + if (node_kva_final == MEMBLOCK_ERROR) panic("Can not get kva ram\n"); node_remap_size[nid] = size; @@ -318,9 +319,9 @@ static __init unsigned long calculate_numa_remap_pages(void) * but we could have some hole in high memory, and it will only * check page_is_ram(pfn) && !page_is_reserved_early(pfn) to decide * to use it as free. 
- * So reserve_early here, hope we don't run out of that array + * So memblock_x86_reserve_range here, hope we don't run out of that array */ - reserve_early(node_kva_final, + memblock_x86_reserve_range(node_kva_final, node_kva_final+(((u64)size)<> PAGE_SHIFT; kva_target_pfn -= PTRS_PER_PTE; - } while (kva_start_pfn == -1UL && kva_target_pfn > min_low_pfn); + } while (kva_start_pfn == MEMBLOCK_ERROR && kva_target_pfn > min_low_pfn); - if (kva_start_pfn == -1UL) + if (kva_start_pfn == MEMBLOCK_ERROR) panic("Can not get kva space\n"); printk(KERN_INFO "kva_start_pfn ~ %lx max_low_pfn ~ %lx\n", @@ -382,7 +383,7 @@ void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, printk(KERN_INFO "max_pfn = %lx\n", max_pfn); /* avoid clash with initrd */ - reserve_early(kva_start_pfn< physnodes[i].end) { end = physnodes[i].end; @@ -467,7 +467,7 @@ static int __init split_nodes_interleave(u64 addr, u64 max_addr, * this one must extend to the boundary. */ if (end < dma32_end && dma32_end - end - - e820_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) + memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) end = dma32_end; /* @@ -476,7 +476,7 @@ static int __init split_nodes_interleave(u64 addr, u64 max_addr, * physical node. */ if (physnodes[i].end - end - - e820_hole_size(end, physnodes[i].end) < size) + memblock_x86_hole_size(end, physnodes[i].end) < size) end = physnodes[i].end; /* @@ -504,7 +504,7 @@ static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size) { u64 end = start + size; - while (end - start - e820_hole_size(start, end) < size) { + while (end - start - memblock_x86_hole_size(start, end) < size) { end += FAKE_NODE_MIN_SIZE; if (end > max_addr) { end = max_addr; @@ -533,7 +533,7 @@ static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size) * creates a uniform distribution of node sizes across the entire * machine (but not necessarily over physical nodes). */ - min_size = (max_addr - addr - e820_hole_size(addr, max_addr)) / + min_size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) / MAX_NUMNODES; min_size = max(min_size, FAKE_NODE_MIN_SIZE); if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size) @@ -566,7 +566,7 @@ static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size) * this one must extend to the boundary. */ if (end < dma32_end && dma32_end - end - - e820_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) + memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) end = dma32_end; /* @@ -575,7 +575,7 @@ static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size) * physical node. 
*/ if (physnodes[i].end - end - - e820_hole_size(end, physnodes[i].end) < size) + memblock_x86_hole_size(end, physnodes[i].end) < size) end = physnodes[i].end; /* @@ -639,7 +639,7 @@ static int __init numa_emulation(unsigned long start_pfn, */ remove_all_active_ranges(); for_each_node_mask(i, node_possible_map) { - e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, + memblock_x86_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, nodes[i].end >> PAGE_SHIFT); setup_node_bootmem(i, nodes[i].start, nodes[i].end); } @@ -692,7 +692,7 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn, node_set(0, node_possible_map); for (i = 0; i < nr_cpu_ids; i++) numa_set_node(i, 0); - e820_register_active_regions(0, start_pfn, last_pfn); + memblock_x86_register_active_regions(0, start_pfn, last_pfn); setup_node_bootmem(0, start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT); } diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c index 9324f13492d..a17dffd136c 100644 --- a/arch/x86/mm/srat_32.c +++ b/arch/x86/mm/srat_32.c @@ -25,6 +25,7 @@ */ #include #include +#include #include #include #include @@ -264,7 +265,7 @@ int __init get_memcfg_from_srat(void) if (node_read_chunk(chunk->nid, chunk)) continue; - e820_register_active_regions(chunk->nid, chunk->start_pfn, + memblock_x86_register_active_regions(chunk->nid, chunk->start_pfn, min(chunk->end_pfn, max_pfn)); } /* for out of order entries in SRAT */ diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index f9897f7a9ef..7f44eb62a5e 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -98,15 +99,15 @@ void __init acpi_numa_slit_init(struct acpi_table_slit *slit) unsigned long phys; length = slit->header.length; - phys = find_e820_area(0, max_pfn_mapped< x2APIC mapping */ @@ -324,7 +325,7 @@ static int __init nodes_cover_memory(const struct bootnode *nodes) pxmram = 0; } - e820ram = max_pfn - (e820_hole_size(0, max_pfn<>PAGE_SHIFT); + e820ram = max_pfn - (memblock_x86_hole_size(0, max_pfn<>PAGE_SHIFT); /* We seem to lose 3 pages somewhere. Allow 1M of slack. 
*/ if ((long)(e820ram - pxmram) >= (1<<(20 - PAGE_SHIFT))) { printk(KERN_ERR @@ -421,7 +422,7 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end) } for_each_node_mask(i, nodes_parsed) - e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, + memblock_x86_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, nodes[i].end >> PAGE_SHIFT); /* for out of order entries in SRAT */ sort_node_map(); diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 914f04695ce..b511f198691 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #include @@ -1735,7 +1736,7 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, __xen_write_cr3(true, __pa(pgd)); xen_mc_issue(PARAVIRT_LAZY_CPU); - reserve_early(__pa(xen_start_info->pt_base), + memblock_x86_reserve_range(__pa(xen_start_info->pt_base), __pa(xen_start_info->pt_base + xen_start_info->nr_pt_frames * PAGE_SIZE), "XEN PAGETABLES"); @@ -1773,7 +1774,7 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir))); - reserve_early(__pa(xen_start_info->pt_base), + memblock_x86_reserve_range(__pa(xen_start_info->pt_base), __pa(xen_start_info->pt_base + xen_start_info->nr_pt_frames * PAGE_SIZE), "XEN PAGETABLES"); diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index ad0047f47cd..2ac8f29f89c 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -61,7 +62,7 @@ char * __init xen_memory_setup(void) * - xen_start_info * See comment above "struct start_info" in */ - reserve_early(__pa(xen_start_info->mfn_list), + memblock_x86_reserve_range(__pa(xen_start_info->mfn_list), __pa(xen_start_info->pt_base), "XEN START INFO"); -- cgit v1.2.3-18-g5258 From a587d2daebcd2bc159d4348b6a7b028950a6d803 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 25 Aug 2010 13:39:18 -0700 Subject: x86: Remove not used early_res code and some functions in e820.c that are not used anymore Signed-off-by: Yinghai Lu Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/e820.h | 14 ------------ arch/x86/kernel/e820.c | 52 --------------------------------------------- 2 files changed, 66 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h index 388fed29146..718646384e0 100644 --- a/arch/x86/include/asm/e820.h +++ b/arch/x86/include/asm/e820.h @@ -112,31 +112,17 @@ static inline void early_memtest(unsigned long start, unsigned long end) } #endif -extern unsigned long end_user_pfn; - -extern u64 find_e820_area(u64 start, u64 end, u64 size, u64 align); -extern u64 find_e820_area_size(u64 start, u64 *sizep, u64 align); -extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align); - extern unsigned long e820_end_of_ram_pfn(void); extern unsigned long e820_end_of_low_ram_pfn(void); -extern void e820_register_active_regions(int nid, unsigned long start_pfn, - unsigned long end_pfn); -extern u64 e820_hole_size(u64 start, u64 end); - extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align); void memblock_x86_fill(void); - extern void finish_e820_parsing(void); extern void e820_reserve_resources(void); extern void e820_reserve_resources_late(void); extern void setup_memory_map(void); extern char *default_machine_specific_memory_setup(void); -void reserve_early(u64 start, u64 end, char *name); -void free_early(u64 start, u64 end); - /* * Returns true iff the specified range [s,e) is completely contained inside * the ISA region. diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index a9221d18a5e..d5fd89462d7 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -738,32 +738,6 @@ static int __init e820_mark_nvs_memory(void) core_initcall(e820_mark_nvs_memory); #endif -/* - * Find a free area with specified alignment in a specific range. - */ -u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align) -{ - u64 mem = memblock_find_in_range(start, end, size, align); - - if (mem == MEMBLOCK_ERROR) - return -1ULL; - - return mem; -} - -/* - * Find next free range after *start - */ -u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align) -{ - u64 mem = memblock_x86_find_in_range_size(start, sizep, align); - - if (mem == MEMBLOCK_ERROR) - return -1ULL - - return mem; -} - /* * pre allocated 4k and reserved it in memblock and e820_saved */ @@ -856,32 +830,6 @@ unsigned long __init e820_end_of_low_ram_pfn(void) return e820_end_pfn(1UL<<(32 - PAGE_SHIFT), E820_RAM); } -/* Walk the e820 map and register active regions within a node */ -void __init e820_register_active_regions(int nid, unsigned long start_pfn, - unsigned long last_pfn) -{ - memblock_x86_register_active_regions(nid, start_pfn, last_pfn); -} - -/* - * Find the hole size (in bytes) in the memory range. 
- * @start: starting address of the memory range to scan - * @end: ending address of the memory range to scan - */ -u64 __init e820_hole_size(u64 start, u64 end) -{ - return memblock_x86_hole_size(start, end); -} - -void reserve_early(u64 start, u64 end, char *name) -{ - memblock_x86_reserve_range(start, end, name); -} -void free_early(u64 start, u64 end) -{ - memblock_x86_free_range(start, end); -} - static void early_panic(char *msg) { early_printk(msg); -- cgit v1.2.3-18-g5258 From 6f2a75369e7561e800d86927ecd83c970996b21f Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 25 Aug 2010 13:39:18 -0700 Subject: x86, memblock: Use memblock_memory_size()/memblock_free_memory_size() to get correct dma_reserve memblock_memory_size() will return memory size in memblock.memory.region. memblock_free_memory_size() will return free memory size in memblock.memory.region. So We can get exact reseved size in specified range. Set the size right after initmem_init(), because later bootmem API will get area above 16M. (except some fallback). Later after we remove the bootmem, We could call that just before paging_init(). Signed-off-by: Yinghai Lu Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/e820.h | 2 ++ arch/x86/kernel/e820.c | 16 ++++++++++++++++ arch/x86/kernel/setup.c | 1 + arch/x86/mm/init_64.c | 7 ------- 4 files changed, 19 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h index 718646384e0..5be1542fbfa 100644 --- a/arch/x86/include/asm/e820.h +++ b/arch/x86/include/asm/e820.h @@ -117,6 +117,8 @@ extern unsigned long e820_end_of_low_ram_pfn(void); extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align); void memblock_x86_fill(void); +void memblock_find_dma_reserve(void); + extern void finish_e820_parsing(void); extern void e820_reserve_resources(void); extern void e820_reserve_resources_late(void); diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index d5fd89462d7..0c2b7ef7a34 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1105,3 +1105,19 @@ void __init memblock_x86_fill(void) memblock_analyze(); memblock_dump_all(); } + +void __init memblock_find_dma_reserve(void) +{ +#ifdef CONFIG_X86_64 + u64 free_size_pfn; + u64 mem_size_pfn; + /* + * need to find out used area below MAX_DMA_PFN + * need to use memblock to get free size in [0, MAX_DMA_PFN] + * at first, and assume boot_mem will not take below MAX_DMA_PFN + */ + mem_size_pfn = memblock_x86_memory_in_range(0, MAX_DMA_PFN << PAGE_SHIFT) >> PAGE_SHIFT; + free_size_pfn = memblock_x86_free_memory_in_range(0, MAX_DMA_PFN << PAGE_SHIFT) >> PAGE_SHIFT; + set_dma_reserve(mem_size_pfn - free_size_pfn); +#endif +} diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index a4f01733e87..924c8f78e98 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1013,6 +1013,7 @@ void __init setup_arch(char **cmdline_p) #endif initmem_init(0, max_pfn, acpi, k8); + memblock_find_dma_reserve(); #ifndef CONFIG_NO_BOOTMEM memblock_x86_to_bootmem(0, max_low_pfn< #include -static unsigned long dma_reserve __initdata; - static int __init parse_direct_gbpages_off(char *arg) { direct_gbpages = 0; @@ -821,11 +819,6 @@ int __init reserve_bootmem_generic(unsigned long phys, unsigned long len, reserve_bootmem(phys, len, flags); - if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) { - dma_reserve += len / PAGE_SIZE; - set_dma_reserve(dma_reserve); - } - return 0; } #endif -- cgit v1.2.3-18-g5258 From 774ea0bcb27f57b6fd521b3b6c43237782fed4b9 
Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 25 Aug 2010 13:39:18 -0700 Subject: x86: Remove old bootmem code Requested by Ingo, Thomas and HPA. The old bootmem code is no longer necessary, and the transition is complete. Remove it. Signed-off-by: Yinghai Lu Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 10 +-------- arch/x86/kernel/setup.c | 4 ---- arch/x86/mm/init_32.c | 56 ------------------------------------------------- arch/x86/mm/init_64.c | 41 ------------------------------------ arch/x86/mm/memblock.c | 29 ------------------------- arch/x86/mm/numa_32.c | 3 --- arch/x86/mm/numa_64.c | 47 ----------------------------------------- 7 files changed, 1 insertion(+), 189 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 542bb2610cb..ce07615f1cd 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -585,15 +585,7 @@ config PARAVIRT_DEBUG a paravirt_op is missing when it is called. config NO_BOOTMEM - default y - bool "Disable Bootmem code" - ---help--- - Use memblock directly instead of bootmem before slab is ready. - - allocator (buddy) [generic] - - early allocator (bootmem) [generic] - - very early allocator (memblock) [some generic] - - very very early allocator (early brk model) [x86] - So reduce one layer between early allocator to final allocator + def_bool y config MEMTEST bool "Memtest" diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 924c8f78e98..1d114ff6a07 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1014,10 +1014,6 @@ void __init setup_arch(char **cmdline_p) initmem_init(0, max_pfn, acpi, k8); memblock_find_dma_reserve(); -#ifndef CONFIG_NO_BOOTMEM - memblock_x86_to_bootmem(0, max_low_pfn<> PAGE_SHIFT, - start_pfn, end_pfn); - printk(KERN_INFO " node %d low ram: %08lx - %08lx\n", - nodeid, start_pfn< max_low_pfn) - continue; - if (end_pfn > max_low_pfn) - end_pfn = max_low_pfn; -#else - start_pfn = 0; - end_pfn = max_low_pfn; -#endif - bootmap = setup_node_bootmem(nodeid, start_pfn, end_pfn, - bootmap); - } -#endif - after_bootmem = 1; } diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index d6d408467c4..690b8d13971 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -572,23 +572,7 @@ kernel_physical_mapping_init(unsigned long start, void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, int acpi, int k8) { -#ifndef CONFIG_NO_BOOTMEM - unsigned long bootmap_size, bootmap; - - bootmap_size = bootmem_bootmap_pages(end_pfn)<> PAGE_SHIFT, - 0, end_pfn); memblock_x86_register_active_regions(0, start_pfn, end_pfn); - free_bootmem_with_active_regions(0, end_pfn); -#else - memblock_x86_register_active_regions(0, start_pfn, end_pfn); -#endif } #endif @@ -798,31 +782,6 @@ void mark_rodata_ro(void) #endif -#ifndef CONFIG_NO_BOOTMEM -int __init reserve_bootmem_generic(unsigned long phys, unsigned long len, - int flags) -{ - unsigned long pfn = phys >> PAGE_SHIFT; - - if (pfn >= max_pfn) { - /* - * This can happen with kdump kernels when accessing - * firmware tables: - */ - if (pfn < max_pfn_mapped) - return -EFAULT; - - printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %lu\n", - phys, len); - return -EFAULT; - } - - reserve_bootmem(phys, len, flags); - - return 0; -} -#endif - int kern_addr_valid(unsigned long addr) { unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT; diff --git a/arch/x86/mm/memblock.c b/arch/x86/mm/memblock.c index aaff3932588..50ecbc59757 100644 --- a/arch/x86/mm/memblock.c +++ b/arch/x86/mm/memblock.c @@ 
-109,7 +109,6 @@ static __init struct range *find_range_array(int count) return range; } -#ifdef CONFIG_NO_BOOTMEM static void __init memblock_x86_subtract_reserved(struct range *range, int az) { u64 final_start, final_end; @@ -182,34 +181,6 @@ int __init get_free_all_memory_range(struct range **rangep, int nodeid) *rangep = range; return nr_range; } -#else -void __init memblock_x86_to_bootmem(u64 start, u64 end) -{ - int count; - u64 final_start, final_end; - struct memblock_region *r; - - /* Take out region array itself */ - memblock_free_reserved_regions(); - - count = memblock.reserved.cnt; - memblock_dbg("(%d early reservations) ==> bootmem [%#010llx-%#010llx]\n", count, start, end - 1); - for_each_memblock(reserved, r) { - memblock_dbg(" [%#010llx-%#010llx] ", (u64)r->base, (u64)r->base + r->size - 1); - final_start = max(start, r->base); - final_end = min(end, r->base + r->size); - if (final_start >= final_end) { - memblock_dbg("\n"); - continue; - } - memblock_dbg(" ==> [%#010llx-%#010llx]\n", final_start, final_end - 1); - reserve_bootmem_generic(final_start, final_end - final_start, BOOTMEM_DEFAULT); - } - - /* Put region array back ? */ - memblock_reserve_reserved_regions(); -} -#endif static u64 __init __memblock_x86_memory_in_range(u64 addr, u64 limit, bool get_free) { diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index ddf9730b206..70ddeb75ba2 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -420,9 +420,6 @@ void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, for_each_online_node(nid) { memset(NODE_DATA(nid), 0, sizeof(struct pglist_data)); NODE_DATA(nid)->node_id = nid; -#ifndef CONFIG_NO_BOOTMEM - NODE_DATA(nid)->bdata = &bootmem_node_data[nid]; -#endif } setup_bootmem_allocator(); diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 984b1ff7db4..aef0ff74f7d 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -199,10 +199,6 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) unsigned long start_pfn, last_pfn, nodedata_phys; const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE); int nid; -#ifndef CONFIG_NO_BOOTMEM - unsigned long bootmap_start, bootmap_pages, bootmap_size; - void *bootmap; -#endif if (!end) return; @@ -239,47 +235,6 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) NODE_DATA(nodeid)->node_start_pfn = start_pfn; NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn; -#ifndef CONFIG_NO_BOOTMEM - NODE_DATA(nodeid)->bdata = &bootmem_node_data[nodeid]; - - /* - * Find a place for the bootmem map - * nodedata_phys could be on other nodes by alloc_bootmem, - * so need to sure bootmap_start not to be small, otherwise - * early_node_mem will get that with memblock_find_in_range instead - * of alloc_bootmem, that could clash with reserved range - */ - bootmap_pages = bootmem_bootmap_pages(last_pfn - start_pfn); - bootmap_start = roundup(nodedata_phys + pgdat_size, PAGE_SIZE); - /* - * SMP_CACHE_BYTES could be enough, but init_bootmem_node like - * to use that to align to PAGE_SIZE - */ - bootmap = early_node_mem(nodeid, bootmap_start, end, - bootmap_pages<> PAGE_SHIFT, - start_pfn, last_pfn); - - printk(KERN_INFO " bootmap [%016lx - %016lx] pages %lx\n", - bootmap_start, bootmap_start + bootmap_size - 1, - bootmap_pages); - nid = phys_to_nid(bootmap_start); - if (nid != nodeid) - printk(KERN_INFO " bootmap(%d) on node %d\n", nodeid, nid); - - free_bootmem_with_active_regions(nodeid, end); -#endif - node_set_online(nodeid); } @@ 
-704,9 +659,7 @@ unsigned long __init numa_free_all_bootmem(void) for_each_online_node(i) pages += free_all_bootmem_node(NODE_DATA(i)); -#ifdef CONFIG_NO_BOOTMEM pages += free_all_memory_core_early(MAX_NUMNODES); -#endif return pages; } -- cgit v1.2.3-18-g5258 From 6f44d0337cc54a46e83b4c8a6195607e78fff71d Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 27 Aug 2010 14:19:33 -0400 Subject: x86, doc: Adding comments about .iommu_table and its neighbors. Updating the linker section with comments about .iommu_table and some other ones that I know of. CC: Sam Ravnborg CC: H. Peter Anvin CC: Fujita Tomonori CC: Thomas Gleixner CC: Ingo Molnar Signed-off-by: Konrad Rzeszutek Wilk LKML-Reference: <1282933173-19960-1-git-send-email-konrad.wilk@oracle.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/vmlinux.lds.S | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index b92e040466c..3f07c370f76 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -242,6 +242,12 @@ SECTIONS __x86_cpu_dev_end = .; } + /* + * start address and size of operations which during runtime + * can be patched with virtualization friendly instructions or + * baremetal native ones. Think page table operations. + * Details in paravirt_types.h + */ . = ALIGN(8); .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { __parainstructions = .; @@ -249,6 +255,11 @@ SECTIONS __parainstructions_end = .; } + /* + * struct alt_inst entries. From the header (alternative.h): + * "Alternative instructions for different CPU types or capabilities" + * Think locking instructions on spinlocks. + */ . = ALIGN(8); .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { __alt_instructions = .; @@ -256,10 +267,21 @@ SECTIONS __alt_instructions_end = .; } + /* + * And here are the replacement instructions. The linker sticks + * them as binary blobs. The .altinstructions has enough data to + * get the address and the length of them to patch the kernel safely. + */ .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { *(.altinstr_replacement) } + /* + * struct iommu_table_entry entries are injected in this section. + * It is an array of IOMMUs which during run time gets sorted depending + * on its dependency order. After rootfs_initcall is complete + * this section can be safely removed. + */ .iommu_table : AT(ADDR(.iommu_table) - LOAD_OFFSET) { __iommu_table = .; *(.iommu_table) -- cgit v1.2.3-18-g5258 From 9fbaf49c7f717740002d49eee1bbd03d89d8766a Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sat, 28 Aug 2010 17:41:03 +0200 Subject: x86, kmemcheck: Remove double test The opcodes 0x2e and 0x3e are tested for in the first Group 2 line as well. The sematic match that finds this problem is as follows: (http://coccinelle.lip6.fr/) // @expression@ expression E; @@ ( * E || ... || E | * E && ... 
&& E ) // Signed-off-by: Julia Lawall Reviewed-by: Pekka Enberg Cc: Vegard Nossum LKML-Reference: <1283010066-20935-5-git-send-email-julia@diku.dk> Signed-off-by: Ingo Molnar --- arch/x86/mm/kmemcheck/opcode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/kmemcheck/opcode.c b/arch/x86/mm/kmemcheck/opcode.c index 63c19e27aa6..324aa3f0723 100644 --- a/arch/x86/mm/kmemcheck/opcode.c +++ b/arch/x86/mm/kmemcheck/opcode.c @@ -9,7 +9,7 @@ static bool opcode_is_prefix(uint8_t b) b == 0xf0 || b == 0xf2 || b == 0xf3 /* Group 2 */ || b == 0x2e || b == 0x36 || b == 0x3e || b == 0x26 - || b == 0x64 || b == 0x65 || b == 0x2e || b == 0x3e + || b == 0x64 || b == 0x65 /* Group 3 */ || b == 0x66 /* Group 4 */ -- cgit v1.2.3-18-g5258 From 7ac41ccf47d82569d26f34beab1dec92cc3b6347 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Mon, 30 Aug 2010 14:10:02 -0400 Subject: x86, iommu: Fix IOMMU_INIT alignment rules This boot crash was observed: DMA-API: preallocated 32768 debug entries DMA-API: debugging enabled by kernel config BUG: unable to handle kernel paging request at 19da8955 IP: [] 0xf4ffffff *pde = 00000000 The crux of the failure was that even if we did not use any of the .iommu_table section, the linker would still insert it in the vmlinux file. This patch fixes that and also fixes the runtime crash where we would try to access the array. Reported-by: Ingo Molnar Signed-off-by: Konrad Rzeszutek Wilk Cc: Joerg Roedel Cc: FUJITA Tomonori LKML-Reference: <1283191802-25086-1-git-send-email-konrad.wilk@oracle.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux.lds.S | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 3f07c370f76..38e2b67807e 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -285,10 +285,9 @@ SECTIONS .iommu_table : AT(ADDR(.iommu_table) - LOAD_OFFSET) { __iommu_table = .; *(.iommu_table) - . = ALIGN(8); __iommu_table_end = .; } - + . = ALIGN(8); /* * .exit.text is discard at runtime, not link time, to deal with * references from .altinstructions and .eh_frame -- cgit v1.2.3-18-g5258 From c9cf4a019cff198ee5638323e3b0ee18886467e8 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Wed, 25 Aug 2010 22:23:34 +0400 Subject: perf, x86, Pentium4: Add RAW events verification Implements verification of - Bits of ESCR EventMask field (meaningful bits in field are hardware predefined and others bits should be set to zero) - INSTR_COMPLETED event (it is available on predefined cpu model only) - Thread shared events (they should be guarded by "perf_event_paranoid" sysctl due to security reason). The side effect of this action is that PERF_COUNT_HW_BUS_CYCLES become a "paranoid" general event. 
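As a minimal, self-contained sketch of the allow-mask idea described above -- a RAW config is rejected if it sets any bit outside the architecturally meaningful masks -- with made-up field layout and mask values standing in for the real asm/perf_event_p4.h definitions that appear in the diff below:

/*
 * Illustrative only: user-space sketch of rejecting RAW configs that set
 * bits outside an allow-mask.  The layout and mask values are made up;
 * the real ones live in arch/x86/include/asm/perf_event_p4.h.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SKETCH_ESCR_ALLOWED   0x01fffffeULL   /* assumed ESCR bits a user may set */
#define SKETCH_CCCR_ALLOWED   0x03fff000ULL   /* assumed CCCR bits a user may set */
#define SKETCH_CONFIG_ALLOWED ((SKETCH_CCCR_ALLOWED << 32) | SKETCH_ESCR_ALLOWED)

static bool sketch_raw_config_ok(uint64_t config)
{
	/* any bit outside the allow-mask means the RAW event is rejected */
	return (config & ~SKETCH_CONFIG_ALLOWED) == 0;
}

int main(void)
{
	uint64_t good = (0x00001000ULL << 32) | 0x000000feULL;
	uint64_t bad  = good | (1ULL << 63);	/* reserved/kernel-internal bit */

	printf("good: %s\n", sketch_raw_config_ok(good) ? "accepted" : "rejected");
	printf("bad:  %s\n", sketch_raw_config_ok(bad) ? "accepted" : "rejected");
	return 0;
}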
Signed-off-by: Cyrill Gorcunov Tested-by: Lin Ming Cc: Frederic Weisbecker Cc: Peter Zijlstra LKML-Reference: <20100825182334.GB14874@lenovo> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/perf_event_p4.h | 52 +++---- arch/x86/kernel/cpu/perf_event_p4.c | 292 ++++++++++++++++++++++++++++++++--- 2 files changed, 290 insertions(+), 54 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h index def500776b1..a70cd216be5 100644 --- a/arch/x86/include/asm/perf_event_p4.h +++ b/arch/x86/include/asm/perf_event_p4.h @@ -36,19 +36,6 @@ #define P4_ESCR_EMASK(v) ((v) << P4_ESCR_EVENTMASK_SHIFT) #define P4_ESCR_TAG(v) ((v) << P4_ESCR_TAG_SHIFT) -/* Non HT mask */ -#define P4_ESCR_MASK \ - (P4_ESCR_EVENT_MASK | \ - P4_ESCR_EVENTMASK_MASK | \ - P4_ESCR_TAG_MASK | \ - P4_ESCR_TAG_ENABLE | \ - P4_ESCR_T0_OS | \ - P4_ESCR_T0_USR) - -/* HT mask */ -#define P4_ESCR_MASK_HT \ - (P4_ESCR_MASK | P4_ESCR_T1_OS | P4_ESCR_T1_USR) - #define P4_CCCR_OVF 0x80000000U #define P4_CCCR_CASCADE 0x40000000U #define P4_CCCR_OVF_PMI_T0 0x04000000U @@ -70,23 +57,6 @@ #define P4_CCCR_THRESHOLD(v) ((v) << P4_CCCR_THRESHOLD_SHIFT) #define P4_CCCR_ESEL(v) ((v) << P4_CCCR_ESCR_SELECT_SHIFT) -/* Non HT mask */ -#define P4_CCCR_MASK \ - (P4_CCCR_OVF | \ - P4_CCCR_CASCADE | \ - P4_CCCR_OVF_PMI_T0 | \ - P4_CCCR_FORCE_OVF | \ - P4_CCCR_EDGE | \ - P4_CCCR_THRESHOLD_MASK | \ - P4_CCCR_COMPLEMENT | \ - P4_CCCR_COMPARE | \ - P4_CCCR_ESCR_SELECT_MASK | \ - P4_CCCR_ENABLE) - -/* HT mask */ -#define P4_CCCR_MASK_HT \ - (P4_CCCR_MASK | P4_CCCR_OVF_PMI_T1 | P4_CCCR_THREAD_ANY) - #define P4_GEN_ESCR_EMASK(class, name, bit) \ class##__##name = ((1 << bit) << P4_ESCR_EVENTMASK_SHIFT) #define P4_ESCR_EMASK_BIT(class, name) class##__##name @@ -127,6 +97,28 @@ #define P4_CONFIG_HT_SHIFT 63 #define P4_CONFIG_HT (1ULL << P4_CONFIG_HT_SHIFT) +/* + * The bits we allow to pass for RAW events + */ +#define P4_CONFIG_MASK_ESCR \ + P4_ESCR_EVENT_MASK | \ + P4_ESCR_EVENTMASK_MASK | \ + P4_ESCR_TAG_MASK | \ + P4_ESCR_TAG_ENABLE + +#define P4_CONFIG_MASK_CCCR \ + P4_CCCR_EDGE | \ + P4_CCCR_THRESHOLD_MASK | \ + P4_CCCR_COMPLEMENT | \ + P4_CCCR_COMPARE | \ + P4_CCCR_THREAD_ANY | \ + P4_CCCR_RESERVED + +/* some dangerous bits are reserved for kernel internals */ +#define P4_CONFIG_MASK \ + (p4_config_pack_escr(P4_CONFIG_MASK_ESCR)) | \ + (p4_config_pack_cccr(P4_CONFIG_MASK_CCCR)) + static inline bool p4_is_event_cascaded(u64 config) { u32 cccr = p4_config_unpack_cccr(config); diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index 7e578e9cc58..4fe62c7165c 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -18,6 +18,8 @@ struct p4_event_bind { unsigned int opcode; /* Event code and ESCR selector */ unsigned int escr_msr[2]; /* ESCR MSR for this event */ + unsigned int escr_emask; /* valid ESCR EventMask bits */ + unsigned int shared; /* event is shared across threads */ char cntr[2][P4_CNTR_LIMIT]; /* counter index (offset), -1 on abscence */ }; @@ -66,231 +68,435 @@ static struct p4_event_bind p4_event_bind_map[] = { [P4_EVENT_TC_DELIVER_MODE] = { .opcode = P4_OPCODE(P4_EVENT_TC_DELIVER_MODE), .escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DD) | + P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DB) | + P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DI) | + P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BD) | + P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BB) 
| + P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BI) | + P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, ID), + .shared = 1, .cntr = { {4, 5, -1}, {6, 7, -1} }, }, [P4_EVENT_BPU_FETCH_REQUEST] = { .opcode = P4_OPCODE(P4_EVENT_BPU_FETCH_REQUEST), .escr_msr = { MSR_P4_BPU_ESCR0, MSR_P4_BPU_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_BPU_FETCH_REQUEST, TCMISS), .cntr = { {0, -1, -1}, {2, -1, -1} }, }, [P4_EVENT_ITLB_REFERENCE] = { .opcode = P4_OPCODE(P4_EVENT_ITLB_REFERENCE), .escr_msr = { MSR_P4_ITLB_ESCR0, MSR_P4_ITLB_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, HIT) | + P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, MISS) | + P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, HIT_UK), .cntr = { {0, -1, -1}, {2, -1, -1} }, }, [P4_EVENT_MEMORY_CANCEL] = { .opcode = P4_OPCODE(P4_EVENT_MEMORY_CANCEL), .escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_CANCEL, ST_RB_FULL) | + P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_CANCEL, 64K_CONF), .cntr = { {8, 9, -1}, {10, 11, -1} }, }, [P4_EVENT_MEMORY_COMPLETE] = { .opcode = P4_OPCODE(P4_EVENT_MEMORY_COMPLETE), .escr_msr = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_COMPLETE, LSC) | + P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_COMPLETE, SSC), .cntr = { {8, 9, -1}, {10, 11, -1} }, }, [P4_EVENT_LOAD_PORT_REPLAY] = { .opcode = P4_OPCODE(P4_EVENT_LOAD_PORT_REPLAY), .escr_msr = { MSR_P4_SAAT_ESCR0, MSR_P4_SAAT_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_LOAD_PORT_REPLAY, SPLIT_LD), .cntr = { {8, 9, -1}, {10, 11, -1} }, }, [P4_EVENT_STORE_PORT_REPLAY] = { .opcode = P4_OPCODE(P4_EVENT_STORE_PORT_REPLAY), .escr_msr = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_STORE_PORT_REPLAY, SPLIT_ST), .cntr = { {8, 9, -1}, {10, 11, -1} }, }, [P4_EVENT_MOB_LOAD_REPLAY] = { .opcode = P4_OPCODE(P4_EVENT_MOB_LOAD_REPLAY), .escr_msr = { MSR_P4_MOB_ESCR0, MSR_P4_MOB_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, NO_STA) | + P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, NO_STD) | + P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, PARTIAL_DATA) | + P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, UNALGN_ADDR), .cntr = { {0, -1, -1}, {2, -1, -1} }, }, [P4_EVENT_PAGE_WALK_TYPE] = { .opcode = P4_OPCODE(P4_EVENT_PAGE_WALK_TYPE), .escr_msr = { MSR_P4_PMH_ESCR0, MSR_P4_PMH_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_PAGE_WALK_TYPE, DTMISS) | + P4_ESCR_EMASK_BIT(P4_EVENT_PAGE_WALK_TYPE, ITMISS), + .shared = 1, .cntr = { {0, -1, -1}, {2, -1, -1} }, }, [P4_EVENT_BSQ_CACHE_REFERENCE] = { .opcode = P4_OPCODE(P4_EVENT_BSQ_CACHE_REFERENCE), .escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITS) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITM) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITS) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITM) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_MISS) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_MISS) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, WR_2ndL_MISS), .cntr = { {0, -1, -1}, {2, -1, -1} }, }, [P4_EVENT_IOQ_ALLOCATION] = { .opcode = P4_OPCODE(P4_EVENT_IOQ_ALLOCATION), .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, 
DEFAULT) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, ALL_READ) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, ALL_WRITE) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_UC) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WC) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WT) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WP) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WB) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, OWN) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, OTHER) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, PREFETCH), .cntr = { {0, -1, -1}, {2, -1, -1} }, }, [P4_EVENT_IOQ_ACTIVE_ENTRIES] = { /* shared ESCR */ .opcode = P4_OPCODE(P4_EVENT_IOQ_ACTIVE_ENTRIES), .escr_msr = { MSR_P4_FSB_ESCR1, MSR_P4_FSB_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, DEFAULT) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_READ) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_WRITE) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_UC) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WC) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WT) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WP) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WB) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, OWN) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, OTHER) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, PREFETCH), .cntr = { {2, -1, -1}, {3, -1, -1} }, }, [P4_EVENT_FSB_DATA_ACTIVITY] = { .opcode = P4_OPCODE(P4_EVENT_FSB_DATA_ACTIVITY), .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_DRV) | + P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OWN) | + P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OTHER) | + P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_DRV) | + P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OWN) | + P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OTHER), + .shared = 1, .cntr = { {0, -1, -1}, {2, -1, -1} }, }, [P4_EVENT_BSQ_ALLOCATION] = { /* shared ESCR, broken CCCR1 */ .opcode = P4_OPCODE(P4_EVENT_BSQ_ALLOCATION), .escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR0 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE0) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE1) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LEN0) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LEN1) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_IO_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LOCK_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_CACHE_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_SPLIT_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_DEM_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_ORD_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE0) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE1) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE2), .cntr = { {0, -1, -1}, {1, -1, -1} }, }, [P4_EVENT_BSQ_ACTIVE_ENTRIES] = { /* shared ESCR */ .opcode = P4_OPCODE(P4_EVENT_BSQ_ACTIVE_ENTRIES), .escr_msr = { MSR_P4_BSU_ESCR1 , MSR_P4_BSU_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE0) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE1) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN0) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN1) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_IO_TYPE) | + 
P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LOCK_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_CACHE_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_SPLIT_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_DEM_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_ORD_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE0) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE1) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE2), .cntr = { {2, -1, -1}, {3, -1, -1} }, }, [P4_EVENT_SSE_INPUT_ASSIST] = { .opcode = P4_OPCODE(P4_EVENT_SSE_INPUT_ASSIST), .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_SSE_INPUT_ASSIST, ALL), + .shared = 1, .cntr = { {8, 9, -1}, {10, 11, -1} }, }, [P4_EVENT_PACKED_SP_UOP] = { .opcode = P4_OPCODE(P4_EVENT_PACKED_SP_UOP), .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_PACKED_SP_UOP, ALL), + .shared = 1, .cntr = { {8, 9, -1}, {10, 11, -1} }, }, [P4_EVENT_PACKED_DP_UOP] = { .opcode = P4_OPCODE(P4_EVENT_PACKED_DP_UOP), .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_PACKED_DP_UOP, ALL), + .shared = 1, .cntr = { {8, 9, -1}, {10, 11, -1} }, }, [P4_EVENT_SCALAR_SP_UOP] = { .opcode = P4_OPCODE(P4_EVENT_SCALAR_SP_UOP), .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_SCALAR_SP_UOP, ALL), + .shared = 1, .cntr = { {8, 9, -1}, {10, 11, -1} }, }, [P4_EVENT_SCALAR_DP_UOP] = { .opcode = P4_OPCODE(P4_EVENT_SCALAR_DP_UOP), .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_SCALAR_DP_UOP, ALL), + .shared = 1, .cntr = { {8, 9, -1}, {10, 11, -1} }, }, [P4_EVENT_64BIT_MMX_UOP] = { .opcode = P4_OPCODE(P4_EVENT_64BIT_MMX_UOP), .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_64BIT_MMX_UOP, ALL), + .shared = 1, .cntr = { {8, 9, -1}, {10, 11, -1} }, }, [P4_EVENT_128BIT_MMX_UOP] = { .opcode = P4_OPCODE(P4_EVENT_128BIT_MMX_UOP), .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_128BIT_MMX_UOP, ALL), + .shared = 1, .cntr = { {8, 9, -1}, {10, 11, -1} }, }, [P4_EVENT_X87_FP_UOP] = { .opcode = P4_OPCODE(P4_EVENT_X87_FP_UOP), .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_X87_FP_UOP, ALL), + .shared = 1, .cntr = { {8, 9, -1}, {10, 11, -1} }, }, [P4_EVENT_TC_MISC] = { .opcode = P4_OPCODE(P4_EVENT_TC_MISC), .escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_TC_MISC, FLUSH), .cntr = { {4, 5, -1}, {6, 7, -1} }, }, [P4_EVENT_GLOBAL_POWER_EVENTS] = { .opcode = P4_OPCODE(P4_EVENT_GLOBAL_POWER_EVENTS), .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING), .cntr = { {0, -1, -1}, {2, -1, -1} }, }, [P4_EVENT_TC_MS_XFER] = { .opcode = P4_OPCODE(P4_EVENT_TC_MS_XFER), .escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_TC_MS_XFER, CISC), .cntr = { {4, 5, -1}, {6, 7, -1} }, }, [P4_EVENT_UOP_QUEUE_WRITES] = { .opcode = P4_OPCODE(P4_EVENT_UOP_QUEUE_WRITES), .escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_BUILD) | + P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_DELIVER) | + 
P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_ROM), .cntr = { {4, 5, -1}, {6, 7, -1} }, }, [P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE] = { .opcode = P4_OPCODE(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE), .escr_msr = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR0 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CONDITIONAL) | + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CALL) | + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, RETURN) | + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, INDIRECT), .cntr = { {4, 5, -1}, {6, 7, -1} }, }, [P4_EVENT_RETIRED_BRANCH_TYPE] = { .opcode = P4_OPCODE(P4_EVENT_RETIRED_BRANCH_TYPE), .escr_msr = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CONDITIONAL) | + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CALL) | + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, RETURN) | + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, INDIRECT), .cntr = { {4, 5, -1}, {6, 7, -1} }, }, [P4_EVENT_RESOURCE_STALL] = { .opcode = P4_OPCODE(P4_EVENT_RESOURCE_STALL), .escr_msr = { MSR_P4_ALF_ESCR0, MSR_P4_ALF_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_RESOURCE_STALL, SBFULL), .cntr = { {12, 13, 16}, {14, 15, 17} }, }, [P4_EVENT_WC_BUFFER] = { .opcode = P4_OPCODE(P4_EVENT_WC_BUFFER), .escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_WC_BUFFER, WCB_EVICTS) | + P4_ESCR_EMASK_BIT(P4_EVENT_WC_BUFFER, WCB_FULL_EVICTS), + .shared = 1, .cntr = { {8, 9, -1}, {10, 11, -1} }, }, [P4_EVENT_B2B_CYCLES] = { .opcode = P4_OPCODE(P4_EVENT_B2B_CYCLES), .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, + .escr_emask = 0, .cntr = { {0, -1, -1}, {2, -1, -1} }, }, [P4_EVENT_BNR] = { .opcode = P4_OPCODE(P4_EVENT_BNR), .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, + .escr_emask = 0, .cntr = { {0, -1, -1}, {2, -1, -1} }, }, [P4_EVENT_SNOOP] = { .opcode = P4_OPCODE(P4_EVENT_SNOOP), .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, + .escr_emask = 0, .cntr = { {0, -1, -1}, {2, -1, -1} }, }, [P4_EVENT_RESPONSE] = { .opcode = P4_OPCODE(P4_EVENT_RESPONSE), .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, + .escr_emask = 0, .cntr = { {0, -1, -1}, {2, -1, -1} }, }, [P4_EVENT_FRONT_END_EVENT] = { .opcode = P4_OPCODE(P4_EVENT_FRONT_END_EVENT), .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_FRONT_END_EVENT, NBOGUS) | + P4_ESCR_EMASK_BIT(P4_EVENT_FRONT_END_EVENT, BOGUS), .cntr = { {12, 13, 16}, {14, 15, 17} }, }, [P4_EVENT_EXECUTION_EVENT] = { .opcode = P4_OPCODE(P4_EVENT_EXECUTION_EVENT), .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3), .cntr = { {12, 13, 16}, {14, 15, 17} }, }, [P4_EVENT_REPLAY_EVENT] = { .opcode = P4_OPCODE(P4_EVENT_REPLAY_EVENT), .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_REPLAY_EVENT, NBOGUS) | + P4_ESCR_EMASK_BIT(P4_EVENT_REPLAY_EVENT, BOGUS), .cntr = { {12, 13, 16}, {14, 15, 17} }, }, [P4_EVENT_INSTR_RETIRED] = { .opcode = 
P4_OPCODE(P4_EVENT_INSTR_RETIRED), .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSNTAG) | + P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSTAG) | + P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSNTAG) | + P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSTAG), .cntr = { {12, 13, 16}, {14, 15, 17} }, }, [P4_EVENT_UOPS_RETIRED] = { .opcode = P4_OPCODE(P4_EVENT_UOPS_RETIRED), .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_UOPS_RETIRED, NBOGUS) | + P4_ESCR_EMASK_BIT(P4_EVENT_UOPS_RETIRED, BOGUS), .cntr = { {12, 13, 16}, {14, 15, 17} }, }, [P4_EVENT_UOP_TYPE] = { .opcode = P4_OPCODE(P4_EVENT_UOP_TYPE), .escr_msr = { MSR_P4_RAT_ESCR0, MSR_P4_RAT_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_UOP_TYPE, TAGLOADS) | + P4_ESCR_EMASK_BIT(P4_EVENT_UOP_TYPE, TAGSTORES), .cntr = { {12, 13, 16}, {14, 15, 17} }, }, [P4_EVENT_BRANCH_RETIRED] = { .opcode = P4_OPCODE(P4_EVENT_BRANCH_RETIRED), .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMNP) | + P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMNM) | + P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMTP) | + P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMTM), .cntr = { {12, 13, 16}, {14, 15, 17} }, }, [P4_EVENT_MISPRED_BRANCH_RETIRED] = { .opcode = P4_OPCODE(P4_EVENT_MISPRED_BRANCH_RETIRED), .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS), .cntr = { {12, 13, 16}, {14, 15, 17} }, }, [P4_EVENT_X87_ASSIST] = { .opcode = P4_OPCODE(P4_EVENT_X87_ASSIST), .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, FPSU) | + P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, FPSO) | + P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, POAO) | + P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, POAU) | + P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, PREA), .cntr = { {12, 13, 16}, {14, 15, 17} }, }, [P4_EVENT_MACHINE_CLEAR] = { .opcode = P4_OPCODE(P4_EVENT_MACHINE_CLEAR), .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, CLEAR) | + P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, MOCLEAR) | + P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, SMCLEAR), .cntr = { {12, 13, 16}, {14, 15, 17} }, }, [P4_EVENT_INSTR_COMPLETED] = { .opcode = P4_OPCODE(P4_EVENT_INSTR_COMPLETED), .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_COMPLETED, NBOGUS) | + P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_COMPLETED, BOGUS), .cntr = { {12, 13, 16}, {14, 15, 17} }, }, }; @@ -428,29 +634,73 @@ static u64 p4_pmu_event_map(int hw_event) return config; } +/* check cpu model specifics */ +static bool p4_event_match_cpu_model(unsigned int event_idx) +{ + /* INSTR_COMPLETED event only exist for model 3, 4, 6 (Prescott) */ + if (event_idx == P4_EVENT_INSTR_COMPLETED) { + if (boot_cpu_data.x86_model != 3 && + boot_cpu_data.x86_model != 4 && + boot_cpu_data.x86_model != 6) + return false; + } + + /* + * For info + * - IQ_ESCR0, IQ_ESCR1 only for models 1 and 2 + */ + + return true; +} + static int p4_validate_raw_event(struct perf_event *event) { - unsigned int v; + unsigned int v, emask; - /* user data may have out-of-bound event index */ + /* User data may have out-of-bound event index */ v = p4_config_unpack_event(event->attr.config); - if (v >= ARRAY_SIZE(p4_event_bind_map)) { - pr_warning("P4 PMU: Unknown event code: %d\n", v); + if (v >= 
ARRAY_SIZE(p4_event_bind_map)) + return -EINVAL; + + /* It may be unsupported: */ + if (!p4_event_match_cpu_model(v)) return -EINVAL; + + /* + * NOTE: P4_CCCR_THREAD_ANY has not the same meaning as + * in Architectural Performance Monitoring, it means not + * on _which_ logical cpu to count but rather _when_, ie it + * depends on logical cpu state -- count event if one cpu active, + * none, both or any, so we just allow user to pass any value + * desired. + * + * In turn we always set Tx_OS/Tx_USR bits bound to logical + * cpu without their propagation to another cpu + */ + + /* + * if an event is shared accross the logical threads + * the user needs special permissions to be able to use it + */ + if (p4_event_bind_map[v].shared) { + if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) + return -EACCES; } + /* ESCR EventMask bits may be invalid */ + emask = p4_config_unpack_escr(event->attr.config) & P4_ESCR_EVENTMASK_MASK; + if (emask & ~p4_event_bind_map[v].escr_emask) + return -EINVAL; + /* - * it may have some screwed PEBS bits + * it may have some invalid PEBS bits */ - if (p4_config_pebs_has(event->attr.config, P4_PEBS_CONFIG_ENABLE)) { - pr_warning("P4 PMU: PEBS are not supported yet\n"); + if (p4_config_pebs_has(event->attr.config, P4_PEBS_CONFIG_ENABLE)) return -EINVAL; - } + v = p4_config_unpack_metric(event->attr.config); - if (v >= ARRAY_SIZE(p4_pebs_bind_map)) { - pr_warning("P4 PMU: Unknown metric code: %d\n", v); + if (v >= ARRAY_SIZE(p4_pebs_bind_map)) return -EINVAL; - } return 0; } @@ -478,27 +728,21 @@ static int p4_hw_config(struct perf_event *event) if (event->attr.type == PERF_TYPE_RAW) { + /* + * Clear bits we reserve to be managed by kernel itself + * and never allowed from a user space + */ + event->attr.config &= P4_CONFIG_MASK; + rc = p4_validate_raw_event(event); if (rc) goto out; /* - * We don't control raw events so it's up to the caller - * to pass sane values (and we don't count the thread number - * on HT machine but allow HT-compatible specifics to be - * passed on) - * * Note that for RAW events we allow user to use P4_CCCR_RESERVED * bits since we keep additional info here (for cache events and etc) - * - * XXX: HT wide things should check perf_paranoid_cpu() && - * CAP_SYS_ADMIN */ - event->hw.config |= event->attr.config & - (p4_config_pack_escr(P4_ESCR_MASK_HT) | - p4_config_pack_cccr(P4_CCCR_MASK_HT | P4_CCCR_RESERVED)); - - event->hw.config &= ~P4_CCCR_FORCE_OVF; + event->hw.config |= event->attr.config; } rc = x86_setup_perfctr(event); -- cgit v1.2.3-18-g5258 From e6b04b6b5a3182ae36cf9a69f1aaaee432edc8ad Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 2 Sep 2010 13:52:45 +0100 Subject: x86-64: Fix unwind annotations in syscall stubs With the return address removed from the stack, these should really refer to their caller's register state. 
Signed-off-by: Jan Beulich Acked-by: Alexander van Heukelum LKML-Reference: <4C7FBA3D0200007800013F61@vpn.id2.novell.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 17be5ec7cbb..16aeff0c315 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -714,9 +714,8 @@ END(ptregscall_common) ENTRY(stub_execve) CFI_STARTPROC - popq %r11 - CFI_ADJUST_CFA_OFFSET -8 - CFI_REGISTER rip, r11 + addq $8, %rsp + PARTIAL_FRAME 0 SAVE_REST FIXUP_TOP_OF_STACK %r11 movq %rsp, %rcx @@ -735,7 +734,7 @@ END(stub_execve) ENTRY(stub_rt_sigreturn) CFI_STARTPROC addq $8, %rsp - CFI_ADJUST_CFA_OFFSET -8 + PARTIAL_FRAME 0 SAVE_REST movq %rsp,%rdi FIXUP_TOP_OF_STACK %r11 @@ -1445,7 +1444,6 @@ error_swapgs: error_sti: TRACE_IRQS_OFF ret - CFI_ENDPROC /* * There are two places in the kernel that can potentially fault with @@ -1470,6 +1468,7 @@ bstep_iret: /* Fix truncated RIP */ movq %rcx,RIP+8(%rsp) jmp error_swapgs + CFI_ENDPROC END(error_entry) -- cgit v1.2.3-18-g5258 From 1f130a783a796f147b080c594488b566c86007d0 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 2 Sep 2010 13:54:32 +0100 Subject: x86-64: Adjust frame type at paranoid_exit: As this isn't an exception or interrupt entry point, it doesn't have any of the hardware provide frame layouts active. Signed-off-by: Jan Beulich Acked-by: Alexander van Heukelum LKML-Reference: <4C7FBAA80200007800013F67@vpn.id2.novell.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 16aeff0c315..64dfe3045c1 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1367,7 +1367,7 @@ paranoidzeroentry machine_check *machine_check_vector(%rip) /* ebx: no swapgs flag */ ENTRY(paranoid_exit) - INTR_FRAME + DEFAULT_FRAME DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF testl %ebx,%ebx /* swapgs needed? */ -- cgit v1.2.3-18-g5258 From b1cccb1bb01dc1cb89f58723a58c3d4988d44d94 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 2 Sep 2010 13:55:11 +0100 Subject: x86-64: Use symbolics instead of raw numbers in entry_64.S ... making the code a little less fragile. Also use pushq_cfi instead of raw CFI annotations in two more places, and add two missing annotations after stack pointer adjustments which got modified here anyway. 
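For reference, ORIG_RAX and R15 in the hunks below are byte offsets into the saved pt_regs frame; a simplified sketch of that layout (not the authoritative definition, which lives in the ptrace and asm-offsets headers) shows why ORIG_RAX-R15 equals the old hard-coded 15*8:

	/* Simplified sketch of the x86-64 pt_regs save area. r15 is the
	 * lowest slot and orig_ax is the sixteenth, so ORIG_RAX - R15
	 * works out to 15 slots of 8 bytes, i.e. the old literal 15*8. */
	struct pt_regs_sketch {
		unsigned long r15;		/* offset   0: R15      */
		unsigned long r14, r13, r12;
		unsigned long bp, bx;
		unsigned long r11, r10, r9, r8;
		unsigned long ax, cx, dx, si, di;
		unsigned long orig_ax;		/* offset 120: ORIG_RAX */
		/* hardware/iret frame (ip, cs, flags, sp, ss) follows */
	};
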
Signed-off-by: Jan Beulich Acked-by: Alexander van Heukelum LKML-Reference: <4C7FBACF0200007800013F6A@vpn.id2.novell.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 64dfe3045c1..6f305830c80 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -795,8 +795,8 @@ END(interrupt) /* 0(%rsp): ~(interrupt number) */ .macro interrupt func - subq $10*8, %rsp - CFI_ADJUST_CFA_OFFSET 10*8 + subq $ORIG_RAX-ARGOFFSET+8, %rsp + CFI_ADJUST_CFA_OFFSET ORIG_RAX-ARGOFFSET+8 call save_args PARTIAL_FRAME 0 call \func @@ -1035,8 +1035,8 @@ ENTRY(\sym) INTR_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ - subq $15*8,%rsp - CFI_ADJUST_CFA_OFFSET 15*8 + subq $ORIG_RAX-R15, %rsp + CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 call error_entry DEFAULT_FRAME 0 movq %rsp,%rdi /* pt_regs pointer */ @@ -1051,9 +1051,9 @@ END(\sym) ENTRY(\sym) INTR_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME - pushq $-1 /* ORIG_RAX: no syscall to restart */ - CFI_ADJUST_CFA_OFFSET 8 - subq $15*8, %rsp + pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ + subq $ORIG_RAX-R15, %rsp + CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 call save_paranoid TRACE_IRQS_OFF movq %rsp,%rdi /* pt_regs pointer */ @@ -1069,9 +1069,9 @@ END(\sym) ENTRY(\sym) INTR_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME - pushq $-1 /* ORIG_RAX: no syscall to restart */ - CFI_ADJUST_CFA_OFFSET 8 - subq $15*8, %rsp + pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ + subq $ORIG_RAX-R15, %rsp + CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 call save_paranoid TRACE_IRQS_OFF movq %rsp,%rdi /* pt_regs pointer */ @@ -1088,8 +1088,8 @@ END(\sym) ENTRY(\sym) XCPT_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME - subq $15*8,%rsp - CFI_ADJUST_CFA_OFFSET 15*8 + subq $ORIG_RAX-R15, %rsp + CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 call error_entry DEFAULT_FRAME 0 movq %rsp,%rdi /* pt_regs pointer */ @@ -1106,8 +1106,8 @@ END(\sym) ENTRY(\sym) XCPT_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME - subq $15*8,%rsp - CFI_ADJUST_CFA_OFFSET 15*8 + subq $ORIG_RAX-R15, %rsp + CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 call save_paranoid DEFAULT_FRAME 0 TRACE_IRQS_OFF @@ -1497,8 +1497,8 @@ ENTRY(nmi) INTR_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME pushq_cfi $-1 - subq $15*8, %rsp - CFI_ADJUST_CFA_OFFSET 15*8 + subq $ORIG_RAX-R15, %rsp + CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 call save_paranoid DEFAULT_FRAME 0 /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ -- cgit v1.2.3-18-g5258 From a34107b5577968dc53cf9c2195c7c2d4a2caf9ce Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 2 Sep 2010 14:04:16 +0100 Subject: i386: Add unwind directives to syscall ptregs stubs When these stubs are actual functions (i.e. having a return instruction) and have stack manipulation instructions in them, they should also be annotated to allow unwinding through them. 
Signed-off-by: Jan Beulich Acked-by: Alexander van Heukelum LKML-Reference: <4C7FBCF00200007800013F99@vpn.id2.novell.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_32.S | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 227d00920d2..d9b950ee559 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -750,14 +750,18 @@ ptregs_##name: \ #define PTREGSCALL3(name) \ ALIGN; \ ptregs_##name: \ + CFI_STARTPROC; \ leal 4(%esp),%eax; \ - pushl %eax; \ + pushl_cfi %eax; \ movl PT_EDX(%eax),%ecx; \ movl PT_ECX(%eax),%edx; \ movl PT_EBX(%eax),%eax; \ call sys_##name; \ addl $4,%esp; \ - ret + CFI_ADJUST_CFA_OFFSET -4; \ + ret; \ + CFI_ENDPROC; \ +ENDPROC(ptregs_##name) PTREGSCALL1(iopl) PTREGSCALL0(fork) @@ -772,15 +776,19 @@ PTREGSCALL1(vm86old) /* Clone is an oddball. The 4th arg is in %edi */ ALIGN; ptregs_clone: + CFI_STARTPROC leal 4(%esp),%eax - pushl %eax - pushl PT_EDI(%eax) + pushl_cfi %eax + pushl_cfi PT_EDI(%eax) movl PT_EDX(%eax),%ecx movl PT_ECX(%eax),%edx movl PT_EBX(%eax),%eax call sys_clone addl $8,%esp + CFI_ADJUST_CFA_OFFSET -8 ret + CFI_ENDPROC +ENDPROC(ptregs_clone) .macro FIXUP_ESPFIX_STACK /* -- cgit v1.2.3-18-g5258 From df5d1874ce1a1f0e0eceff4fa3a9d45620243a68 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 2 Sep 2010 14:07:16 +0100 Subject: x86: Use {push,pop}{l,q}_cfi in more places ... plus additionally introduce {push,pop}f{l,q}_cfi. All in the hope that the code becomes better readable this way (it gets quite a bit smaller in any case). Signed-off-by: Jan Beulich Acked-by: Alexander van Heukelum LKML-Reference: <4C7FBDA40200007800013FAF@vpn.id2.novell.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/dwarf2.h | 20 +++ arch/x86/kernel/entry_32.S | 294 ++++++++++++++---------------------------- arch/x86/kernel/entry_64.S | 65 ++++------ 3 files changed, 141 insertions(+), 238 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/dwarf2.h b/arch/x86/include/asm/dwarf2.h index 733f7e91e7a..32609919931 100644 --- a/arch/x86/include/asm/dwarf2.h +++ b/arch/x86/include/asm/dwarf2.h @@ -89,6 +89,16 @@ CFI_ADJUST_CFA_OFFSET -8 .endm + .macro pushfq_cfi + pushfq + CFI_ADJUST_CFA_OFFSET 8 + .endm + + .macro popfq_cfi + popfq + CFI_ADJUST_CFA_OFFSET -8 + .endm + .macro movq_cfi reg offset=0 movq %\reg, \offset(%rsp) CFI_REL_OFFSET \reg, \offset @@ -109,6 +119,16 @@ CFI_ADJUST_CFA_OFFSET -4 .endm + .macro pushfl_cfi + pushfl + CFI_ADJUST_CFA_OFFSET 4 + .endm + + .macro popfl_cfi + popfl + CFI_ADJUST_CFA_OFFSET -4 + .endm + .macro movl_cfi reg offset=0 movl %\reg, \offset(%esp) CFI_REL_OFFSET \reg, \offset diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index d9b950ee559..9fb188d7bc7 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -115,8 +115,7 @@ /* unfortunately push/pop can't be no-op */ .macro PUSH_GS - pushl $0 - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $0 .endm .macro POP_GS pop=0 addl $(4 + \pop), %esp @@ -140,14 +139,12 @@ #else /* CONFIG_X86_32_LAZY_GS */ .macro PUSH_GS - pushl %gs - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %gs /*CFI_REL_OFFSET gs, 0*/ .endm .macro POP_GS pop=0 -98: popl %gs - CFI_ADJUST_CFA_OFFSET -4 +98: popl_cfi %gs /*CFI_RESTORE gs*/ .if \pop <> 0 add $\pop, %esp @@ -195,35 +192,25 @@ .macro SAVE_ALL cld PUSH_GS - pushl %fs - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %fs /*CFI_REL_OFFSET fs, 0;*/ - pushl %es - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %es 
/*CFI_REL_OFFSET es, 0;*/ - pushl %ds - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ds /*CFI_REL_OFFSET ds, 0;*/ - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %eax CFI_REL_OFFSET eax, 0 - pushl %ebp - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ebp CFI_REL_OFFSET ebp, 0 - pushl %edi - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %edi CFI_REL_OFFSET edi, 0 - pushl %esi - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %esi CFI_REL_OFFSET esi, 0 - pushl %edx - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %edx CFI_REL_OFFSET edx, 0 - pushl %ecx - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ecx CFI_REL_OFFSET ecx, 0 - pushl %ebx - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ebx CFI_REL_OFFSET ebx, 0 movl $(__USER_DS), %edx movl %edx, %ds @@ -234,39 +221,29 @@ .endm .macro RESTORE_INT_REGS - popl %ebx - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %ebx CFI_RESTORE ebx - popl %ecx - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %ecx CFI_RESTORE ecx - popl %edx - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %edx CFI_RESTORE edx - popl %esi - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %esi CFI_RESTORE esi - popl %edi - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %edi CFI_RESTORE edi - popl %ebp - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %ebp CFI_RESTORE ebp - popl %eax - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %eax CFI_RESTORE eax .endm .macro RESTORE_REGS pop=0 RESTORE_INT_REGS -1: popl %ds - CFI_ADJUST_CFA_OFFSET -4 +1: popl_cfi %ds /*CFI_RESTORE ds;*/ -2: popl %es - CFI_ADJUST_CFA_OFFSET -4 +2: popl_cfi %es /*CFI_RESTORE es;*/ -3: popl %fs - CFI_ADJUST_CFA_OFFSET -4 +3: popl_cfi %fs /*CFI_RESTORE fs;*/ POP_GS \pop .pushsection .fixup, "ax" @@ -320,16 +297,12 @@ ENTRY(ret_from_fork) CFI_STARTPROC - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %eax call schedule_tail GET_THREAD_INFO(%ebp) - popl %eax - CFI_ADJUST_CFA_OFFSET -4 - pushl $0x0202 # Reset kernel eflags - CFI_ADJUST_CFA_OFFSET 4 - popfl - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %eax + pushl_cfi $0x0202 # Reset kernel eflags + popfl_cfi jmp syscall_exit CFI_ENDPROC END(ret_from_fork) @@ -409,29 +382,23 @@ sysenter_past_esp: * enough kernel state to call TRACE_IRQS_OFF can be called - but * we immediately enable interrupts at that point anyway. */ - pushl $(__USER_DS) - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $(__USER_DS) /*CFI_REL_OFFSET ss, 0*/ - pushl %ebp - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ebp CFI_REL_OFFSET esp, 0 - pushfl + pushfl_cfi orl $X86_EFLAGS_IF, (%esp) - CFI_ADJUST_CFA_OFFSET 4 - pushl $(__USER_CS) - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $(__USER_CS) /*CFI_REL_OFFSET cs, 0*/ /* * Push current_thread_info()->sysenter_return to the stack. * A tiny bit of offset fixup is necessary - 4*4 means the 4 words * pushed above; +8 corresponds to copy_thread's esp0 setting. 
*/ - pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp) - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp) CFI_REL_OFFSET eip, 0 - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %eax SAVE_ALL ENABLE_INTERRUPTS(CLBR_NONE) @@ -486,8 +453,7 @@ sysenter_audit: movl %eax,%edx /* 2nd arg: syscall number */ movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */ call audit_syscall_entry - pushl %ebx - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ebx movl PT_EAX(%esp),%eax /* reload syscall number */ jmp sysenter_do_call @@ -529,8 +495,7 @@ ENDPROC(ia32_sysenter_target) # system call handler stub ENTRY(system_call) RING0_INT_FRAME # can't unwind into user space anyway - pushl %eax # save orig_eax - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %eax # save orig_eax SAVE_ALL GET_THREAD_INFO(%ebp) # system call tracing in operation / emulation @@ -566,7 +531,6 @@ restore_all_notrace: je ldt_ss # returning to user-space with LDT SS restore_nocheck: RESTORE_REGS 4 # skip orig_eax/error_code - CFI_ADJUST_CFA_OFFSET -4 irq_return: INTERRUPT_RETURN .section .fixup,"ax" @@ -619,10 +583,8 @@ ldt_ss: shr $16, %edx mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */ mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */ - pushl $__ESPFIX_SS - CFI_ADJUST_CFA_OFFSET 4 - push %eax /* new kernel esp */ - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $__ESPFIX_SS + pushl_cfi %eax /* new kernel esp */ /* Disable interrupts, but do not irqtrace this section: we * will soon execute iret and the tracer was already set to * the irqstate after the iret */ @@ -666,11 +628,9 @@ work_notifysig: # deal with pending signals and ALIGN work_notifysig_v86: - pushl %ecx # save ti_flags for do_notify_resume - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ecx # save ti_flags for do_notify_resume call save_v86_state # %eax contains pt_regs pointer - popl %ecx - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %ecx movl %eax, %esp #else movl %esp, %eax @@ -803,10 +763,8 @@ ENDPROC(ptregs_clone) mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */ shl $16, %eax addl %esp, %eax /* the adjusted stack pointer */ - pushl $__KERNEL_DS - CFI_ADJUST_CFA_OFFSET 4 - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $__KERNEL_DS + pushl_cfi %eax lss (%esp), %esp /* switch to the normal stack segment */ CFI_ADJUST_CFA_OFFSET -8 .endm @@ -843,8 +801,7 @@ vector=FIRST_EXTERNAL_VECTOR .if vector <> FIRST_EXTERNAL_VECTOR CFI_ADJUST_CFA_OFFSET -4 .endif -1: pushl $(~vector+0x80) /* Note: always in signed byte range */ - CFI_ADJUST_CFA_OFFSET 4 +1: pushl_cfi $(~vector+0x80) /* Note: always in signed byte range */ .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6 jmp 2f .endif @@ -884,8 +841,7 @@ ENDPROC(common_interrupt) #define BUILD_INTERRUPT3(name, nr, fn) \ ENTRY(name) \ RING0_INT_FRAME; \ - pushl $~(nr); \ - CFI_ADJUST_CFA_OFFSET 4; \ + pushl_cfi $~(nr); \ SAVE_ALL; \ TRACE_IRQS_OFF \ movl %esp,%eax; \ @@ -901,21 +857,18 @@ ENDPROC(name) ENTRY(coprocessor_error) RING0_INT_FRAME - pushl $0 - CFI_ADJUST_CFA_OFFSET 4 - pushl $do_coprocessor_error - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $0 + pushl_cfi $do_coprocessor_error jmp error_code CFI_ENDPROC END(coprocessor_error) ENTRY(simd_coprocessor_error) RING0_INT_FRAME - pushl $0 - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $0 #ifdef CONFIG_X86_INVD_BUG /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */ -661: pushl $do_general_protection +661: pushl_cfi $do_general_protection 662: .section .altinstructions,"a" .balign 4 @@ -930,19 +883,16 @@ ENTRY(simd_coprocessor_error) 664: .previous #else - pushl $do_simd_coprocessor_error + pushl_cfi 
$do_simd_coprocessor_error #endif - CFI_ADJUST_CFA_OFFSET 4 jmp error_code CFI_ENDPROC END(simd_coprocessor_error) ENTRY(device_not_available) RING0_INT_FRAME - pushl $-1 # mark this as an int - CFI_ADJUST_CFA_OFFSET 4 - pushl $do_device_not_available - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $-1 # mark this as an int + pushl_cfi $do_device_not_available jmp error_code CFI_ENDPROC END(device_not_available) @@ -964,82 +914,68 @@ END(native_irq_enable_sysexit) ENTRY(overflow) RING0_INT_FRAME - pushl $0 - CFI_ADJUST_CFA_OFFSET 4 - pushl $do_overflow - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $0 + pushl_cfi $do_overflow jmp error_code CFI_ENDPROC END(overflow) ENTRY(bounds) RING0_INT_FRAME - pushl $0 - CFI_ADJUST_CFA_OFFSET 4 - pushl $do_bounds - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $0 + pushl_cfi $do_bounds jmp error_code CFI_ENDPROC END(bounds) ENTRY(invalid_op) RING0_INT_FRAME - pushl $0 - CFI_ADJUST_CFA_OFFSET 4 - pushl $do_invalid_op - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $0 + pushl_cfi $do_invalid_op jmp error_code CFI_ENDPROC END(invalid_op) ENTRY(coprocessor_segment_overrun) RING0_INT_FRAME - pushl $0 - CFI_ADJUST_CFA_OFFSET 4 - pushl $do_coprocessor_segment_overrun - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $0 + pushl_cfi $do_coprocessor_segment_overrun jmp error_code CFI_ENDPROC END(coprocessor_segment_overrun) ENTRY(invalid_TSS) RING0_EC_FRAME - pushl $do_invalid_TSS - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $do_invalid_TSS jmp error_code CFI_ENDPROC END(invalid_TSS) ENTRY(segment_not_present) RING0_EC_FRAME - pushl $do_segment_not_present - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $do_segment_not_present jmp error_code CFI_ENDPROC END(segment_not_present) ENTRY(stack_segment) RING0_EC_FRAME - pushl $do_stack_segment - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $do_stack_segment jmp error_code CFI_ENDPROC END(stack_segment) ENTRY(alignment_check) RING0_EC_FRAME - pushl $do_alignment_check - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $do_alignment_check jmp error_code CFI_ENDPROC END(alignment_check) ENTRY(divide_error) RING0_INT_FRAME - pushl $0 # no error code - CFI_ADJUST_CFA_OFFSET 4 - pushl $do_divide_error - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $0 # no error code + pushl_cfi $do_divide_error jmp error_code CFI_ENDPROC END(divide_error) @@ -1047,10 +983,8 @@ END(divide_error) #ifdef CONFIG_X86_MCE ENTRY(machine_check) RING0_INT_FRAME - pushl $0 - CFI_ADJUST_CFA_OFFSET 4 - pushl machine_check_vector - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $0 + pushl_cfi machine_check_vector jmp error_code CFI_ENDPROC END(machine_check) @@ -1058,10 +992,8 @@ END(machine_check) ENTRY(spurious_interrupt_bug) RING0_INT_FRAME - pushl $0 - CFI_ADJUST_CFA_OFFSET 4 - pushl $do_spurious_interrupt_bug - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $0 + pushl_cfi $do_spurious_interrupt_bug jmp error_code CFI_ENDPROC END(spurious_interrupt_bug) @@ -1092,8 +1024,7 @@ ENTRY(xen_sysenter_target) ENTRY(xen_hypervisor_callback) CFI_STARTPROC - pushl $0 - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $0 SAVE_ALL TRACE_IRQS_OFF @@ -1129,23 +1060,20 @@ ENDPROC(xen_hypervisor_callback) # We distinguish between categories by maintaining a status value in EAX. 
ENTRY(xen_failsafe_callback) CFI_STARTPROC - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %eax movl $1,%eax 1: mov 4(%esp),%ds 2: mov 8(%esp),%es 3: mov 12(%esp),%fs 4: mov 16(%esp),%gs testl %eax,%eax - popl %eax - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %eax lea 16(%esp),%esp CFI_ADJUST_CFA_OFFSET -16 jz 5f addl $16,%esp jmp iret_exc # EAX != 0 => Category 2 (Bad IRET) -5: pushl $0 # EAX == 0 => Category 1 (Bad segment) - CFI_ADJUST_CFA_OFFSET 4 +5: pushl_cfi $0 # EAX == 0 => Category 1 (Bad segment) SAVE_ALL jmp ret_from_exception CFI_ENDPROC @@ -1295,40 +1223,29 @@ syscall_table_size=(.-sys_call_table) ENTRY(page_fault) RING0_EC_FRAME - pushl $do_page_fault - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $do_page_fault ALIGN error_code: /* the function address is in %gs's slot on the stack */ - pushl %fs - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %fs /*CFI_REL_OFFSET fs, 0*/ - pushl %es - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %es /*CFI_REL_OFFSET es, 0*/ - pushl %ds - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ds /*CFI_REL_OFFSET ds, 0*/ - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %eax CFI_REL_OFFSET eax, 0 - pushl %ebp - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ebp CFI_REL_OFFSET ebp, 0 - pushl %edi - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %edi CFI_REL_OFFSET edi, 0 - pushl %esi - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %esi CFI_REL_OFFSET esi, 0 - pushl %edx - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %edx CFI_REL_OFFSET edx, 0 - pushl %ecx - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ecx CFI_REL_OFFSET ecx, 0 - pushl %ebx - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ebx CFI_REL_OFFSET ebx, 0 cld movl $(__KERNEL_PERCPU), %ecx @@ -1370,12 +1287,9 @@ END(page_fault) movl TSS_sysenter_sp0 + \offset(%esp), %esp CFI_DEF_CFA esp, 0 CFI_UNDEFINED eip - pushfl - CFI_ADJUST_CFA_OFFSET 4 - pushl $__KERNEL_CS - CFI_ADJUST_CFA_OFFSET 4 - pushl $sysenter_past_esp - CFI_ADJUST_CFA_OFFSET 4 + pushfl_cfi + pushl_cfi $__KERNEL_CS + pushl_cfi $sysenter_past_esp CFI_REL_OFFSET eip, 0 .endm @@ -1385,8 +1299,7 @@ ENTRY(debug) jne debug_stack_correct FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn debug_stack_correct: - pushl $-1 # mark this as an int - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $-1 # mark this as an int SAVE_ALL TRACE_IRQS_OFF xorl %edx,%edx # error code 0 @@ -1406,32 +1319,27 @@ END(debug) */ ENTRY(nmi) RING0_INT_FRAME - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %eax movl %ss, %eax cmpw $__ESPFIX_SS, %ax - popl %eax - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %eax je nmi_espfix_stack cmpl $ia32_sysenter_target,(%esp) je nmi_stack_fixup - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %eax movl %esp,%eax /* Do not access memory above the end of our stack page, * it might not exist. 
*/ andl $(THREAD_SIZE-1),%eax cmpl $(THREAD_SIZE-20),%eax - popl %eax - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %eax jae nmi_stack_correct cmpl $ia32_sysenter_target,12(%esp) je nmi_debug_stack_check nmi_stack_correct: /* We have a RING0_INT_FRAME here */ - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %eax SAVE_ALL xorl %edx,%edx # zero error code movl %esp,%eax # pt_regs pointer @@ -1460,18 +1368,14 @@ nmi_espfix_stack: * * create the pointer to lss back */ - pushl %ss - CFI_ADJUST_CFA_OFFSET 4 - pushl %esp - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ss + pushl_cfi %esp addl $4, (%esp) /* copy the iret frame of 12 bytes */ .rept 3 - pushl 16(%esp) - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi 16(%esp) .endr - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %eax SAVE_ALL FIXUP_ESPFIX_STACK # %eax == %esp xorl %edx,%edx # zero error code @@ -1485,8 +1389,7 @@ END(nmi) ENTRY(int3) RING0_INT_FRAME - pushl $-1 # mark this as an int - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $-1 # mark this as an int SAVE_ALL TRACE_IRQS_OFF xorl %edx,%edx # zero error code @@ -1498,8 +1401,7 @@ END(int3) ENTRY(general_protection) RING0_EC_FRAME - pushl $do_general_protection - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $do_general_protection jmp error_code CFI_ENDPROC END(general_protection) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 6f305830c80..8851a2bb8c0 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -213,23 +213,17 @@ ENDPROC(native_usergs_sysret64) .macro FAKE_STACK_FRAME child_rip /* push in order ss, rsp, eflags, cs, rip */ xorl %eax, %eax - pushq $__KERNEL_DS /* ss */ - CFI_ADJUST_CFA_OFFSET 8 + pushq_cfi $__KERNEL_DS /* ss */ /*CFI_REL_OFFSET ss,0*/ - pushq %rax /* rsp */ - CFI_ADJUST_CFA_OFFSET 8 + pushq_cfi %rax /* rsp */ CFI_REL_OFFSET rsp,0 - pushq $X86_EFLAGS_IF /* eflags - interrupts on */ - CFI_ADJUST_CFA_OFFSET 8 + pushq_cfi $X86_EFLAGS_IF /* eflags - interrupts on */ /*CFI_REL_OFFSET rflags,0*/ - pushq $__KERNEL_CS /* cs */ - CFI_ADJUST_CFA_OFFSET 8 + pushq_cfi $__KERNEL_CS /* cs */ /*CFI_REL_OFFSET cs,0*/ - pushq \child_rip /* rip */ - CFI_ADJUST_CFA_OFFSET 8 + pushq_cfi \child_rip /* rip */ CFI_REL_OFFSET rip,0 - pushq %rax /* orig rax */ - CFI_ADJUST_CFA_OFFSET 8 + pushq_cfi %rax /* orig rax */ .endm .macro UNFAKE_STACK_FRAME @@ -398,10 +392,8 @@ ENTRY(ret_from_fork) LOCK ; btr $TIF_FORK,TI_flags(%r8) - push kernel_eflags(%rip) - CFI_ADJUST_CFA_OFFSET 8 - popf # reset kernel eflags - CFI_ADJUST_CFA_OFFSET -8 + pushq_cfi kernel_eflags(%rip) + popfq_cfi # reset kernel eflags call schedule_tail # rdi: 'prev' task parameter @@ -521,11 +513,9 @@ sysret_careful: jnc sysret_signal TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) - pushq %rdi - CFI_ADJUST_CFA_OFFSET 8 + pushq_cfi %rdi call schedule - popq %rdi - CFI_ADJUST_CFA_OFFSET -8 + popq_cfi %rdi jmp sysret_check /* Handle a signal */ @@ -634,11 +624,9 @@ int_careful: jnc int_very_careful TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) - pushq %rdi - CFI_ADJUST_CFA_OFFSET 8 + pushq_cfi %rdi call schedule - popq %rdi - CFI_ADJUST_CFA_OFFSET -8 + popq_cfi %rdi DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF jmp int_with_check @@ -652,12 +640,10 @@ int_check_syscall_exit_work: /* Check for syscall exit trace */ testl $_TIF_WORK_SYSCALL_EXIT,%edx jz int_signal - pushq %rdi - CFI_ADJUST_CFA_OFFSET 8 + pushq_cfi %rdi leaq 8(%rsp),%rdi # &ptregs -> arg1 call syscall_trace_leave - popq %rdi - CFI_ADJUST_CFA_OFFSET -8 + popq_cfi %rdi andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi jmp int_restore_rest @@ -765,8 +751,7 @@ 
vector=FIRST_EXTERNAL_VECTOR .if vector <> FIRST_EXTERNAL_VECTOR CFI_ADJUST_CFA_OFFSET -8 .endif -1: pushq $(~vector+0x80) /* Note: always in signed byte range */ - CFI_ADJUST_CFA_OFFSET 8 +1: pushq_cfi $(~vector+0x80) /* Note: always in signed byte range */ .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6 jmp 2f .endif @@ -821,6 +806,7 @@ ret_from_intr: TRACE_IRQS_OFF decl PER_CPU_VAR(irq_count) leaveq + CFI_RESTORE rbp CFI_DEF_CFA_REGISTER rsp CFI_ADJUST_CFA_OFFSET -8 exit_intr: @@ -902,11 +888,9 @@ retint_careful: jnc retint_signal TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) - pushq %rdi - CFI_ADJUST_CFA_OFFSET 8 + pushq_cfi %rdi call schedule - popq %rdi - CFI_ADJUST_CFA_OFFSET -8 + popq_cfi %rdi GET_THREAD_INFO(%rcx) DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF @@ -955,8 +939,7 @@ END(common_interrupt) .macro apicinterrupt num sym do_sym ENTRY(\sym) INTR_FRAME - pushq $~(\num) - CFI_ADJUST_CFA_OFFSET 8 + pushq_cfi $~(\num) interrupt \do_sym jmp ret_from_intr CFI_ENDPROC @@ -1138,16 +1121,14 @@ zeroentry simd_coprocessor_error do_simd_coprocessor_error /* edi: new selector */ ENTRY(native_load_gs_index) CFI_STARTPROC - pushf - CFI_ADJUST_CFA_OFFSET 8 + pushfq_cfi DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI) SWAPGS gs_change: movl %edi,%gs 2: mfence /* workaround */ SWAPGS - popf - CFI_ADJUST_CFA_OFFSET -8 + popfq_cfi ret CFI_ENDPROC END(native_load_gs_index) @@ -1214,8 +1195,7 @@ END(kernel_execve) /* Call softirq on interrupt stack. Interrupts are off. */ ENTRY(call_softirq) CFI_STARTPROC - push %rbp - CFI_ADJUST_CFA_OFFSET 8 + pushq_cfi %rbp CFI_REL_OFFSET rbp,0 mov %rsp,%rbp CFI_DEF_CFA_REGISTER rbp @@ -1224,6 +1204,7 @@ ENTRY(call_softirq) push %rbp # backlink for old unwinder call __do_softirq leaveq + CFI_RESTORE rbp CFI_DEF_CFA_REGISTER rsp CFI_ADJUST_CFA_OFFSET -8 decl PER_CPU_VAR(irq_count) -- cgit v1.2.3-18-g5258 From 7fe977dab356fbd7e86aa10bf83891761107c57c Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 2 Sep 2010 14:01:58 +0100 Subject: i386: Make kernel_execve() suitable for stack unwinding The explicit saving and restoring of %ebx was confusing stack unwind data consumers, and it is plain unnecessary to do this within the asm(), since that was only introduced for PIC user mode consumers of the original _syscall3() macro this was derived from. Signed-off-by: Jan Beulich Cc: Arnd Bergmann LKML-Reference: <4C7FBC660200007800013F95@vpn.id2.novell.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/sys_i386_32.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c index d5e06624e34..0b0cb5fede1 100644 --- a/arch/x86/kernel/sys_i386_32.c +++ b/arch/x86/kernel/sys_i386_32.c @@ -33,8 +33,8 @@ int kernel_execve(const char *filename, const char *const envp[]) { long __res; - asm volatile ("push %%ebx ; movl %2,%%ebx ; int $0x80 ; pop %%ebx" + asm volatile ("int $0x80" : "=a" (__res) - : "0" (__NR_execve), "ri" (filename), "c" (argv), "d" (envp) : "memory"); + : "0" (__NR_execve), "b" (filename), "c" (argv), "d" (envp) : "memory"); return __res; } -- cgit v1.2.3-18-g5258 From 1c5f50ee347daea013671f718b70cd6bf497bef9 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Fri, 3 Sep 2010 17:04:07 +0800 Subject: x86, mm: fix uninitialized addr in kernel_physical_mapping_init() This re-adds the lost chunk in commit 9b861528a80. 
Reported-by: Stephen Rothwell Signed-off-by: Wu Fengguang Cc: Peter Zijlstra Cc: Haicheng Li Cc: Andi Kleen LKML-Reference: <20100903090407.GA19771@localhost> Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 64e7bc2ded9..74f0f358b07 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -570,6 +570,7 @@ kernel_physical_mapping_init(unsigned long start, start = (unsigned long)__va(start); end = (unsigned long)__va(end); + addr = start; for (; start < end; start = next) { pgd_t *pgd = pgd_offset_k(start); -- cgit v1.2.3-18-g5258 From 57ab43e33122ffdc2eebca5d6de035699f0a8c06 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 3 Sep 2010 18:39:39 +0200 Subject: x86, GART: Remove superfluous AMD64_GARTEN There is a GARTEN so use that and drop the duplicate. Signed-off-by: Borislav Petkov Cc: Dave Airlie Cc: FUJITA Tomonori LKML-Reference: <1283531981-7495-2-git-send-email-bp@amd64.org> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/gart.h | 1 - arch/x86/kernel/aperture_64.c | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/gart.h b/arch/x86/include/asm/gart.h index 4ac5b0f33fc..fba0a72c4cc 100644 --- a/arch/x86/include/asm/gart.h +++ b/arch/x86/include/asm/gart.h @@ -27,7 +27,6 @@ extern int fix_aperture; #define AMD64_GARTAPERTUREBASE 0x94 #define AMD64_GARTTABLEBASE 0x98 #define AMD64_GARTCACHECTL 0x9c -#define AMD64_GARTEN (1<<0) #ifdef CONFIG_GART_IOMMU extern int gart_iommu_aperture; diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index a2e0caf26e1..6fabd406aa7 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -307,7 +307,7 @@ void __init early_gart_iommu_check(void) continue; ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL); - aper_enabled = ctl & AMD64_GARTEN; + aper_enabled = ctl & GARTEN; aper_order = (ctl >> 1) & 7; aper_size = (32 * 1024 * 1024) << aper_order; aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff; @@ -362,7 +362,7 @@ void __init early_gart_iommu_check(void) continue; ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL); - ctl &= ~AMD64_GARTEN; + ctl &= ~GARTEN; write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl); } } -- cgit v1.2.3-18-g5258 From 260133ab658bd2b80e07832a878e00405e19ff43 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 3 Sep 2010 18:39:40 +0200 Subject: x86, GART: Disable GART table walk probes Current code tramples over bit F3x90[6] which can be used to disable GART table walk probes. However, this bit should be set for performance reasons (speed up GART table walks). We are allowed to do that since we put GART tables in UC memory later anyway. Make it so. 
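In effect the aperture-control write changes from clobbering bit 6 to keeping it set; roughly (sketch of the values composed in this patch):

	/* Sketch of the F3x90 (AMD64_GARTAPERTURECTL) value before and
	 * after this change: the old write dropped DISTLBWALKPRB, the new
	 * one keeps table walk probes disabled, which is safe because the
	 * GART tables end up in UC memory anyway. */
	u32 old_ctl = aper_order << 1;				/* clobbers bit 6 */
	u32 new_ctl = DISTLBWALKPRB | (aper_order << 1);	/* bit 6 stays set */
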
Signed-off-by: Borislav Petkov Cc: Dave Airlie Cc: FUJITA Tomonori LKML-Reference: <1283531981-7495-3-git-send-email-bp@amd64.org> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/gart.h | 14 ++++++++++++++ arch/x86/kernel/aperture_64.c | 14 ++++++++------ arch/x86/kernel/pci-gart_64.c | 2 +- 3 files changed, 23 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/gart.h b/arch/x86/include/asm/gart.h index fba0a72c4cc..bf357f9b25f 100644 --- a/arch/x86/include/asm/gart.h +++ b/arch/x86/include/asm/gart.h @@ -17,6 +17,7 @@ extern int fix_aperture; #define GARTEN (1<<0) #define DISGARTCPU (1<<4) #define DISGARTIO (1<<5) +#define DISTLBWALKPRB (1<<6) /* GART cache control register bits. */ #define INVGART (1<<0) @@ -56,6 +57,19 @@ static inline void gart_iommu_hole_init(void) extern int agp_amd64_init(void); +static inline void gart_set_size_and_enable(struct pci_dev *dev, u32 order) +{ + u32 ctl; + + /* + * Don't enable translation but enable GART IO and CPU accesses. + * Also, set DISTLBWALKPRB since GART tables memory is UC. + */ + ctl = DISTLBWALKPRB | order << 1; + + pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl); +} + static inline void enable_gart_translation(struct pci_dev *dev, u64 addr) { u32 tmp, ctl; diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 6fabd406aa7..c9cb1736844 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -505,8 +505,13 @@ out: /* Fix up the north bridges */ for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { - int bus; - int dev_base, dev_limit; + int bus, dev_base, dev_limit; + + /* + * Don't enable translation yet but enable GART IO and CPU + * accesses and set DISTLBWALKPRB since GART table memory is UC. + */ + u32 ctl = DISTLBWALKPRB | aper_order << 1; bus = bus_dev_ranges[i].bus; dev_base = bus_dev_ranges[i].dev_base; @@ -515,10 +520,7 @@ out: if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00))) continue; - /* Don't enable translation yet. That is done later. - Assume this BIOS didn't initialise the GART so - just overwrite all previous bits */ - write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, aper_order << 1); + write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl); write_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE, aper_alloc >> 25); } } diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 0f7f130caa6..6015ee13e22 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -601,7 +601,7 @@ static void gart_fixup_northbridges(struct sys_device *dev) * Don't enable translations just yet. That is the next * step. Restore the pre-suspend aperture settings. */ - pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, aperture_order << 1); + gart_set_size_and_enable(dev, aperture_order); pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE, aperture_alloc >> 25); } } -- cgit v1.2.3-18-g5258 From d9fadd7ba99a67030783a212bcb17d11f0678433 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Thu, 2 Sep 2010 15:37:10 +0200 Subject: x86, AMD: Remove needless CPU family check (for L3 cache info) Old 32-bit AMD CPUs (all w/o L3 cache) should always return 0 for cpuid_edx(0x80000006). For unknown reason the 32-bit implementation differed from the 64-bit implementation. See commit 67cddd94799 ("i386: Add L3 cache support to AMD CPUID4 emulation"). The current check is the result of the x86 merge. 
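The 0xf000 mask tests the L3 associativity field, EDX[15:12], of CPUID leaf 0x80000006, so the resulting logic reads as follows (sketch; field meaning per the AMD CPUID documentation):

	/* CPUID 0x80000006, EDX[15:12] = L3 associativity; a non-zero value
	 * means an L3 cache is present, so expose a fourth cache leaf. */
	if (cpuid_edx(0x80000006) & 0xf000)
		num_cache_leaves = 4;	/* L1d, L1i, L2, L3 */
	else
		num_cache_leaves = 3;
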
Signed-off-by: Andreas Herrmann Cc: Andi Kleen LKML-Reference: <20100902133710.GA5449@loge.amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/amd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index fc563fabde6..0f0ace5d7db 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -540,7 +540,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) #endif if (c->extended_cpuid_level >= 0x80000006) { - if ((c->x86 >= 0x0f) && (cpuid_edx(0x80000006) & 0xf000)) + if (cpuid_edx(0x80000006) & 0xf000) num_cache_leaves = 4; else num_cache_leaves = 3; -- cgit v1.2.3-18-g5258 From 592091c0e21655bfbdf68741dd5a920c2ac2bbe6 Mon Sep 17 00:00:00 2001 From: Jin Dongming Date: Tue, 31 Aug 2010 09:13:33 +0900 Subject: therm_throt.c: Trivial printk message fix for a unsuitable abbreviation of 'thermal' In unexpected_thermal_interrupt(), "LVT TMR interrupt" is used in error message. I don't think TMR is a suitable abbreviation for thermal. 1.TMR has been used in IA32 Architectures Software Developer's Manual, and is the abbreviation for Trigger Mode Register. 2.There is not an standard abbreviation "TMR" defined for thermal in IA32 Architectures Software Developer's Manual. 3.Though we could understand it as Thermal Monitor Register, it is easy to be misunderstood as a *TIMER* interrupt also. I think this patch will fix it. Signed-off-by: Jin Dongming Reviewed-by: Jean Delvare Cc: Brown Len Cc: Hidetoshi Seto Cc: Fenghua Yu LKML-Reference: <4C7C492D.5020704@np.css.fujitsu.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/therm_throt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index c2a8b26d4fe..1d0f743c9d6 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -348,7 +348,7 @@ static void intel_thermal_interrupt(void) static void unexpected_thermal_interrupt(void) { - printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n", + printk(KERN_ERR "CPU%d: Unexpected LVT thermal interrupt!\n", smp_processor_id()); add_taint(TAINT_MACHINE_CHECK); } -- cgit v1.2.3-18-g5258 From fe8e0c25cad28e8858ecfa5863333c70685a6811 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Mon, 6 Sep 2010 20:53:42 +0200 Subject: x86, 32-bit: Align percpu area and irq stacks to THREAD_SIZE The irq stacks, located in the percpu-area, need to be THREAD_SIZE aligned. Add the infrastucture to align percpu variables to larger-than-pagesize amounts within the percpu area, and use it to specify the alignment for the irq stacks. Also align the percpu area itself to THREAD_SIZE. This should make irq stacks work with 8K THREAD_SIZE. 
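[Editor's note, not part of the patch: a user-space sketch of why THREAD_SIZE alignment matters for the irq stacks. Stack walkers typically recover the base of a stack by masking a stack pointer with ~(THREAD_SIZE - 1); that only lands on the stack itself if the stack starts on a THREAD_SIZE boundary. The constant and the masking idiom are standard; the array and main() are illustrative.]

    #include <stdint.h>
    #include <stdio.h>

    #define THREAD_SIZE 8192UL   /* 8K stacks, i.e. THREAD_ORDER == 1 */

    static uintptr_t stack_base(uintptr_t sp)
    {
            return sp & ~(THREAD_SIZE - 1);  /* assumes an aligned stack */
    }

    int main(void)
    {
            static _Alignas(THREAD_SIZE) unsigned char irq_stack[THREAD_SIZE];
            uintptr_t sp = (uintptr_t)&irq_stack[THREAD_SIZE - 64];

            printf("base recovered correctly: %d\n",
                   stack_base(sp) == (uintptr_t)irq_stack);
            return 0;
    }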
Signed-off-by: Alexander van Heukelum Cc: Tejun Heo Cc: hch@lst.de LKML-Reference: <1283799222.15941.1393621887@webmail.messagingengine.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/irq_32.c | 4 ++-- arch/x86/kernel/vmlinux.lds.S | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 3b5609f54c4..50fbbe60e50 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -60,8 +60,8 @@ union irq_ctx { static DEFINE_PER_CPU(union irq_ctx *, hardirq_ctx); static DEFINE_PER_CPU(union irq_ctx *, softirq_ctx); -static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, hardirq_stack); -static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, softirq_stack); +static DEFINE_PER_CPU_MULTIPAGE_ALIGNED(union irq_ctx, hardirq_stack, THREAD_SIZE); +static DEFINE_PER_CPU_MULTIPAGE_ALIGNED(union irq_ctx, softirq_stack, THREAD_SIZE); static void call_on_stack(void *func, void *stack) { diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index d0bb52296fa..bb899475355 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -273,7 +273,7 @@ SECTIONS } #if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP) - PERCPU(PAGE_SIZE) + PERCPU(THREAD_SIZE) #endif . = ALIGN(PAGE_SIZE); -- cgit v1.2.3-18-g5258 From 0f1cf415f00286a38f5ce35b459342dbfc895b50 Mon Sep 17 00:00:00 2001 From: Christian Dietrich Date: Mon, 6 Sep 2010 16:36:18 +0200 Subject: x86: Remove unnecessary #ifdef ACPI/X86_IO_ACPI The ACPI/X86_IO_ACPI ifdef isn't necessary at this point, because it is checked in an outer ifdef level already and has no effect here. Cleanup only, no functional effect. Signed-off-by: Christian Dietrich Acked-by: Borislav Petkov Cc: vamos-dev@i4.informatik.uni-erlangen.de LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/early-quirks.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index e5cc7e82e60..e4bd78c160d 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -97,7 +97,6 @@ static void __init nvidia_bugs(int num, int slot, int func) } -#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC) #if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC) static u32 __init ati_ixp4x0_rev(int num, int slot, int func) { @@ -116,7 +115,6 @@ static u32 __init ati_ixp4x0_rev(int num, int slot, int func) d &= 0xff; return d; } -#endif static void __init ati_bugs(int num, int slot, int func) { -- cgit v1.2.3-18-g5258 From 7ef8aa72ab176e0288f363d1247079732c5d5792 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Mon, 6 Sep 2010 15:14:17 +0200 Subject: x86, cpu: Fix renamed, not-yet-shipping AMD CPUID feature bit The AMD SSE5 feature set as-it has been replaced by some extensions to the AVX instruction set. Thus the bit formerly advertised as SSE5 is re-used for one of these extensions (XOP). Although this changes the /proc/cpuinfo output, it is not user visible, as there are no CPUs (yet) having this feature. To avoid confusion this should be added to the stable series, too. Cc: stable@kernel.org [.32.x .34.x, .35.x] Signed-off-by: Andre Przywara LKML-Reference: <1283778860-26843-2-git-send-email-andre.przywara@amd.com> Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/cpufeature.h | 2 +- arch/x86/kvm/x86.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 781a50b29a4..c9c73d8dedb 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -152,7 +152,7 @@ #define X86_FEATURE_3DNOWPREFETCH (6*32+ 8) /* 3DNow prefetch instructions */ #define X86_FEATURE_OSVW (6*32+ 9) /* OS Visible Workaround */ #define X86_FEATURE_IBS (6*32+10) /* Instruction Based Sampling */ -#define X86_FEATURE_SSE5 (6*32+11) /* SSE-5 */ +#define X86_FEATURE_XOP (6*32+11) /* extended AVX instructions */ #define X86_FEATURE_SKINIT (6*32+12) /* SKINIT/STGI instructions */ #define X86_FEATURE_WDT (6*32+13) /* Watchdog timer */ #define X86_FEATURE_NODEID_MSR (6*32+19) /* NodeId MSR */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3a09c625d52..dd54779ccbe 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1996,7 +1996,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, const u32 kvm_supported_word6_x86_features = F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ | F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | - F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(SSE5) | + F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(XOP) | 0 /* SKINIT */ | 0 /* WDT */; /* all calls to cpuid_count() should be made on the same cpu */ -- cgit v1.2.3-18-g5258 From 33ed82fb6c5f032151f7e9f1ac7b667f78f426b8 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Mon, 6 Sep 2010 15:14:18 +0200 Subject: x86, cpu: Update AMD CPUID feature bits AMD's public CPUID specification has been updated and some bits have got names. Add them to properly describe new CPU features. Signed-off-by: Andre Przywara LKML-Reference: <1283778860-26843-3-git-send-email-andre.przywara@amd.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/cpufeature.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index c9c73d8dedb..341835df789 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -155,7 +155,11 @@ #define X86_FEATURE_XOP (6*32+11) /* extended AVX instructions */ #define X86_FEATURE_SKINIT (6*32+12) /* SKINIT/STGI instructions */ #define X86_FEATURE_WDT (6*32+13) /* Watchdog timer */ +#define X86_FEATURE_LWP (6*32+15) /* Light Weight Profiling */ +#define X86_FEATURE_FMA4 (6*32+16) /* 4 operands MAC instructions */ #define X86_FEATURE_NODEID_MSR (6*32+19) /* NodeId MSR */ +#define X86_FEATURE_TBM (6*32+21) /* trailing bit manipulations */ +#define X86_FEATURE_TOPOEXT (6*32+22) /* topology extensions CPUID leafs */ /* * Auxiliary flags: Linux defined - For features scattered in various -- cgit v1.2.3-18-g5258 From 6d886fd042634c0d3312bace63a5d0c541b721dc Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Mon, 6 Sep 2010 15:14:19 +0200 Subject: x86, cpu: Fix allowed CPUID bits for KVM guests The AMD extensions to AVX (FMA4, XOP) work on the same YMM register set as AVX, so they are safe for guests to use, as long as AVX itself is allowed. Add F16C and AES on the way for the same reasons. Signed-off-by: Andre Przywara LKML-Reference: <1283778860-26843-4-git-send-email-andre.przywara@amd.com> Acked-by: Avi Kivity Signed-off-by: H. 
Peter Anvin --- arch/x86/kvm/x86.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index dd54779ccbe..6c2ecf0a806 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1991,13 +1991,14 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ | 0 /* Reserved, DCA */ | F(XMM4_1) | F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | - 0 /* Reserved, AES */ | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX); + 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) | + F(F16C); /* cpuid 0x80000001.ecx */ const u32 kvm_supported_word6_x86_features = F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ | F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(XOP) | - 0 /* SKINIT */ | 0 /* WDT */; + 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM); /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); -- cgit v1.2.3-18-g5258 From aeb9c7d618264dcf6eea39142fefee096c3b09e2 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Mon, 6 Sep 2010 15:14:20 +0200 Subject: x86, kvm: add new AMD SVM feature bits The recently updated CPUID specification names new SVM feature bits. Add them to the list of reported features. Signed-off-by: Andre Przywara LKML-Reference: <1283778860-26843-5-git-send-email-andre.przywara@amd.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/cpufeature.h | 7 +++++++ arch/x86/kernel/cpu/scattered.c | 6 ++++++ 2 files changed, 13 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 341835df789..bffeab7eab9 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -183,6 +183,13 @@ #define X86_FEATURE_LBRV (8*32+ 6) /* AMD LBR Virtualization support */ #define X86_FEATURE_SVML (8*32+ 7) /* "svm_lock" AMD SVM locking MSR */ #define X86_FEATURE_NRIPS (8*32+ 8) /* "nrip_save" AMD SVM next_rip save */ +#define X86_FEATURE_TSCRATEMSR (8*32+ 9) /* "tsc_scale" AMD TSC scaling support */ +#define X86_FEATURE_VMCBCLEAN (8*32+10) /* "vmcb_clean" AMD VMCB clean bits support */ +#define X86_FEATURE_FLUSHBYASID (8*32+11) /* AMD flush-by-ASID support */ +#define X86_FEATURE_DECODEASSISTS (8*32+12) /* AMD Decode Assists support */ +#define X86_FEATURE_PAUSEFILTER (8*32+13) /* AMD filtered pause intercept */ +#define X86_FEATURE_PFTHRESHOLD (8*32+14) /* AMD pause filter threshold */ + /* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */ #define X86_FEATURE_FSGSBASE (9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/ diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 34b4dad6f0b..2c77931473f 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -43,6 +43,12 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 }, { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a, 0 }, { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a, 0 }, + { X86_FEATURE_TSCRATEMSR, CR_EDX, 4, 0x8000000a, 0 }, + { X86_FEATURE_VMCBCLEAN, CR_EDX, 5, 0x8000000a, 0 }, + { X86_FEATURE_FLUSHBYASID, CR_EDX, 6, 0x8000000a, 0 }, + { X86_FEATURE_DECODEASSISTS, CR_EDX, 7, 0x8000000a, 0 }, + { X86_FEATURE_PAUSEFILTER, CR_EDX,10, 0x8000000a, 0 }, + { X86_FEATURE_PFTHRESHOLD, CR_EDX,12, 0x8000000a, 0 }, { 0, 0, 0, 0, 0 } }; -- cgit v1.2.3-18-g5258 From 
37a2f9f30a360fb03522d15c85c78265ccd80287 Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Wed, 8 Sep 2010 10:14:27 -0500 Subject: x86, kdump: Change copy_oldmem_page() to use cached addressing The copy of /proc/vmcore to a user buffer proceeds much faster if the kernel addresses memory as cached. With this patch we have seen an increase in transfer rate from less than 15MB/s to 80-460MB/s, depending on size of the transfer. This makes a big difference in time needed to save a system dump. Signed-off-by: Cliff Wickman Acked-by: "Eric W. Biederman" Cc: kexec@lists.infradead.org Cc: # as far back as it would apply LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/crash_dump_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c index 045b36cada6..bf43188ca65 100644 --- a/arch/x86/kernel/crash_dump_64.c +++ b/arch/x86/kernel/crash_dump_64.c @@ -34,7 +34,7 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, if (!csize) return 0; - vaddr = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE); + vaddr = ioremap_cache(pfn << PAGE_SHIFT, PAGE_SIZE); if (!vaddr) return -ENOMEM; -- cgit v1.2.3-18-g5258 From 51b0fe39549a04858001922919ab355dee9bdfcf Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 11 Jun 2010 13:35:57 +0200 Subject: perf: Deconstify struct pmu sed -ie 's/const struct pmu\>/struct pmu/g' `git grep -l "const struct pmu\>"` Signed-off-by: Peter Zijlstra Cc: paulus Cc: stephane eranian Cc: Robert Richter Cc: Will Deacon Cc: Paul Mundt Cc: Frederic Weisbecker Cc: Cyrill Gorcunov Cc: Lin Ming Cc: Yanmin Cc: Deng-Cheng Zhu Cc: David Miller Cc: Michael Cree LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index de6569c04cd..fdd97f2e996 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -618,7 +618,7 @@ static void x86_pmu_enable_all(int added) } } -static const struct pmu pmu; +static struct pmu pmu; static inline int is_x86_event(struct perf_event *event) { @@ -1427,7 +1427,7 @@ static inline void x86_pmu_read(struct perf_event *event) * Set the flag to make pmu::enable() not perform the * schedulability test, it will be performed at commit time */ -static void x86_pmu_start_txn(const struct pmu *pmu) +static void x86_pmu_start_txn(struct pmu *pmu) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); @@ -1440,7 +1440,7 @@ static void x86_pmu_start_txn(const struct pmu *pmu) * Clear the flag and pmu::enable() will perform the * schedulability test. 
*/ -static void x86_pmu_cancel_txn(const struct pmu *pmu) +static void x86_pmu_cancel_txn(struct pmu *pmu) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); @@ -1457,7 +1457,7 @@ static void x86_pmu_cancel_txn(const struct pmu *pmu) * Perform the group schedulability test as a whole * Return 0 if success */ -static int x86_pmu_commit_txn(const struct pmu *pmu) +static int x86_pmu_commit_txn(struct pmu *pmu) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); int assign[X86_PMC_IDX_MAX]; @@ -1483,7 +1483,7 @@ static int x86_pmu_commit_txn(const struct pmu *pmu) return 0; } -static const struct pmu pmu = { +static struct pmu pmu = { .enable = x86_pmu_enable, .disable = x86_pmu_disable, .start = x86_pmu_start, @@ -1569,9 +1569,9 @@ out: return ret; } -const struct pmu *hw_perf_event_init(struct perf_event *event) +struct pmu *hw_perf_event_init(struct perf_event *event) { - const struct pmu *tmp; + struct pmu *tmp; int err; err = __hw_perf_event_init(event); -- cgit v1.2.3-18-g5258 From b0a873ebbf87bf38bf70b5e39a7cadc96099fa13 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 11 Jun 2010 13:35:08 +0200 Subject: perf: Register PMU implementations Simple registration interface for struct pmu, this provides the infrastructure for removing all the weak functions. Signed-off-by: Peter Zijlstra Cc: paulus Cc: stephane eranian Cc: Robert Richter Cc: Will Deacon Cc: Paul Mundt Cc: Frederic Weisbecker Cc: Cyrill Gorcunov Cc: Lin Ming Cc: Yanmin Cc: Deng-Cheng Zhu Cc: David Miller Cc: Michael Cree LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 45 +++++++++++++++++++++++++--------------- 1 file changed, 28 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index fdd97f2e996..2c89264ee79 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -530,7 +530,7 @@ static int x86_pmu_hw_config(struct perf_event *event) /* * Setup the hardware configuration for a given attr_type */ -static int __hw_perf_event_init(struct perf_event *event) +static int __x86_pmu_event_init(struct perf_event *event) { int err; @@ -1414,6 +1414,7 @@ void __init init_hw_perf_events(void) pr_info("... fixed-purpose events: %d\n", x86_pmu.num_counters_fixed); pr_info("... 
event mask: %016Lx\n", x86_pmu.intel_ctrl); + perf_pmu_register(&pmu); perf_cpu_notifier(x86_pmu_notifier); } @@ -1483,18 +1484,6 @@ static int x86_pmu_commit_txn(struct pmu *pmu) return 0; } -static struct pmu pmu = { - .enable = x86_pmu_enable, - .disable = x86_pmu_disable, - .start = x86_pmu_start, - .stop = x86_pmu_stop, - .read = x86_pmu_read, - .unthrottle = x86_pmu_unthrottle, - .start_txn = x86_pmu_start_txn, - .cancel_txn = x86_pmu_cancel_txn, - .commit_txn = x86_pmu_commit_txn, -}; - /* * validate that we can schedule this event */ @@ -1569,12 +1558,22 @@ out: return ret; } -struct pmu *hw_perf_event_init(struct perf_event *event) +int x86_pmu_event_init(struct perf_event *event) { struct pmu *tmp; int err; - err = __hw_perf_event_init(event); + switch (event->attr.type) { + case PERF_TYPE_RAW: + case PERF_TYPE_HARDWARE: + case PERF_TYPE_HW_CACHE: + break; + + default: + return -ENOENT; + } + + err = __x86_pmu_event_init(event); if (!err) { /* * we temporarily connect event to its pmu @@ -1594,12 +1593,24 @@ struct pmu *hw_perf_event_init(struct perf_event *event) if (err) { if (event->destroy) event->destroy(event); - return ERR_PTR(err); } - return &pmu; + return err; } +static struct pmu pmu = { + .event_init = x86_pmu_event_init, + .enable = x86_pmu_enable, + .disable = x86_pmu_disable, + .start = x86_pmu_start, + .stop = x86_pmu_stop, + .read = x86_pmu_read, + .unthrottle = x86_pmu_unthrottle, + .start_txn = x86_pmu_start_txn, + .cancel_txn = x86_pmu_cancel_txn, + .commit_txn = x86_pmu_commit_txn, +}; + /* * callchain support */ -- cgit v1.2.3-18-g5258 From 24cd7f54a0d47e1d5b3de29e2456bfbd2d8447b7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 11 Jun 2010 17:32:03 +0200 Subject: perf: Reduce perf_disable() usage Since the current perf_disable() usage is only an optimization, remove it for now. This eases the removal of the __weak hw_perf_enable() interface. 
Signed-off-by: Peter Zijlstra Cc: paulus Cc: stephane eranian Cc: Robert Richter Cc: Will Deacon Cc: Paul Mundt Cc: Frederic Weisbecker Cc: Cyrill Gorcunov Cc: Lin Ming Cc: Yanmin Cc: Deng-Cheng Zhu Cc: David Miller Cc: Michael Cree LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 2c89264ee79..846070ce49c 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -969,10 +969,11 @@ static int x86_pmu_enable(struct perf_event *event) hwc = &event->hw; + perf_disable(); n0 = cpuc->n_events; - n = collect_events(cpuc, event, false); - if (n < 0) - return n; + ret = n = collect_events(cpuc, event, false); + if (ret < 0) + goto out; /* * If group events scheduling transaction was started, @@ -980,23 +981,26 @@ static int x86_pmu_enable(struct perf_event *event) * at commit time(->commit_txn) as a whole */ if (cpuc->group_flag & PERF_EVENT_TXN) - goto out; + goto done_collect; ret = x86_pmu.schedule_events(cpuc, n, assign); if (ret) - return ret; + goto out; /* * copy new assignment, now we know it is possible * will be used by hw_perf_enable() */ memcpy(cpuc->assign, assign, n*sizeof(int)); -out: +done_collect: cpuc->n_events = n; cpuc->n_added += n - n0; cpuc->n_txn += n - n0; - return 0; + ret = 0; +out: + perf_enable(); + return ret; } static int x86_pmu_start(struct perf_event *event) @@ -1432,6 +1436,7 @@ static void x86_pmu_start_txn(struct pmu *pmu) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + perf_disable(); cpuc->group_flag |= PERF_EVENT_TXN; cpuc->n_txn = 0; } @@ -1451,6 +1456,7 @@ static void x86_pmu_cancel_txn(struct pmu *pmu) */ cpuc->n_added -= cpuc->n_txn; cpuc->n_events -= cpuc->n_txn; + perf_enable(); } /* @@ -1480,7 +1486,7 @@ static int x86_pmu_commit_txn(struct pmu *pmu) memcpy(cpuc->assign, assign, n*sizeof(int)); cpuc->group_flag &= ~PERF_EVENT_TXN; - + perf_enable(); return 0; } -- cgit v1.2.3-18-g5258 From 33696fc0d141bbbcb12f75b69608ea83282e3117 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 14 Jun 2010 08:49:00 +0200 Subject: perf: Per PMU disable Changes perf_disable() into perf_pmu_disable(). 
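[Editor's note, not part of the patch: an illustrative, non-kernel C sketch of the bracketing pattern this change introduces. Operations that reprogram counters run with only the affected PMU disabled, via callbacks carried in struct pmu, instead of a global perf_disable(). All names here are toy stand-ins.]

    #include <stdio.h>

    struct pmu {
            void (*pmu_disable)(struct pmu *pmu);
            void (*pmu_enable)(struct pmu *pmu);
    };

    static void toy_disable(struct pmu *pmu) { (void)pmu; puts("PMU quiesced"); }
    static void toy_enable(struct pmu *pmu)  { (void)pmu; puts("PMU re-enabled"); }

    static void add_event(struct pmu *pmu)
    {
            pmu->pmu_disable(pmu);          /* quiesce only this PMU */
            puts("collect and schedule the new event");
            pmu->pmu_enable(pmu);           /* counters reprogrammed on enable */
    }

    int main(void)
    {
            struct pmu pmu = { .pmu_disable = toy_disable,
                               .pmu_enable  = toy_enable };
            add_event(&pmu);
            return 0;
    }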
Signed-off-by: Peter Zijlstra Cc: paulus Cc: stephane eranian Cc: Robert Richter Cc: Will Deacon Cc: Paul Mundt Cc: Frederic Weisbecker Cc: Cyrill Gorcunov Cc: Lin Ming Cc: Yanmin Cc: Deng-Cheng Zhu Cc: David Miller Cc: Michael Cree LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 846070ce49c..79705ac4501 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -583,7 +583,7 @@ static void x86_pmu_disable_all(void) } } -void hw_perf_disable(void) +static void x86_pmu_pmu_disable(struct pmu *pmu) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); @@ -803,7 +803,7 @@ static inline int match_prev_assignment(struct hw_perf_event *hwc, static int x86_pmu_start(struct perf_event *event); static void x86_pmu_stop(struct perf_event *event); -void hw_perf_enable(void) +static void x86_pmu_pmu_enable(struct pmu *pmu) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct perf_event *event; @@ -969,7 +969,7 @@ static int x86_pmu_enable(struct perf_event *event) hwc = &event->hw; - perf_disable(); + perf_pmu_disable(event->pmu); n0 = cpuc->n_events; ret = n = collect_events(cpuc, event, false); if (ret < 0) @@ -999,7 +999,7 @@ done_collect: ret = 0; out: - perf_enable(); + perf_pmu_enable(event->pmu); return ret; } @@ -1436,7 +1436,7 @@ static void x86_pmu_start_txn(struct pmu *pmu) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - perf_disable(); + perf_pmu_disable(pmu); cpuc->group_flag |= PERF_EVENT_TXN; cpuc->n_txn = 0; } @@ -1456,7 +1456,7 @@ static void x86_pmu_cancel_txn(struct pmu *pmu) */ cpuc->n_added -= cpuc->n_txn; cpuc->n_events -= cpuc->n_txn; - perf_enable(); + perf_pmu_enable(pmu); } /* @@ -1486,7 +1486,7 @@ static int x86_pmu_commit_txn(struct pmu *pmu) memcpy(cpuc->assign, assign, n*sizeof(int)); cpuc->group_flag &= ~PERF_EVENT_TXN; - perf_enable(); + perf_pmu_enable(pmu); return 0; } @@ -1605,6 +1605,8 @@ int x86_pmu_event_init(struct perf_event *event) } static struct pmu pmu = { + .pmu_enable = x86_pmu_pmu_enable, + .pmu_disable = x86_pmu_pmu_disable, .event_init = x86_pmu_event_init, .enable = x86_pmu_enable, .disable = x86_pmu_disable, -- cgit v1.2.3-18-g5258 From a4eaf7f14675cb512d69f0c928055e73d0c6d252 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Jun 2010 14:37:10 +0200 Subject: perf: Rework the PMU methods Replace pmu::{enable,disable,start,stop,unthrottle} with pmu::{add,del,start,stop}, all of which take a flags argument. The new interface extends the capability to stop a counter while keeping it scheduled on the PMU. We replace the throttled state with the generic stopped state. This also allows us to efficiently stop/start counters over certain code paths (like IRQ handlers). It also allows scheduling a counter without it starting, allowing for a generic frozen state (useful for rotating stopped counters). 
The stopped state is implemented in two different ways, depending on how the architecture implemented the throttled state: 1) We disable the counter: a) the pmu has per-counter enable bits, we flip that b) we program a NOP event, preserving the counter state 2) We store the counter state and ignore all read/overflow events Signed-off-by: Peter Zijlstra Cc: paulus Cc: stephane eranian Cc: Robert Richter Cc: Will Deacon Cc: Paul Mundt Cc: Frederic Weisbecker Cc: Cyrill Gorcunov Cc: Lin Ming Cc: Yanmin Cc: Deng-Cheng Zhu Cc: David Miller Cc: Michael Cree LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 106 +++++++++++++++++------------- arch/x86/kernel/cpu/perf_event_intel.c | 2 +- arch/x86/kernel/cpu/perf_event_intel_ds.c | 2 +- 3 files changed, 63 insertions(+), 47 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 79705ac4501..dd6fec71067 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -583,7 +583,7 @@ static void x86_pmu_disable_all(void) } } -static void x86_pmu_pmu_disable(struct pmu *pmu) +static void x86_pmu_disable(struct pmu *pmu) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); @@ -800,10 +800,10 @@ static inline int match_prev_assignment(struct hw_perf_event *hwc, hwc->last_tag == cpuc->tags[i]; } -static int x86_pmu_start(struct perf_event *event); -static void x86_pmu_stop(struct perf_event *event); +static void x86_pmu_start(struct perf_event *event, int flags); +static void x86_pmu_stop(struct perf_event *event, int flags); -static void x86_pmu_pmu_enable(struct pmu *pmu) +static void x86_pmu_enable(struct pmu *pmu) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct perf_event *event; @@ -839,7 +839,14 @@ static void x86_pmu_pmu_enable(struct pmu *pmu) match_prev_assignment(hwc, cpuc, i)) continue; - x86_pmu_stop(event); + /* + * Ensure we don't accidentally enable a stopped + * counter simply because we rescheduled. + */ + if (hwc->state & PERF_HES_STOPPED) + hwc->state |= PERF_HES_ARCH; + + x86_pmu_stop(event, PERF_EF_UPDATE); } for (i = 0; i < cpuc->n_events; i++) { @@ -851,7 +858,10 @@ static void x86_pmu_pmu_enable(struct pmu *pmu) else if (i < n_running) continue; - x86_pmu_start(event); + if (hwc->state & PERF_HES_ARCH) + continue; + + x86_pmu_start(event, PERF_EF_RELOAD); } cpuc->n_added = 0; perf_events_lapic_init(); @@ -952,15 +962,12 @@ static void x86_pmu_enable_event(struct perf_event *event) } /* - * activate a single event + * Add a single event to the PMU. * * The event is added to the group of enabled events * but only if it can be scehduled with existing events. - * - * Called with PMU disabled. 
If successful and return value 1, - * then guaranteed to call perf_enable() and hw_perf_enable() */ -static int x86_pmu_enable(struct perf_event *event) +static int x86_pmu_add(struct perf_event *event, int flags) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct hw_perf_event *hwc; @@ -975,10 +982,14 @@ static int x86_pmu_enable(struct perf_event *event) if (ret < 0) goto out; + hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + if (!(flags & PERF_EF_START)) + hwc->state |= PERF_HES_ARCH; + /* * If group events scheduling transaction was started, * skip the schedulability test here, it will be peformed - * at commit time(->commit_txn) as a whole + * at commit time (->commit_txn) as a whole */ if (cpuc->group_flag & PERF_EVENT_TXN) goto done_collect; @@ -1003,27 +1014,28 @@ out: return ret; } -static int x86_pmu_start(struct perf_event *event) +static void x86_pmu_start(struct perf_event *event, int flags) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); int idx = event->hw.idx; - if (idx == -1) - return -EAGAIN; + if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) + return; + + if (WARN_ON_ONCE(idx == -1)) + return; + + if (flags & PERF_EF_RELOAD) { + WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE)); + x86_perf_event_set_period(event); + } + + event->hw.state = 0; - x86_perf_event_set_period(event); cpuc->events[idx] = event; __set_bit(idx, cpuc->active_mask); x86_pmu.enable(event); perf_event_update_userpage(event); - - return 0; -} - -static void x86_pmu_unthrottle(struct perf_event *event) -{ - int ret = x86_pmu_start(event); - WARN_ON_ONCE(ret); } void perf_event_print_debug(void) @@ -1080,27 +1092,29 @@ void perf_event_print_debug(void) local_irq_restore(flags); } -static void x86_pmu_stop(struct perf_event *event) +static void x86_pmu_stop(struct perf_event *event, int flags) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct hw_perf_event *hwc = &event->hw; - int idx = hwc->idx; - - if (!__test_and_clear_bit(idx, cpuc->active_mask)) - return; - - x86_pmu.disable(event); - /* - * Drain the remaining delta count out of a event - * that we are disabling: - */ - x86_perf_event_update(event); + if (__test_and_clear_bit(hwc->idx, cpuc->active_mask)) { + x86_pmu.disable(event); + cpuc->events[hwc->idx] = NULL; + WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); + hwc->state |= PERF_HES_STOPPED; + } - cpuc->events[idx] = NULL; + if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { + /* + * Drain the remaining delta count out of a event + * that we are disabling: + */ + x86_perf_event_update(event); + hwc->state |= PERF_HES_UPTODATE; + } } -static void x86_pmu_disable(struct perf_event *event) +static void x86_pmu_del(struct perf_event *event, int flags) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); int i; @@ -1113,7 +1127,7 @@ static void x86_pmu_disable(struct perf_event *event) if (cpuc->group_flag & PERF_EVENT_TXN) return; - x86_pmu_stop(event); + x86_pmu_stop(event, PERF_EF_UPDATE); for (i = 0; i < cpuc->n_events; i++) { if (event == cpuc->event_list[i]) { @@ -1165,7 +1179,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs) continue; if (perf_event_overflow(event, 1, &data, regs)) - x86_pmu_stop(event); + x86_pmu_stop(event, 0); } if (handled) @@ -1605,15 +1619,17 @@ int x86_pmu_event_init(struct perf_event *event) } static struct pmu pmu = { - .pmu_enable = x86_pmu_pmu_enable, - .pmu_disable = x86_pmu_pmu_disable, + .pmu_enable = x86_pmu_enable, + .pmu_disable = x86_pmu_disable, + .event_init 
= x86_pmu_event_init, - .enable = x86_pmu_enable, - .disable = x86_pmu_disable, + + .add = x86_pmu_add, + .del = x86_pmu_del, .start = x86_pmu_start, .stop = x86_pmu_stop, .read = x86_pmu_read, - .unthrottle = x86_pmu_unthrottle, + .start_txn = x86_pmu_start_txn, .cancel_txn = x86_pmu_cancel_txn, .commit_txn = x86_pmu_commit_txn, diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index ee05c90012d..82395f2378e 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -763,7 +763,7 @@ again: data.period = event->hw.last_period; if (perf_event_overflow(event, 1, &data, regs)) - x86_pmu_stop(event); + x86_pmu_stop(event, 0); } /* diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 18018d1311c..9893a2f77b7 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -491,7 +491,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event, regs.flags &= ~PERF_EFLAGS_EXACT; if (perf_event_overflow(event, 1, &data, ®s)) - x86_pmu_stop(event); + x86_pmu_stop(event, 0); } static void intel_pmu_drain_pebs_core(struct pt_regs *iregs) -- cgit v1.2.3-18-g5258 From 15ac9a395a753cb28c674e7ea80386ffdff21785 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Sep 2010 15:51:45 +0200 Subject: perf: Remove the sysfs bits Neither the overcommit nor the reservation sysfs parameter were actually working, remove them as they'll only get in the way. Signed-off-by: Peter Zijlstra Cc: paulus LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index dd6fec71067..0fb17050360 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1396,7 +1396,6 @@ void __init init_hw_perf_events(void) x86_pmu.num_counters = X86_PMC_MAX_GENERIC; } x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1; - perf_max_events = x86_pmu.num_counters; if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) { WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!", -- cgit v1.2.3-18-g5258 From 2df7a6e9e8e67c19e5fe2eac3f2d2223b7bb4a7b Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Fri, 3 Sep 2010 21:17:08 -0400 Subject: x86: Use correct type for %cr4 %cr4 is 64-bit in 64-bit mode (although the upper 32-bits are currently reserved). Use unsigned long for the temporary variable to get the right size. Signed-off-by: Brian Gerst Acked-by: Pekka Enberg Cc: Suresh Siddha LKML-Reference: <1283563039-3466-2-git-send-email-brgerst@gmail.com> Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/processor.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 325b7bdbeba..396b80f2d87 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -602,7 +602,7 @@ extern unsigned long mmu_cr4_features; static inline void set_in_cr4(unsigned long mask) { - unsigned cr4; + unsigned long cr4; mmu_cr4_features |= mask; cr4 = read_cr4(); @@ -612,7 +612,7 @@ static inline void set_in_cr4(unsigned long mask) static inline void clear_in_cr4(unsigned long mask) { - unsigned cr4; + unsigned long cr4; mmu_cr4_features &= ~mask; cr4 = read_cr4(); -- cgit v1.2.3-18-g5258 From 6ac8bac2684235f4caf22a410549c582aa7327d6 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Fri, 3 Sep 2010 21:17:09 -0400 Subject: x86, fpu: Merge fpu_init() Make fpu_init() handle 32-bit setup. Signed-off-by: Brian Gerst Acked-by: Pekka Enberg Cc: Suresh Siddha LKML-Reference: <1283563039-3466-3-git-send-email-brgerst@gmail.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/common.c | 7 ------- arch/x86/kernel/i387.c | 30 ++++++++++++++---------------- arch/x86/kernel/traps.c | 12 ------------ 3 files changed, 14 insertions(+), 35 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 490dac63c2d..f9e23e8639e 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1264,13 +1264,6 @@ void __cpuinit cpu_init(void) clear_all_debug_regs(); dbg_restore_debug_regs(); - /* - * Force FPU initialization: - */ - current_thread_info()->status = 0; - clear_used_math(); - mxcsr_feature_mask_init(); - fpu_init(); xsave_init(); } diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 2605c50b11d..82166519497 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -85,7 +85,6 @@ static void __cpuinit init_thread_xstate(void) #endif } -#ifdef CONFIG_X86_64 /* * Called at bootup to set up the initial FPU state that is later cloned * into all processes. @@ -93,12 +92,21 @@ static void __cpuinit init_thread_xstate(void) void __cpuinit fpu_init(void) { - unsigned long oldcr0 = read_cr0(); - - set_in_cr4(X86_CR4_OSFXSR); - set_in_cr4(X86_CR4_OSXMMEXCPT); + unsigned long cr0; + unsigned long cr4_mask = 0; - write_cr0(oldcr0 & ~(X86_CR0_TS|X86_CR0_EM)); /* clear TS and EM */ + if (cpu_has_fxsr) + cr4_mask |= X86_CR4_OSFXSR; + if (cpu_has_xmm) + cr4_mask |= X86_CR4_OSXMMEXCPT; + if (cr4_mask) + set_in_cr4(cr4_mask); + + cr0 = read_cr0(); + cr0 &= ~(X86_CR0_TS|X86_CR0_EM); /* clear TS and EM */ + if (!HAVE_HWFP) + cr0 |= X86_CR0_EM; + write_cr0(cr0); if (!smp_processor_id()) init_thread_xstate(); @@ -109,16 +117,6 @@ void __cpuinit fpu_init(void) clear_used_math(); } -#else /* CONFIG_X86_64 */ - -void __cpuinit fpu_init(void) -{ - if (!smp_processor_id()) - init_thread_xstate(); -} - -#endif /* CONFIG_X86_32 */ - void fpu_finit(struct fpu *fpu) { #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 60788dee0f8..d0029eb5858 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -881,18 +881,6 @@ void __init trap_init(void) #endif #ifdef CONFIG_X86_32 - if (cpu_has_fxsr) { - printk(KERN_INFO "Enabling fast FPU save and restore... "); - set_in_cr4(X86_CR4_OSFXSR); - printk("done.\n"); - } - if (cpu_has_xmm) { - printk(KERN_INFO - "Enabling unmasked SIMD FPU exception support... 
"); - set_in_cr4(X86_CR4_OSXMMEXCPT); - printk("done.\n"); - } - set_system_trap_gate(SYSCALL_VECTOR, &system_call); set_bit(SYSCALL_VECTOR, used_vectors); #endif -- cgit v1.2.3-18-g5258 From 51115d4d45700fc7c08306f7ba6e68551f526ae5 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Fri, 3 Sep 2010 21:17:10 -0400 Subject: x86, fpu: Merge tolerant_fwait() Commit e2e75c91 merged the math exception handler, allowing both 32-bit and 64-bit to handle math exceptions from kernel mode. Switch to using the 64-bit version of tolerant_fwait() without fnclex, which simply ignores the exception if one is still pending from userspace. Signed-off-by: Brian Gerst Acked-by: Pekka Enberg Cc: Suresh Siddha LKML-Reference: <1283563039-3466-4-git-send-email-brgerst@gmail.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/i387.h | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index a73a8d5a5e6..5d8f9a79fa5 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -77,15 +77,6 @@ static inline void sanitize_i387_state(struct task_struct *tsk) } #ifdef CONFIG_X86_64 - -/* Ignore delayed exceptions from user space */ -static inline void tolerant_fwait(void) -{ - asm volatile("1: fwait\n" - "2:\n" - _ASM_EXTABLE(1b, 2b)); -} - static inline int fxrstor_checking(struct i387_fxsave_struct *fx) { int err; @@ -220,11 +211,6 @@ extern void finit_soft_fpu(struct i387_soft_struct *soft); static inline void finit_soft_fpu(struct i387_soft_struct *soft) {} #endif -static inline void tolerant_fwait(void) -{ - asm volatile("fnclex ; fwait"); -} - /* perform fxrstor iff the processor has extended states, otherwise frstor */ static inline int fxrstor_checking(struct i387_fxsave_struct *fx) { @@ -344,7 +330,10 @@ static inline void __unlazy_fpu(struct task_struct *tsk) static inline void __clear_fpu(struct task_struct *tsk) { if (task_thread_info(tsk)->status & TS_USEDFPU) { - tolerant_fwait(); + /* Ignore delayed exceptions from user space */ + asm volatile("1: fwait\n" + "2:\n" + _ASM_EXTABLE(1b, 2b)); task_thread_info(tsk)->status &= ~TS_USEDFPU; stts(); } -- cgit v1.2.3-18-g5258 From bfd946cb891800d408decaae268a3480775178a3 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Fri, 3 Sep 2010 21:17:11 -0400 Subject: x86, fpu: Merge __save_init_fpu() __save_init_fpu() is identical for 32-bit and 64-bit. Signed-off-by: Brian Gerst Acked-by: Pekka Enberg Cc: Suresh Siddha LKML-Reference: <1283563039-3466-5-git-send-email-brgerst@gmail.com> Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/i387.h | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 5d8f9a79fa5..88065e3a03c 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -197,12 +197,6 @@ static inline void fpu_save_init(struct fpu *fpu) fpu_clear(fpu); } -static inline void __save_init_fpu(struct task_struct *tsk) -{ - fpu_save_init(&tsk->thread.fpu); - task_thread_info(tsk)->status &= ~TS_USEDFPU; -} - #else /* CONFIG_X86_32 */ #ifdef CONFIG_MATH_EMULATION @@ -285,15 +279,14 @@ end: ; } +#endif /* CONFIG_X86_64 */ + static inline void __save_init_fpu(struct task_struct *tsk) { fpu_save_init(&tsk->thread.fpu); task_thread_info(tsk)->status &= ~TS_USEDFPU; } - -#endif /* CONFIG_X86_64 */ - static inline int fpu_fxrstor_checking(struct fpu *fpu) { return fxrstor_checking(&fpu->state->fxsave); -- cgit v1.2.3-18-g5258 From a4d4fbc7735bba6654b20f859135f9d3f8fe7f76 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Fri, 3 Sep 2010 21:17:12 -0400 Subject: x86-64, fpu: Disable preemption when using TS_USEDFPU Consolidates code and fixes the below race for 64-bit. commit 9fa2f37bfeb798728241cc4a19578ce6e4258f25 Author: torvalds Date: Tue Sep 2 07:37:25 2003 +0000 Be a lot more careful about TS_USEDFPU and preemption We had some races where we testecd (or set) TS_USEDFPU together with sequences that depended on the setting (like clearing or setting the TS flag in %cr0) and we could be preempted in between, which screws up the FPU state, since preemption will itself change USEDFPU and the TS flag. This makes it a lot more explicit: the "internal" low-level FPU functions ("__xxxx_fpu()") all require preemption to be disabled, and the exported "real" functions will make sure that is the case. One case - in __switch_to() - was switched to the non-preempt-safe internal version, since the scheduler itself has already disabled preemption. BKrev: 3f5448b5WRiQuyzAlbajs3qoQjSobw Signed-off-by: Brian Gerst Acked-by: Pekka Enberg Cc: Suresh Siddha LKML-Reference: <1283563039-3466-6-git-send-email-brgerst@gmail.com> Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/i387.h | 15 --------------- arch/x86/kernel/process_64.c | 2 +- 2 files changed, 1 insertion(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 88065e3a03c..8b40a8379cc 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -387,19 +387,6 @@ static inline void irq_ts_restore(int TS_state) stts(); } -#ifdef CONFIG_X86_64 - -static inline void save_init_fpu(struct task_struct *tsk) -{ - __save_init_fpu(tsk); - stts(); -} - -#define unlazy_fpu __unlazy_fpu -#define clear_fpu __clear_fpu - -#else /* CONFIG_X86_32 */ - /* * These disable preemption on their own and are safe */ @@ -425,8 +412,6 @@ static inline void clear_fpu(struct task_struct *tsk) preempt_enable(); } -#endif /* CONFIG_X86_64 */ - /* * i387 state interaction */ diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 3d9ea531ddd..b3d7a3a04f3 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -424,7 +424,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) load_TLS(next, cpu); /* Must be after DS reload */ - unlazy_fpu(prev_p); + __unlazy_fpu(prev_p); /* Make sure cpu is ready for new context */ if (preload_fpu) -- cgit v1.2.3-18-g5258 From 10c11f304986a1f84201c2261a428701f9d2dffc Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Fri, 3 Sep 2010 21:17:13 -0400 Subject: x86-64, fpu: Fix %cs value in convert_from_fxsr() While %ds still contains the userspace selector, %cs is KERNEL_CS at this point. Always get %cs from pt_regs even for the current task. Signed-off-by: Brian Gerst Acked-by: Pekka Enberg Cc: Suresh Siddha LKML-Reference: <1283563039-3466-7-git-send-email-brgerst@gmail.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/i387.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 82166519497..f3775f5ef4a 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -389,19 +389,17 @@ convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk) #ifdef CONFIG_X86_64 env->fip = fxsave->rip; env->foo = fxsave->rdp; + /* + * should be actually ds/cs at fpu exception time, but + * that information is not available in 64bit mode. + */ + env->fcs = task_pt_regs(tsk)->cs; if (tsk == current) { - /* - * should be actually ds/cs at fpu exception time, but - * that information is not available in 64bit mode. - */ - asm("mov %%ds, %[fos]" : [fos] "=r" (env->fos)); - asm("mov %%cs, %[fcs]" : [fcs] "=r" (env->fcs)); + savesegment(ds, env->fos); } else { - struct pt_regs *regs = task_pt_regs(tsk); - - env->fos = 0xffff0000 | tsk->thread.ds; - env->fcs = regs->cs; + env->fos = tsk->thread.ds; } + env->fos |= 0xffff0000; #else env->fip = fxsave->fip; env->fcs = (u16) fxsave->fcs | ((u32) fxsave->fop << 16); -- cgit v1.2.3-18-g5258 From 820241356d6aa9a895fc10def15794a5a5bfcd98 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Fri, 3 Sep 2010 21:17:14 -0400 Subject: x86-64, fpu: Simplify constraints for fxsave/fxtstor Use the "R" constraint (legacy register) instead of listing all the possible registers. Clean up the comments as well. Signed-off-by: Brian Gerst Acked-by: Pekka Enberg Cc: Suresh Siddha LKML-Reference: <1283563039-3466-8-git-send-email-brgerst@gmail.com> Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/i387.h | 44 +++++++++++++++++--------------------------- 1 file changed, 17 insertions(+), 27 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 8b40a8379cc..768fcb25900 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -81,6 +81,7 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx) { int err; + /* See comment in fxsave() below. */ asm volatile("1: rex64/fxrstor (%[fx])\n\t" "2:\n" ".section .fixup,\"ax\"\n" @@ -89,11 +90,7 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx) ".previous\n" _ASM_EXTABLE(1b, 3b) : [err] "=r" (err) -#if 0 /* See comment in fxsave() below. */ - : [fx] "r" (fx), "m" (*fx), "0" (0)); -#else - : [fx] "cdaSDb" (fx), "m" (*fx), "0" (0)); -#endif + : [fx] "R" (fx), "m" (*fx), "0" (0)); return err; } @@ -140,6 +137,7 @@ static inline int fxsave_user(struct i387_fxsave_struct __user *fx) if (unlikely(err)) return -EFAULT; + /* See comment in fxsave() below. */ asm volatile("1: rex64/fxsave (%[fx])\n\t" "2:\n" ".section .fixup,\"ax\"\n" @@ -148,11 +146,7 @@ static inline int fxsave_user(struct i387_fxsave_struct __user *fx) ".previous\n" _ASM_EXTABLE(1b, 3b) : [err] "=r" (err), "=m" (*fx) -#if 0 /* See comment in fxsave() below. */ - : [fx] "r" (fx), "0" (0)); -#else - : [fx] "cdaSDb" (fx), "0" (0)); -#endif + : [fx] "R" (fx), "0" (0)); if (unlikely(err) && __clear_user(fx, sizeof(struct i387_fxsave_struct))) err = -EFAULT; @@ -165,26 +159,22 @@ static inline void fpu_fxsave(struct fpu *fpu) /* Using "rex64; fxsave %0" is broken because, if the memory operand uses any extended registers for addressing, a second REX prefix will be generated (to the assembler, rex64 followed by semicolon - is a separate instruction), and hence the 64-bitness is lost. */ -#if 0 - /* Using "fxsaveq %0" would be the ideal choice, but is only supported - starting with gas 2.16. */ - __asm__ __volatile__("fxsaveq %0" - : "=m" (fpu->state->fxsave)); -#elif 0 - /* Using, as a workaround, the properly prefixed form below isn't + is a separate instruction), and hence the 64-bitness is lost. + Using "fxsaveq %0" would be the ideal choice, but is only supported + starting with gas 2.16. + asm volatile("fxsaveq %0" + : "=m" (fpu->state->fxsave)); + Using, as a workaround, the properly prefixed form below isn't accepted by any binutils version so far released, complaining that the same type of prefix is used twice if an extended register is - needed for addressing (fix submitted to mainline 2005-11-21). */ - __asm__ __volatile__("rex64/fxsave %0" - : "=m" (fpu->state->fxsave)); -#else - /* This, however, we can work around by forcing the compiler to select + needed for addressing (fix submitted to mainline 2005-11-21). + asm volatile("rex64/fxsave %0" + : "=m" (fpu->state->fxsave)); + This, however, we can work around by forcing the compiler to select an addressing mode that doesn't require extended registers. 
*/ - __asm__ __volatile__("rex64/fxsave (%1)" - : "=m" (fpu->state->fxsave) - : "cdaSDb" (&fpu->state->fxsave)); -#endif + asm volatile("rex64/fxsave (%[fx])" + : "=m" (fpu->state->fxsave) + : [fx] "R" (&fpu->state->fxsave)); } static inline void fpu_save_init(struct fpu *fpu) -- cgit v1.2.3-18-g5258 From a334fe43d85f570ae907acf988a053c5eff78d6e Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Fri, 3 Sep 2010 21:17:15 -0400 Subject: x86-32, fpu: Remove math_emulate stub check_fpu() in bugs.c halts boot if no FPU is found and math emulation isn't enabled. Therefore this stub will never be used. Signed-off-by: Brian Gerst Acked-by: Pekka Enberg Cc: Suresh Siddha LKML-Reference: <1283563039-3466-9-git-send-email-brgerst@gmail.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/traps.c | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index d0029eb5858..d43968503dd 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -776,21 +776,10 @@ asmlinkage void math_state_restore(void) } EXPORT_SYMBOL_GPL(math_state_restore); -#ifndef CONFIG_MATH_EMULATION -void math_emulate(struct math_emu_info *info) -{ - printk(KERN_EMERG - "math-emulation not enabled and no coprocessor found.\n"); - printk(KERN_EMERG "killing %s.\n", current->comm); - force_sig(SIGFPE, current); - schedule(); -} -#endif /* CONFIG_MATH_EMULATION */ - dotraplinkage void __kprobes do_device_not_available(struct pt_regs *regs, long error_code) { -#ifdef CONFIG_X86_32 +#ifdef CONFIG_MATH_EMULATION if (read_cr0() & X86_CR0_EM) { struct math_emu_info info = { }; @@ -798,12 +787,12 @@ do_device_not_available(struct pt_regs *regs, long error_code) info.regs = regs; math_emulate(&info); - } else { - math_state_restore(); /* interrupts still off */ - conditional_sti(regs); + return; } -#else - math_state_restore(); +#endif + math_state_restore(); /* interrupts still off */ +#ifdef CONFIG_X86_32 + conditional_sti(regs); #endif } -- cgit v1.2.3-18-g5258 From 8eb91a577d7763d21628f6761045328784b1911c Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Fri, 3 Sep 2010 21:17:16 -0400 Subject: x86, fpu: Remove unnecessary ifdefs from i387 code. Remove ifdefs for code that the compiler can optimize away on 64-bit. Signed-off-by: Brian Gerst Acked-by: Pekka Enberg Cc: Suresh Siddha LKML-Reference: <1283563039-3466-10-git-send-email-brgerst@gmail.com> Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/i387.h | 12 ++++++------ arch/x86/kernel/i387.c | 4 ---- 2 files changed, 6 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 768fcb25900..42b507e16d1 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -55,6 +55,12 @@ extern int save_i387_xstate_ia32(void __user *buf); extern int restore_i387_xstate_ia32(void __user *buf); #endif +#ifdef CONFIG_MATH_EMULATION +extern void finit_soft_fpu(struct i387_soft_struct *soft); +#else +static inline void finit_soft_fpu(struct i387_soft_struct *soft) {} +#endif + #define X87_FSW_ES (1 << 7) /* Exception Summary */ static __always_inline __pure bool use_xsaveopt(void) @@ -189,12 +195,6 @@ static inline void fpu_save_init(struct fpu *fpu) #else /* CONFIG_X86_32 */ -#ifdef CONFIG_MATH_EMULATION -extern void finit_soft_fpu(struct i387_soft_struct *soft); -#else -static inline void finit_soft_fpu(struct i387_soft_struct *soft) {} -#endif - /* perform fxrstor iff the processor has extended states, otherwise frstor */ static inline int fxrstor_checking(struct i387_fxsave_struct *fx) { diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index f3775f5ef4a..e795e360bd5 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -79,10 +79,8 @@ static void __cpuinit init_thread_xstate(void) if (cpu_has_fxsr) xstate_size = sizeof(struct i387_fxsave_struct); -#ifdef CONFIG_X86_32 else xstate_size = sizeof(struct i387_fsave_struct); -#endif } /* @@ -119,12 +117,10 @@ void __cpuinit fpu_init(void) void fpu_finit(struct fpu *fpu) { -#ifdef CONFIG_X86_32 if (!HAVE_HWFP) { finit_soft_fpu(&fpu->state->soft); return; } -#endif if (cpu_has_fxsr) { struct i387_fxsave_struct *fx = &fpu->state->fxsave; -- cgit v1.2.3-18-g5258 From eec73f813ab0954253e5e2168119c4555f83f07d Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Fri, 3 Sep 2010 21:17:17 -0400 Subject: x86, fpu: Remove PSHUFB_XMM5_* macros The PSHUFB_XMM5_* macros are no longer used. Signed-off-by: Brian Gerst Acked-by: Pekka Enberg Cc: Suresh Siddha LKML-Reference: <1283563039-3466-11-git-send-email-brgerst@gmail.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/i387.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 42b507e16d1..907967e4fa8 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -465,7 +465,4 @@ extern void fpu_finit(struct fpu *fpu); #endif /* __ASSEMBLY__ */ -#define PSHUFB_XMM5_XMM0 .byte 0x66, 0x0f, 0x38, 0x00, 0xc5 -#define PSHUFB_XMM5_XMM6 .byte 0x66, 0x0f, 0x38, 0x00, 0xf5 - #endif /* _ASM_X86_I387_H */ -- cgit v1.2.3-18-g5258 From 58a992b9cbaf449aeebd3575c3695a9eb5d95b5e Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Fri, 3 Sep 2010 21:17:18 -0400 Subject: x86-32, fpu: Rewrite fpu_save_init() Rewrite fpu_save_init() to prepare for merging with 64-bit. Signed-off-by: Brian Gerst Acked-by: Pekka Enberg Cc: Suresh Siddha LKML-Reference: <1283563039-3466-12-git-send-email-brgerst@gmail.com> Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/i387.h | 47 +++++++++++++++++++++------------------------ 1 file changed, 22 insertions(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 907967e4fa8..b45abefb89f 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -73,6 +73,11 @@ static __always_inline __pure bool use_xsave(void) return static_cpu_has(X86_FEATURE_XSAVE); } +static __always_inline __pure bool use_fxsr(void) +{ + return static_cpu_has(X86_FEATURE_FXSR); +} + extern void __sanitize_i387_state(struct task_struct *); static inline void sanitize_i387_state(struct task_struct *tsk) @@ -211,6 +216,12 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx) return 0; } +static inline void fpu_fxsave(struct fpu *fpu) +{ + asm volatile("fxsave %[fx]" + : [fx] "=m" (fpu->state->fxsave)); +} + /* We need a safe address that is cheap to find and that is already in L1 during context switch. The best choices are unfortunately different for UP and SMP */ @@ -226,36 +237,24 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx) static inline void fpu_save_init(struct fpu *fpu) { if (use_xsave()) { - struct xsave_struct *xstate = &fpu->state->xsave; - struct i387_fxsave_struct *fx = &fpu->state->fxsave; - fpu_xsave(fpu); /* * xsave header may indicate the init state of the FP. */ - if (!(xstate->xsave_hdr.xstate_bv & XSTATE_FP)) - goto end; - - if (unlikely(fx->swd & X87_FSW_ES)) - asm volatile("fnclex"); - - /* - * we can do a simple return here or be paranoid :) - */ - goto clear_state; + if (!(fpu->state->xsave.xsave_hdr.xstate_bv & XSTATE_FP)) + return; + } else if (use_fxsr()) { + fpu_fxsave(fpu); + } else { + asm volatile("fsave %[fx]; fwait" + : [fx] "=m" (fpu->state->fsave)); + return; } - /* Use more nops than strictly needed in case the compiler - varies code */ - alternative_input( - "fnsave %[fx] ;fwait;" GENERIC_NOP8 GENERIC_NOP4, - "fxsave %[fx]\n" - "bt $7,%[fsw] ; jnc 1f ; fnclex\n1:", - X86_FEATURE_FXSR, - [fx] "m" (fpu->state->fxsave), - [fsw] "m" (fpu->state->fxsave.swd) : "memory"); -clear_state: + if (unlikely(fpu->state->fxsave.swd & X87_FSW_ES)) + asm volatile("fnclex"); + /* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception is pending. Clear the x87 state here by setting it to fixed values. safe_address is a random variable that should be in L1 */ @@ -265,8 +264,6 @@ clear_state: "fildl %[addr]", /* set F?P to defined value */ X86_FEATURE_FXSAVE_LEAK, [addr] "m" (safe_address)); -end: - ; } #endif /* CONFIG_X86_64 */ -- cgit v1.2.3-18-g5258 From b2b57fe053c9cf8b8af5a0e826a465996afed0ff Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Fri, 3 Sep 2010 21:17:19 -0400 Subject: x86, fpu: Merge fpu_save_init() Make 64-bit use the 32-bit version of fpu_save_init(). Remove unused clear_fpu_state(). Signed-off-by: Brian Gerst Acked-by: Pekka Enberg Cc: Suresh Siddha LKML-Reference: <1283563039-3466-13-git-send-email-brgerst@gmail.com> Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/i387.h | 48 ++++----------------------------------------- 1 file changed, 4 insertions(+), 44 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index b45abefb89f..70626ed96cb 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -105,36 +105,6 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx) return err; } -/* AMD CPUs don't save/restore FDP/FIP/FOP unless an exception - is pending. Clear the x87 state here by setting it to fixed - values. The kernel data segment can be sometimes 0 and sometimes - new user value. Both should be ok. - Use the PDA as safe address because it should be already in L1. */ -static inline void fpu_clear(struct fpu *fpu) -{ - struct xsave_struct *xstate = &fpu->state->xsave; - struct i387_fxsave_struct *fx = &fpu->state->fxsave; - - /* - * xsave header may indicate the init state of the FP. - */ - if (use_xsave() && - !(xstate->xsave_hdr.xstate_bv & XSTATE_FP)) - return; - - if (unlikely(fx->swd & X87_FSW_ES)) - asm volatile("fnclex"); - alternative_input(ASM_NOP8 ASM_NOP2, - " emms\n" /* clear stack tags */ - " fildl %%gs:0", /* load to clear state */ - X86_FEATURE_FXSAVE_LEAK); -} - -static inline void clear_fpu_state(struct task_struct *tsk) -{ - fpu_clear(&tsk->thread.fpu); -} - static inline int fxsave_user(struct i387_fxsave_struct __user *fx) { int err; @@ -188,16 +158,6 @@ static inline void fpu_fxsave(struct fpu *fpu) : [fx] "R" (&fpu->state->fxsave)); } -static inline void fpu_save_init(struct fpu *fpu) -{ - if (use_xsave()) - fpu_xsave(fpu); - else - fpu_fxsave(fpu); - - fpu_clear(fpu); -} - #else /* CONFIG_X86_32 */ /* perform fxrstor iff the processor has extended states, otherwise frstor */ @@ -222,6 +182,8 @@ static inline void fpu_fxsave(struct fpu *fpu) : [fx] "=m" (fpu->state->fxsave)); } +#endif /* CONFIG_X86_64 */ + /* We need a safe address that is cheap to find and that is already in L1 during context switch. The best choices are unfortunately different for UP and SMP */ @@ -259,15 +221,13 @@ static inline void fpu_save_init(struct fpu *fpu) is pending. Clear the x87 state here by setting it to fixed values. safe_address is a random variable that should be in L1 */ alternative_input( - GENERIC_NOP8 GENERIC_NOP2, + ASM_NOP8 ASM_NOP2, "emms\n\t" /* clear stack tags */ - "fildl %[addr]", /* set F?P to defined value */ + "fildl %P[addr]", /* set F?P to defined value */ X86_FEATURE_FXSAVE_LEAK, [addr] "m" (safe_address)); } -#endif /* CONFIG_X86_64 */ - static inline void __save_init_fpu(struct task_struct *tsk) { fpu_save_init(&tsk->thread.fpu); -- cgit v1.2.3-18-g5258 From db7829c6cc32f3c0c9a324118d743acb1abff081 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Thu, 9 Sep 2010 18:17:26 +0200 Subject: x86, percpu: Optimize this_cpu_ptr Allow arches to implement __this_cpu_ptr, and provide an x86 version. Before: movq $foo, %rax movq %gs:this_cpu_off, %rdx addq %rdx, %rax After: movq $foo, %rax addq %gs:this_cpu_off, %rax The benefit is doing it in one less instruction and not clobbering a temporary register. tj: * Beefed up the comment a bit and renamed in-macro temp variable to match neighboring macros. * Folded fix for const pointer case found in linux-next. * Fixed sparse notation. 
Signed-off-by: Brian Gerst Signed-off-by: Tejun Heo --- arch/x86/include/asm/percpu.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index cd28f9ad910..f899e01a8ac 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -47,6 +47,20 @@ #ifdef CONFIG_SMP #define __percpu_arg(x) "%%"__stringify(__percpu_seg)":%P" #x #define __my_cpu_offset percpu_read(this_cpu_off) + +/* + * Compared to the generic __my_cpu_offset version, the following + * saves one instruction and avoids clobbering a temp register. + */ +#define __this_cpu_ptr(ptr) \ +({ \ + unsigned long tcp_ptr__; \ + __verify_pcpu_ptr(ptr); \ + asm volatile("add " __percpu_arg(1) ", %0" \ + : "=r" (tcp_ptr__) \ + : "m" (this_cpu_off), "0" (ptr)); \ + (typeof(*(ptr)) __kernel __force *)tcp_ptr__; \ +}) #else #define __percpu_arg(x) "%P" #x #endif -- cgit v1.2.3-18-g5258 From a7f07cfbaa1dd5bf9e615948f280c92e7928e6f7 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Fri, 10 Sep 2010 15:55:49 -0700 Subject: x86, mtrr: Refactor MTRR type overlap check code Move the MTRR type overlap check into a new function. No functional change in this patch. Just making it easier to add multiple region overlap check in the following patch. Signed-off-by: Venkatesh Pallipadi LKML-Reference: <1284159350-19841-2-git-send-email-venki@google.com> Reviewed-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mtrr/generic.c | 44 +++++++++++++++++++++++++------------- 1 file changed, 29 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 7d28d7d0388..14f4f0c0329 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -64,6 +64,33 @@ static inline void k8_check_syscfg_dram_mod_en(void) } } +/* + * Check and return the effective type for MTRR-MTRR type overlap. 
+ * Returns 1 if the effective type is UNCACHEABLE, else returns 0 + */ +static int check_type_overlap(u8 *prev, u8 *curr) +{ + if (*prev == MTRR_TYPE_UNCACHABLE || *curr == MTRR_TYPE_UNCACHABLE) { + *prev = MTRR_TYPE_UNCACHABLE; + *curr = MTRR_TYPE_UNCACHABLE; + return 1; + } + + if ((*prev == MTRR_TYPE_WRBACK && *curr == MTRR_TYPE_WRTHROUGH) || + (*prev == MTRR_TYPE_WRTHROUGH && *curr == MTRR_TYPE_WRBACK)) { + *prev = MTRR_TYPE_WRTHROUGH; + *curr = MTRR_TYPE_WRTHROUGH; + } + + if (*prev != *curr) { + *prev = MTRR_TYPE_UNCACHABLE; + *curr = MTRR_TYPE_UNCACHABLE; + return 1; + } + + return 0; +} + /* * Returns the effective MTRR type for the region * Error returns: @@ -138,21 +165,8 @@ u8 mtrr_type_lookup(u64 start, u64 end) continue; } - if (prev_match == MTRR_TYPE_UNCACHABLE || - curr_match == MTRR_TYPE_UNCACHABLE) { - return MTRR_TYPE_UNCACHABLE; - } - - if ((prev_match == MTRR_TYPE_WRBACK && - curr_match == MTRR_TYPE_WRTHROUGH) || - (prev_match == MTRR_TYPE_WRTHROUGH && - curr_match == MTRR_TYPE_WRBACK)) { - prev_match = MTRR_TYPE_WRTHROUGH; - curr_match = MTRR_TYPE_WRTHROUGH; - } - - if (prev_match != curr_match) - return MTRR_TYPE_UNCACHABLE; + if (check_type_overlap(&prev_match, &curr_match)) + return curr_match; } if (mtrr_tom2) { -- cgit v1.2.3-18-g5258 From 351e5a703ad994405bd900da330823d3b4a372e0 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Fri, 10 Sep 2010 15:55:50 -0700 Subject: x86, mtrr: Support mtrr lookup for range spanning across MTRR range mtrr_type_lookup [start:end] looked up the resultant MTRR type for that range, based on fixed and all variable MTRR ranges. It did check for multiple MTRR var ranges overlapping [start:end] and returned the net type. However, if the [start:end] range spanned across any var MTRR range, mtrr_type_lookup would return an error return of 0xFE. This was based on typical usage of mtrr_type_lookup in PAT mapping, where region being mapped would not normally span across MTRR ranges and also trying to keep the code simple. Mark recently reported the problem with this limitation. When there are two continguous MTRR's of type "writeback" and if there is a memory mapping over a region starting in one MTRR range and ending in another MTRR range, such mapping will fallback to "uncached" due to the above limitation. Change below adds support for such lookups spanning multiple MTRR ranges. We now have a wrapper mtrr_type_lookup that dynamically splits such a region into smaller chunks that fit within one MTRR range and does a __mtrr_type_lookup on it and combine the results later. Reported-by: Mark Langsdorf Signed-off-by: Venkatesh Pallipadi LKML-Reference: <1284159350-19841-3-git-send-email-venki@google.com> Reviewed-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mtrr/generic.c | 84 ++++++++++++++++++++++++++++++++++---- 1 file changed, 77 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 14f4f0c0329..9f27228ceff 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -64,6 +64,18 @@ static inline void k8_check_syscfg_dram_mod_en(void) } } +/* Get the size of contiguous MTRR range */ +static u64 get_mtrr_size(u64 mask) +{ + u64 size; + + mask >>= PAGE_SHIFT; + mask |= size_or_mask; + size = -mask; + size <<= PAGE_SHIFT; + return size; +} + /* * Check and return the effective type for MTRR-MTRR type overlap. 
* Returns 1 if the effective type is UNCACHEABLE, else returns 0 @@ -92,17 +104,19 @@ static int check_type_overlap(u8 *prev, u8 *curr) } /* - * Returns the effective MTRR type for the region - * Error returns: - * - 0xFE - when the range is "not entirely covered" by _any_ var range MTRR - * - 0xFF - when MTRR is not enabled + * Error/Semi-error returns: + * 0xFF - when MTRR is not enabled + * *repeat == 1 implies [start:end] spanned across MTRR range and type returned + * corresponds only to [start:*partial_end]. + * Caller has to lookup again for [*partial_end:end]. */ -u8 mtrr_type_lookup(u64 start, u64 end) +static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat) { int i; u64 base, mask; u8 prev_match, curr_match; + *repeat = 0; if (!mtrr_state_set) return 0xFF; @@ -153,8 +167,34 @@ u8 mtrr_type_lookup(u64 start, u64 end) start_state = ((start & mask) == (base & mask)); end_state = ((end & mask) == (base & mask)); - if (start_state != end_state) - return 0xFE; + + if (start_state != end_state) { + /* + * We have start:end spanning across an MTRR. + * We split the region into + * either + * (start:mtrr_end) (mtrr_end:end) + * or + * (start:mtrr_start) (mtrr_start:end) + * depending on kind of overlap. + * Return the type for first region and a pointer to + * the start of second region so that caller will + * lookup again on the second region. + * Note: This way we handle multiple overlaps as well. + */ + if (start_state) + *partial_end = base + get_mtrr_size(mask); + else + *partial_end = base; + + if (unlikely(*partial_end <= start)) { + WARN_ON(1); + *partial_end = start + PAGE_SIZE; + } + + end = *partial_end - 1; /* end is inclusive */ + *repeat = 1; + } if ((start & mask) != (base & mask)) continue; @@ -180,6 +220,36 @@ u8 mtrr_type_lookup(u64 start, u64 end) return mtrr_state.def_type; } +/* + * Returns the effective MTRR type for the region + * Error return: + * 0xFF - when MTRR is not enabled + */ +u8 mtrr_type_lookup(u64 start, u64 end) +{ + u8 type, prev_type; + int repeat; + u64 partial_end; + + type = __mtrr_type_lookup(start, end, &partial_end, &repeat); + + /* + * Common path is with repeat = 0. + * However, we can have cases where [start:end] spans across some + * MTRR range. Do repeated lookups for that case here. 
+ */ + while (repeat) { + prev_type = type; + start = partial_end; + type = __mtrr_type_lookup(start, end, &partial_end, &repeat); + + if (check_type_overlap(&prev_type, &type)) + return type; + } + + return type; +} + /* Get the MSR pair relating to a var range */ static void get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr) -- cgit v1.2.3-18-g5258 From d0ed0c32662e756e7daf85e70a5a27a9c1111331 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Sat, 11 Sep 2010 22:10:54 -0700 Subject: x86: Remove pr_ uses of KERN_ Signed-off-by: Joe Perches Cc: Jiri Kosina LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/apb_timer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c index 8dd77800ff5..4004417a6b5 100644 --- a/arch/x86/kernel/apb_timer.c +++ b/arch/x86/kernel/apb_timer.c @@ -398,7 +398,7 @@ static int apbt_cpuhp_notify(struct notifier_block *n, } break; default: - pr_debug(KERN_INFO "APBT notified %lu, no action\n", action); + pr_debug("APBT notified %lu, no action\n", action); } return NOTIFY_OK; } @@ -552,7 +552,7 @@ bad_count: pr_debug("APB CS going back %lx:%lx:%lx ", t2, last_read, t2 - last_read); bad_count_x3: - pr_debug(KERN_INFO "tripple check enforced\n"); + pr_debug("triple check enforced\n"); t0 = apbt_readl(phy_cs_timer_id, APBTMR_N_CURRENT_VALUE); udelay(1); -- cgit v1.2.3-18-g5258 From b0b2072df3b544f56b90173c2cde7a374c51546b Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Fri, 10 Sep 2010 13:28:01 +0200 Subject: perf_events: Fix BTS interrupt handling to avoid being dazed by NMI (v2) Fix a bug introduced with commit de725de and the change in the meaning of the return value of intel_pmu_handle_irq(). With the current code, when you are using the BTS, you get 'dazed by NMI' each time the BTS buffer fills up. BTS does interrupt on the PMU vector, thus NMI. You need to take this into account in the return value of the function. This version fixes initial patch which was missing changes to perf_event_intel_ds.c. 
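For illustration, a minimal sketch of the handled-accounting pattern this relies on; the names below are placeholders, the real functions are intel_pmu_drain_bts_buffer() and intel_pmu_handle_irq() in the diff that follows:

	/* Sketch only: an NMI-context PMU handler must report how many
	 * interrupts it handled, otherwise the NMI is treated as unknown
	 * and the "dazed and confused" message is printed. */
	static int drain_bts_buffer_sketch(void)
	{
		return 0;	/* 1 when BTS records were flushed, 0 otherwise */
	}

	static int pending_overflow_sketch(void)
	{
		return 0;	/* placeholder for the PMU status check */
	}

	static int pmu_handle_irq_sketch(void)
	{
		int handled = drain_bts_buffer_sketch();

		if (!pending_overflow_sketch())
			return handled;	/* a BTS-only NMI still counts as handled */

		/* ... process counter overflows, incrementing 'handled' ... */
		return handled;
	}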
Signed-off-by: Stephane Eranian Acked-by: Don Zickus Cc: peterz@infradead.org Cc: paulus@samba.org Cc: davem@davemloft.net Cc: fweisbec@gmail.com Cc: perfmon2-devel@lists.sf.net Cc: eranian@gmail.com Cc: robert.richter@amd.com LKML-Reference: <4c8a1686.aae9d80a.5aa4.5e35@mx.google.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 6 +++--- arch/x86/kernel/cpu/perf_event_intel_ds.c | 11 ++++++----- 2 files changed, 9 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 82395f2378e..c8f5c088cad 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -713,18 +713,18 @@ static int intel_pmu_handle_irq(struct pt_regs *regs) struct cpu_hw_events *cpuc; int bit, loops; u64 status; - int handled = 0; + int handled; perf_sample_data_init(&data, 0); cpuc = &__get_cpu_var(cpu_hw_events); intel_pmu_disable_all(); - intel_pmu_drain_bts_buffer(); + handled = intel_pmu_drain_bts_buffer(); status = intel_pmu_get_status(); if (!status) { intel_pmu_enable_all(0); - return 0; + return handled; } loops = 0; diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 9893a2f77b7..4977f9c400e 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -214,7 +214,7 @@ static void intel_pmu_disable_bts(void) update_debugctlmsr(debugctlmsr); } -static void intel_pmu_drain_bts_buffer(void) +static int intel_pmu_drain_bts_buffer(void) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct debug_store *ds = cpuc->ds; @@ -231,16 +231,16 @@ static void intel_pmu_drain_bts_buffer(void) struct pt_regs regs; if (!event) - return; + return 0; if (!ds) - return; + return 0; at = (struct bts_record *)(unsigned long)ds->bts_buffer_base; top = (struct bts_record *)(unsigned long)ds->bts_index; if (top <= at) - return; + return 0; ds->bts_index = ds->bts_buffer_base; @@ -256,7 +256,7 @@ static void intel_pmu_drain_bts_buffer(void) perf_prepare_sample(&header, &data, event, ®s); if (perf_output_begin(&handle, event, header.size * (top - at), 1, 1)) - return; + return 1; for (; at < top; at++) { data.ip = at->from; @@ -270,6 +270,7 @@ static void intel_pmu_drain_bts_buffer(void) /* There's new data available. 
*/ event->hw.interrupts++; event->pending_kill = POLL_IN; + return 1; } /* -- cgit v1.2.3-18-g5258 From 6376b2297502e72255b7eb2893c6044ad5a7b5f4 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 15 Sep 2010 10:04:28 +0900 Subject: kprobes: Make functions static Make following (internal) functions static to make sparse happier :-) * get_optimized_kprobe: only called from static functions * kretprobe_table_unlock: _lock function is static * kprobes_optinsn_template_holder: never called but holding asm code Signed-off-by: Namhyung Kim Acked-by: Masami Hiramatsu LKML-Reference: <1284512670-2369-4-git-send-email-namhyung@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/kprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 770ebfb349e..05c20a48841 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -1129,7 +1129,7 @@ static void __kprobes synthesize_set_arg1(kprobe_opcode_t *addr, *(unsigned long *)addr = val; } -void __kprobes kprobes_optinsn_template_holder(void) +static void __used __kprobes kprobes_optinsn_template_holder(void) { asm volatile ( ".global optprobe_template_entry\n" -- cgit v1.2.3-18-g5258 From 6abded71d730322df96c5b7f4ab952ffd5a0080d Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 15 Sep 2010 10:04:29 +0900 Subject: kprobes: Remove __dummy_buf Remove __dummy_buf which is needed for kallsyms_lookup only. use kallsysm_lookup_size_offset instead. Signed-off-by: Namhyung Kim Acked-by: Masami Hiramatsu LKML-Reference: <1284512670-2369-5-git-send-email-namhyung@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/kprobes.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 05c20a48841..e05952af5d2 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -230,9 +230,6 @@ static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr) return 0; } -/* Dummy buffers for kallsyms_lookup */ -static char __dummy_buf[KSYM_NAME_LEN]; - /* Check if paddr is at an instruction boundary */ static int __kprobes can_probe(unsigned long paddr) { @@ -241,7 +238,7 @@ static int __kprobes can_probe(unsigned long paddr) struct insn insn; kprobe_opcode_t buf[MAX_INSN_SIZE]; - if (!kallsyms_lookup(paddr, NULL, &offset, NULL, __dummy_buf)) + if (!kallsyms_lookup_size_offset(paddr, NULL, &offset)) return 0; /* Decode instructions */ @@ -1269,11 +1266,9 @@ static int __kprobes can_optimize(unsigned long paddr) unsigned long addr, size = 0, offset = 0; struct insn insn; kprobe_opcode_t buf[MAX_INSN_SIZE]; - /* Dummy buffers for lookup_symbol_attrs */ - static char __dummy_buf[KSYM_NAME_LEN]; /* Lookup symbol including addr */ - if (!kallsyms_lookup(paddr, &size, &offset, NULL, __dummy_buf)) + if (!kallsyms_lookup_size_offset(paddr, &size, &offset)) return 0; /* Check there is enough space for a relative jump. */ -- cgit v1.2.3-18-g5258 From 892df7f81c31ce7f85778aa78094e8d1f19b8413 Mon Sep 17 00:00:00 2001 From: Udo van den Heuvel Date: Tue, 14 Sep 2010 07:15:08 +0200 Subject: x86: HPET force enable for CX700 / VIA Epia LT Allow using HPET with the hpet=force command line option on VIA EPIA CX700 systems. 
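For context, a hedged sketch of the general shape of such a PCI header fixup; the device ID and body are placeholders, the actual change is the one-line DECLARE_PCI_FIXUP_HEADER addition in the diff below, and like the existing VIA quirks it only acts when hpet=force is given:

	#include <linux/pci.h>

	/* Illustrative only: header fixups run during early PCI enumeration,
	 * before any driver claims the device, which is what force-enabling
	 * the HPET requires. */
	static void sketch_force_enable_hpet(struct pci_dev *dev)
	{
		/* placeholder: read the chipset's HPET base/enable register and
		 * switch the timer on if the BIOS left it disabled */
	}
	DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, 0x1234 /* placeholder ID */,
				 sketch_force_enable_hpet);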
Signed-off-by: Udo van den Heuvel Cc: Robert Hancock LKML-Reference: <4C8F04DC.5060303@xs4all.nl> Signed-off-by: Thomas Gleixner --- arch/x86/kernel/quirks.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 939b9e98245..8bbe8c56916 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -344,6 +344,8 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235, vt8237_force_enable_hpet); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8237, vt8237_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_CX700, + vt8237_force_enable_hpet); static void ati_force_hpet_resume(void) { -- cgit v1.2.3-18-g5258 From 75e3cfbed6f71a8f151dc6e413b6ce3c390030cb Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Fri, 27 Aug 2010 11:09:48 -0700 Subject: x86, intr-remap: Set redirection hint in the IRTE Currently the redirection hint in the interrupt-remapping table entry is set to 0, which means the remapped interrupt is directed to the processors listed in the destination. So in logical flat mode in the presence of intr-remapping, this results in a single interrupt multi-casted to multiple cpu's as specified by the destination bit mask. But what we really want is to send that interrupt to one of the cpus based on the lowest priority delivery mode. Set the redirection hint in the IRTE to '1' to indicate that we want the remapped interrupt to be directed to only one of the processors listed in the destination. This fixes the issue of same interrupt getting delivered to multiple cpu's in the logical flat mode in the presence of interrupt-remapping. While there is no functional issue observed with this behavior, this will impact performance of such configurations (<=8 cpu's using logical flat mode in the presence of interrupt-remapping) Signed-off-by: Suresh Siddha LKML-Reference: <20100827181049.013051492@sbsiddha-MOBL3.sc.intel.com> Cc: Weidong Han Cc: # [v2.6.32+] Signed-off-by: H. Peter Anvin --- arch/x86/kernel/apic/io_apic.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index f1efebaf551..90f8a75f548 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1392,6 +1392,7 @@ int setup_ioapic_entry(int apic_id, int irq, irte.dlvry_mode = apic->irq_delivery_mode; irte.vector = vector; irte.dest_id = IRTE_DEST(destination); + irte.redir_hint = 1; /* Set source-id of interrupt request */ set_ioapic_sid(&irte, apic_id); @@ -3343,6 +3344,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, irte.dlvry_mode = apic->irq_delivery_mode; irte.vector = cfg->vector; irte.dest_id = IRTE_DEST(dest); + irte.redir_hint = 1; /* Set source-id of interrupt request */ if (pdev) -- cgit v1.2.3-18-g5258 From 62a92f4c69cd1d9361ad8c16be1dd16e6821bc15 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Fri, 27 Aug 2010 11:09:49 -0700 Subject: x86, intr-remap: Remove IRTE setup duplicate code Remove IRTE setup duplicate code with prepare_irte(). Signed-off-by: Suresh Siddha LKML-Reference: <20100827181049.095067319@sbsiddha-MOBL3.sc.intel.com> Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/irq_remapping.h | 27 +++++++++++++++++++++++++++ arch/x86/kernel/apic/io_apic.c | 27 ++------------------------- 2 files changed, 29 insertions(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h index f275e224450..8d841505344 100644 --- a/arch/x86/include/asm/irq_remapping.h +++ b/arch/x86/include/asm/irq_remapping.h @@ -3,4 +3,31 @@ #define IRTE_DEST(dest) ((x2apic_mode) ? dest : dest << 8) +#ifdef CONFIG_INTR_REMAP +static inline void prepare_irte(struct irte *irte, int vector, + unsigned int dest) +{ + memset(irte, 0, sizeof(*irte)); + + irte->present = 1; + irte->dst_mode = apic->irq_dest_mode; + /* + * Trigger mode in the IRTE will always be edge, and for IO-APIC, the + * actual level or edge trigger will be setup in the IO-APIC + * RTE. This will help simplify level triggered irq migration. + * For more details, see the comments (in io_apic.c) explainig IO-APIC + * irq migration in the presence of interrupt-remapping. + */ + irte->trigger_mode = 0; + irte->dlvry_mode = apic->irq_delivery_mode; + irte->vector = vector; + irte->dest_id = IRTE_DEST(dest); + irte->redir_hint = 1; +} +#else +static void prepare_irte(struct irte *irte, int vector, unsigned int dest) +{ +} +#endif + #endif /* _ASM_X86_IRQ_REMAPPING_H */ diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 90f8a75f548..e8c95a22614 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1377,22 +1377,7 @@ int setup_ioapic_entry(int apic_id, int irq, if (index < 0) panic("Failed to allocate IRTE for ioapic %d\n", apic_id); - memset(&irte, 0, sizeof(irte)); - - irte.present = 1; - irte.dst_mode = apic->irq_dest_mode; - /* - * Trigger mode in the IRTE will always be edge, and the - * actual level or edge trigger will be setup in the IO-APIC - * RTE. This will help simplify level triggered irq migration. - * For more details, see the comments above explainig IO-APIC - * irq migration in the presence of interrupt-remapping. - */ - irte.trigger_mode = 0; - irte.dlvry_mode = apic->irq_delivery_mode; - irte.vector = vector; - irte.dest_id = IRTE_DEST(destination); - irte.redir_hint = 1; + prepare_irte(&irte, vector, destination); /* Set source-id of interrupt request */ set_ioapic_sid(&irte, apic_id); @@ -3336,15 +3321,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, ir_index = map_irq_to_irte_handle(irq, &sub_handle); BUG_ON(ir_index == -1); - memset (&irte, 0, sizeof(irte)); - - irte.present = 1; - irte.dst_mode = apic->irq_dest_mode; - irte.trigger_mode = 0; /* edge */ - irte.dlvry_mode = apic->irq_delivery_mode; - irte.vector = cfg->vector; - irte.dest_id = IRTE_DEST(dest); - irte.redir_hint = 1; + prepare_irte(&irte, cfg->vector, dest); /* Set source-id of interrupt request */ if (pdev) -- cgit v1.2.3-18-g5258 From fa47f7e52874683a9659df2f1f143105f676dc0f Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Fri, 27 Aug 2010 11:09:50 -0700 Subject: x86, x2apic: Simplify apic init in SMP and UP builds Move enable_IR_x2apic() inside the default_setup_apic_routing(), and for SMP platforms, move the default_setup_apic_routing() after smp_sanity_check(). This cleans up the code that tries to avoid multiple calls to default_setup_apic_routing() when smp_sanity_check() fails (which goes through the APIC_init_uniprocessor() path). 
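In rough outline, the resulting SMP boot flow looks like the sketch below (control flow only, using the function names from the diff; everything else elided):

	/* Simplified flow sketch of native_smp_prepare_cpus() after this patch. */
	static void smp_prepare_cpus_sketch(unsigned int max_cpus)
	{
		set_cpu_sibling_map(0);

		if (smp_sanity_check(max_cpus) < 0) {
			/* non-SMP fallback: APIC_init_uniprocessor() runs and
			 * reaches default_setup_apic_routing() on its own */
			return;
		}

		/* only after the sanity check, so routing setup (and the
		 * enable_IR_x2apic() call now inside it) is done exactly once */
		default_setup_apic_routing();
	}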
Signed-off-by: Suresh Siddha LKML-Reference: <20100827181049.173087246@sbsiddha-MOBL3.sc.intel.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/apic/apic.c | 3 --- arch/x86/kernel/apic/probe_64.c | 3 +++ arch/x86/kernel/smpboot.c | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index e3b534cda49..8cf86fb3b4e 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1665,10 +1665,7 @@ int __init APIC_init_uniprocessor(void) } #endif -#ifndef CONFIG_SMP - enable_IR_x2apic(); default_setup_apic_routing(); -#endif verify_local_APIC(); connect_bsp_APIC(); diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c index 83e9be4778e..f9e4e6a5407 100644 --- a/arch/x86/kernel/apic/probe_64.c +++ b/arch/x86/kernel/apic/probe_64.c @@ -54,6 +54,9 @@ static int apicid_phys_pkg_id(int initial_apic_id, int index_msb) */ void __init default_setup_apic_routing(void) { + + enable_IR_x2apic(); + #ifdef CONFIG_X86_X2APIC if (x2apic_mode #ifdef CONFIG_X86_UV diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 8b3bfc4dd70..87a8c6b00f8 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1109,8 +1109,6 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) } set_cpu_sibling_map(0); - enable_IR_x2apic(); - default_setup_apic_routing(); if (smp_sanity_check(max_cpus) < 0) { printk(KERN_INFO "SMP disabled\n"); @@ -1118,6 +1116,8 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) goto out; } + default_setup_apic_routing(); + preempt_disable(); if (read_apic_id() != boot_cpu_physical_apicid) { panic("Boot APIC ID in local APIC unexpected (%d vs %d)", -- cgit v1.2.3-18-g5258 From 3ee48b6af49cf534ca2f481ecc484b156a41451d Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Thu, 16 Sep 2010 11:44:02 -0500 Subject: mm, x86: Saving vmcore with non-lazy freeing of vmas During the reading of /proc/vmcore the kernel is doing ioremap()/iounmap() repeatedly. And the buildup of un-flushed vm_area_struct's is causing a great deal of overhead. (rb_next() is chewing up most of that time). This solution is to provide function set_iounmap_nonlazy(). It causes a subsequent call to iounmap() to immediately purge the vma area (with try_purge_vmap_area_lazy()). With this patch we have seen the time for writing a 250MB compressed dump drop from 71 seconds to 44 seconds. 
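A minimal caller-side sketch, modelled on copy_oldmem_page() with error handling simplified; set_iounmap_nonlazy() is the new helper declared in the diff below:

	#include <linux/io.h>
	#include <linux/string.h>

	static ssize_t copy_one_oldmem_page(unsigned long pfn, char *buf,
					    size_t csize, unsigned long offset)
	{
		void *vaddr;

		vaddr = ioremap_cache(pfn << PAGE_SHIFT, PAGE_SIZE);
		if (!vaddr)
			return -ENOMEM;

		memcpy(buf, vaddr + offset, csize);

		/* reading /proc/vmcore maps and unmaps a page at a time; ask for
		 * an immediate purge so unflushed vmap areas do not pile up */
		set_iounmap_nonlazy();
		iounmap(vaddr);

		return csize;
	}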
Signed-off-by: Cliff Wickman Cc: Andrew Morton Cc: kexec@lists.infradead.org Cc: LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/include/asm/io.h | 1 + arch/x86/kernel/crash_dump_64.c | 1 + 2 files changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 30a3e977612..6a45ec41ec2 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -206,6 +206,7 @@ static inline void __iomem *ioremap(resource_size_t offset, unsigned long size) extern void iounmap(volatile void __iomem *addr); +extern void set_iounmap_nonlazy(void); #ifdef __KERNEL__ diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c index bf43188ca65..994828899e0 100644 --- a/arch/x86/kernel/crash_dump_64.c +++ b/arch/x86/kernel/crash_dump_64.c @@ -46,6 +46,7 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, } else memcpy(buf, vaddr + offset, csize); + set_iounmap_nonlazy(); iounmap(vaddr); return csize; } -- cgit v1.2.3-18-g5258 From 3518dd14ca888085797ca8d3a9e11c8ef9e7ae68 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Fri, 17 Sep 2010 18:07:45 +0200 Subject: x86, cacheinfo: Fix dependency of AMD L3 CID L3 cache index disable code uses PCI accesses to AMD northbridge functions. Currently the code is #ifdef CONFIG_CPU_SUP_AMD. But it should be #if (defined(CONFIG_CPU_SUP_AMD) && defined(CONFIG_PCI)) which in the end is a dependency to K8_NB. Signed-off-by: Andreas Herrmann LKML-Reference: <20100917160744.GF4958@loge.amd.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/intel_cacheinfo.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 898c2f4eab8..2521cdcb877 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -306,7 +306,7 @@ struct _cache_attr { ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count); }; -#ifdef CONFIG_CPU_SUP_AMD +#ifdef CONFIG_K8_NB /* * L3 cache descriptors @@ -556,12 +556,12 @@ static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644, static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, show_cache_disable_1, store_cache_disable_1); -#else /* CONFIG_CPU_SUP_AMD */ +#else /* CONFIG_K8_NB */ static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, int index) { }; -#endif /* CONFIG_CPU_SUP_AMD */ +#endif /* CONFIG_K8_NB */ static int __cpuinit cpuid4_cache_lookup_regs(int index, @@ -1000,7 +1000,7 @@ static struct attribute *default_attrs[] = { static struct attribute *default_l3_attrs[] = { DEFAULT_SYSFS_CACHE_ATTRS, -#ifdef CONFIG_CPU_SUP_AMD +#ifdef CONFIG_K8_NB &cache_disable_0.attr, &cache_disable_1.attr, #endif -- cgit v1.2.3-18-g5258 From 900f9ac9f12dc3dd6fc8e33e16df172eafcaead6 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Fri, 17 Sep 2010 18:02:54 +0200 Subject: x86, k8-gart: Decouple handling of garts and northbridges So far we only provide num_k8_northbridges. This is required in different areas (e.g. L3 cache index disable, GART). But not all AMD CPUs provide a GART. Thus it is useful to split off the GART handling from the generic caching of AMD northbridge misc devices. Signed-off-by: Andreas Herrmann LKML-Reference: <20100917160254.GC4958@loge.amd.com> Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/k8.h | 13 +++++---- arch/x86/kernel/cpu/intel_cacheinfo.c | 4 +-- arch/x86/kernel/k8.c | 52 ++++++++++++++++++++--------------- arch/x86/kernel/pci-gart_64.c | 27 ++++++++++++------ 4 files changed, 58 insertions(+), 38 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/k8.h b/arch/x86/include/asm/k8.h index af00bd1d208..9cee145dcac 100644 --- a/arch/x86/include/asm/k8.h +++ b/arch/x86/include/asm/k8.h @@ -7,24 +7,27 @@ extern struct pci_device_id k8_nb_ids[]; struct bootnode; extern int early_is_k8_nb(u32 value); -extern struct pci_dev **k8_northbridges; -extern int num_k8_northbridges; extern int cache_k8_northbridges(void); extern void k8_flush_garts(void); extern int k8_get_nodes(struct bootnode *nodes); extern int k8_numa_init(unsigned long start_pfn, unsigned long end_pfn); extern int k8_scan_nodes(void); +struct k8_northbridge_info { + u16 num; + u8 gart_supported; + struct pci_dev **nb_misc; +}; +extern struct k8_northbridge_info k8_northbridges; + #ifdef CONFIG_K8_NB -extern int num_k8_northbridges; static inline struct pci_dev *node_to_k8_nb_misc(int node) { - return (node < num_k8_northbridges) ? k8_northbridges[node] : NULL; + return (node < k8_northbridges.num) ? k8_northbridges.nb_misc[node] : NULL; } #else -#define num_k8_northbridges 0 static inline struct pci_dev *node_to_k8_nb_misc(int node) { diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 2521cdcb877..6fdfb0b20f8 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -369,7 +369,7 @@ static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, return; /* not in virtualized environments */ - if (num_k8_northbridges == 0) + if (k8_northbridges.num == 0) return; /* @@ -377,7 +377,7 @@ static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, * never freed but this is done only on shutdown so it doesn't matter. */ if (!l3_caches) { - int size = num_k8_northbridges * sizeof(struct amd_l3_cache *); + int size = k8_northbridges.num * sizeof(struct amd_l3_cache *); l3_caches = kzalloc(size, GFP_ATOMIC); if (!l3_caches) diff --git a/arch/x86/kernel/k8.c b/arch/x86/kernel/k8.c index 0f7bc20cfcd..5de1b6b3963 100644 --- a/arch/x86/kernel/k8.c +++ b/arch/x86/kernel/k8.c @@ -10,9 +10,6 @@ #include #include -int num_k8_northbridges; -EXPORT_SYMBOL(num_k8_northbridges); - static u32 *flush_words; struct pci_device_id k8_nb_ids[] = { @@ -22,7 +19,7 @@ struct pci_device_id k8_nb_ids[] = { }; EXPORT_SYMBOL(k8_nb_ids); -struct pci_dev **k8_northbridges; +struct k8_northbridge_info k8_northbridges; EXPORT_SYMBOL(k8_northbridges); static struct pci_dev *next_k8_northbridge(struct pci_dev *dev) @@ -40,36 +37,44 @@ int cache_k8_northbridges(void) int i; struct pci_dev *dev; - if (num_k8_northbridges) + if (k8_northbridges.num) return 0; dev = NULL; while ((dev = next_k8_northbridge(dev)) != NULL) - num_k8_northbridges++; + k8_northbridges.num++; + + /* some CPU families (e.g. 
family 0x11) do not support GART */ + if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10) + k8_northbridges.gart_supported = 1; - k8_northbridges = kmalloc((num_k8_northbridges + 1) * sizeof(void *), - GFP_KERNEL); - if (!k8_northbridges) + k8_northbridges.nb_misc = kmalloc((k8_northbridges.num + 1) * + sizeof(void *), GFP_KERNEL); + if (!k8_northbridges.nb_misc) return -ENOMEM; - if (!num_k8_northbridges) { - k8_northbridges[0] = NULL; + if (!k8_northbridges.num) { + k8_northbridges.nb_misc[0] = NULL; return 0; } - flush_words = kmalloc(num_k8_northbridges * sizeof(u32), GFP_KERNEL); - if (!flush_words) { - kfree(k8_northbridges); - return -ENOMEM; + if (k8_northbridges.gart_supported) { + flush_words = kmalloc(k8_northbridges.num * sizeof(u32), + GFP_KERNEL); + if (!flush_words) { + kfree(k8_northbridges.nb_misc); + return -ENOMEM; + } } dev = NULL; i = 0; while ((dev = next_k8_northbridge(dev)) != NULL) { - k8_northbridges[i] = dev; - pci_read_config_dword(dev, 0x9c, &flush_words[i++]); + k8_northbridges.nb_misc[i] = dev; + if (k8_northbridges.gart_supported) + pci_read_config_dword(dev, 0x9c, &flush_words[i++]); } - k8_northbridges[i] = NULL; + k8_northbridges.nb_misc[i] = NULL; return 0; } EXPORT_SYMBOL_GPL(cache_k8_northbridges); @@ -93,22 +98,25 @@ void k8_flush_garts(void) unsigned long flags; static DEFINE_SPINLOCK(gart_lock); + if (!k8_northbridges.gart_supported) + return; + /* Avoid races between AGP and IOMMU. In theory it's not needed but I'm not sure if the hardware won't lose flush requests when another is pending. This whole thing is so expensive anyways that it doesn't matter to serialize more. -AK */ spin_lock_irqsave(&gart_lock, flags); flushed = 0; - for (i = 0; i < num_k8_northbridges; i++) { - pci_write_config_dword(k8_northbridges[i], 0x9c, + for (i = 0; i < k8_northbridges.num; i++) { + pci_write_config_dword(k8_northbridges.nb_misc[i], 0x9c, flush_words[i]|1); flushed++; } - for (i = 0; i < num_k8_northbridges; i++) { + for (i = 0; i < k8_northbridges.num; i++) { u32 w; /* Make sure the hardware actually executed the flush*/ for (;;) { - pci_read_config_dword(k8_northbridges[i], + pci_read_config_dword(k8_northbridges.nb_misc[i], 0x9c, &w); if (!(w & 1)) break; diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 0f7f130caa6..8f214a2643f 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -560,8 +560,11 @@ static void enable_gart_translations(void) { int i; - for (i = 0; i < num_k8_northbridges; i++) { - struct pci_dev *dev = k8_northbridges[i]; + if (!k8_northbridges.gart_supported) + return; + + for (i = 0; i < k8_northbridges.num; i++) { + struct pci_dev *dev = k8_northbridges.nb_misc[i]; enable_gart_translation(dev, __pa(agp_gatt_table)); } @@ -592,10 +595,13 @@ static void gart_fixup_northbridges(struct sys_device *dev) if (!fix_up_north_bridges) return; + if (!k8_northbridges.gart_supported) + return; + pr_info("PCI-DMA: Restoring GART aperture settings\n"); - for (i = 0; i < num_k8_northbridges; i++) { - struct pci_dev *dev = k8_northbridges[i]; + for (i = 0; i < k8_northbridges.num; i++) { + struct pci_dev *dev = k8_northbridges.nb_misc[i]; /* * Don't enable translations just yet. 
That is the next @@ -649,8 +655,8 @@ static __init int init_k8_gatt(struct agp_kern_info *info) aper_size = aper_base = info->aper_size = 0; dev = NULL; - for (i = 0; i < num_k8_northbridges; i++) { - dev = k8_northbridges[i]; + for (i = 0; i < k8_northbridges.num; i++) { + dev = k8_northbridges.nb_misc[i]; new_aper_base = read_aperture(dev, &new_aper_size); if (!new_aper_base) goto nommu; @@ -718,10 +724,13 @@ static void gart_iommu_shutdown(void) if (!no_agp) return; - for (i = 0; i < num_k8_northbridges; i++) { + if (!k8_northbridges.gart_supported) + return; + + for (i = 0; i < k8_northbridges.num; i++) { u32 ctl; - dev = k8_northbridges[i]; + dev = k8_northbridges.nb_misc[i]; pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl); ctl &= ~GARTEN; @@ -739,7 +748,7 @@ int __init gart_iommu_init(void) unsigned long scratch; long i; - if (num_k8_northbridges == 0) + if (!k8_northbridges.gart_supported) return 0; #ifndef CONFIG_AGP_AMD64 -- cgit v1.2.3-18-g5258 From bc83cccc761953f878088cdfa682de0970b5561f Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 17 Sep 2010 15:36:40 -0700 Subject: x86, mwait: Move mwait constants to a common header file We have MWAIT constants spread across three different .c files, for no good reason. Move them all into a common header file. Signed-off-by: H. Peter Anvin Reviewed-by: Arjan van de Ven Cc: Len Brown LKML-Reference: --- arch/x86/include/asm/mwait.h | 15 +++++++++++++++ arch/x86/kernel/acpi/cstate.c | 11 +---------- 2 files changed, 16 insertions(+), 10 deletions(-) create mode 100644 arch/x86/include/asm/mwait.h (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h new file mode 100644 index 00000000000..bcdff997668 --- /dev/null +++ b/arch/x86/include/asm/mwait.h @@ -0,0 +1,15 @@ +#ifndef _ASM_X86_MWAIT_H +#define _ASM_X86_MWAIT_H + +#define MWAIT_SUBSTATE_MASK 0xf +#define MWAIT_CSTATE_MASK 0xf +#define MWAIT_SUBSTATE_SIZE 4 +#define MWAIT_MAX_NUM_CSTATES 8 + +#define CPUID_MWAIT_LEAF 5 +#define CPUID5_ECX_EXTENSIONS_SUPPORTED 0x1 +#define CPUID5_ECX_INTERRUPT_BREAK 0x2 + +#define MWAIT_ECX_INTERRUPT_BREAK 0x1 + +#endif /* _ASM_X86_MWAIT_H */ diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index fb7a5f052e2..bcc4adda7b9 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -13,6 +13,7 @@ #include #include +#include /* * Initialize bm_flags based on the CPU cache properties @@ -65,16 +66,6 @@ static struct cstate_entry *cpu_cstate_entry; /* per CPU ptr */ static short mwait_supported[ACPI_PROCESSOR_MAX_POWER]; -#define MWAIT_SUBSTATE_MASK (0xf) -#define MWAIT_CSTATE_MASK (0xf) -#define MWAIT_SUBSTATE_SIZE (4) - -#define CPUID_MWAIT_LEAF (5) -#define CPUID5_ECX_EXTENSIONS_SUPPORTED (0x1) -#define CPUID5_ECX_INTERRUPT_BREAK (0x2) - -#define MWAIT_ECX_INTERRUPT_BREAK (0x1) - #define NATIVE_CSTATE_BEYOND_HALT (2) static long acpi_processor_ffh_cstate_probe_cpu(void *_cx) -- cgit v1.2.3-18-g5258 From ea53069231f9317062910d6e772cca4ce93de8c8 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 17 Sep 2010 15:39:11 -0700 Subject: x86, hotplug: Use mwait to offline a processor, fix the legacy case The code in native_play_dead() has a number of problems: 1. We should use MWAIT when available, to put ourselves into a deeper sleep state. 2. We use the existence of CLFLUSH to determine if WBINVD is safe, but that is totally bogus -- WBINVD is 486+, whereas CLFLUSH is a much later addition. 3. 
We should do WBINVD inside the loop, just in case of something like setting an A bit on page tables. Pointed out by Arjan van de Ven. This code is based in part of a previous patch by Venki Pallipadi, but unlike that patch this one keeps all the detection code local instead of pre-caching a bunch of information. We're shutting down the CPU; there is absolutely no hurry. This patch moves all the code to C and deletes the global wbinvd_halt() which is broken anyway. Originally-by: Venkatesh Pallipadi Signed-off-by: H. Peter Anvin Reviewed-by: Arjan van de Ven Cc: Len Brown Cc: Venkatesh Pallipadi Cc: Peter Zijlstra LKML-Reference: <20090522232230.162239000@intel.com> --- arch/x86/include/asm/processor.h | 23 --------------- arch/x86/kernel/smpboot.c | 63 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 62 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 325b7bdbeba..f358241e412 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -764,29 +764,6 @@ extern unsigned long idle_halt; extern unsigned long idle_nomwait; extern bool c1e_detected; -/* - * on systems with caches, caches must be flashed as the absolute - * last instruction before going into a suspended halt. Otherwise, - * dirty data can linger in the cache and become stale on resume, - * leading to strange errors. - * - * perform a variety of operations to guarantee that the compiler - * will not reorder instructions. wbinvd itself is serializing - * so the processor will not reorder. - * - * Systems without cache can just go into halt. - */ -static inline void wbinvd_halt(void) -{ - mb(); - /* check for clflush to determine if wbinvd is legal */ - if (cpu_has_clflush) - asm volatile("cli; wbinvd; 1: hlt; jmp 1b" : : : "memory"); - else - while (1) - halt(); -} - extern void enable_sep_cpu(void); extern int sysenter_setup(void); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 8b3bfc4dd70..07bf4233441 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -62,6 +62,7 @@ #include #include #include +#include #include #include #include @@ -1383,11 +1384,71 @@ void play_dead_common(void) local_irq_disable(); } +/* + * We need to flush the caches before going to sleep, lest we have + * dirty data in our caches when we come back up. + */ +static inline void mwait_play_dead(void) +{ + unsigned int eax, ebx, ecx, edx; + unsigned int highest_cstate = 0; + unsigned int highest_subcstate = 0; + int i; + + if (!cpu_has(¤t_cpu_data, X86_FEATURE_MWAIT)) + return; + if (current_cpu_data.cpuid_level < CPUID_MWAIT_LEAF) + return; + + eax = CPUID_MWAIT_LEAF; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + + /* + * eax will be 0 if EDX enumeration is not valid. + * Initialized below to cstate, sub_cstate value when EDX is valid. 
+ */ + if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED)) { + eax = 0; + } else { + edx >>= MWAIT_SUBSTATE_SIZE; + for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) { + if (edx & MWAIT_SUBSTATE_MASK) { + highest_cstate = i; + highest_subcstate = edx & MWAIT_SUBSTATE_MASK; + } + } + eax = (highest_cstate << MWAIT_SUBSTATE_SIZE) | + (highest_subcstate - 1); + } + + while (1) { + mb(); + wbinvd(); + __monitor(¤t_thread_info()->flags, 0, 0); + mb(); + __mwait(eax, 0); + } +} + +static inline void hlt_play_dead(void) +{ + while (1) { + mb(); + if (current_cpu_data.x86 >= 4) + wbinvd(); + mb(); + native_halt(); + } +} + void native_play_dead(void) { play_dead_common(); tboot_shutdown(TB_SHUTDOWN_WFS); - wbinvd_halt(); + + mwait_play_dead(); /* Only returns on failure */ + hlt_play_dead(); } #else /* ... !CONFIG_HOTPLUG_CPU */ -- cgit v1.2.3-18-g5258 From a68e5c94f7d3dd64fef34dd5d97e365cae4bb42a Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 17 Sep 2010 17:06:46 -0700 Subject: x86, hotplug: Move WBINVD back outside the play_dead loop On processors with hyperthreading, when only one thread is offlined the other thread can cause a spurious wakeup on the idled thread. We do not want to re-WBINVD when that happens. Ideally, we should simply skip WBINVD unless we're the last thread on a particular core to shut down, but there might be similar issues elsewhere in the system. Thus, revert to previous behavior of only WBINVD outside the loop. Partly as a result, remove the mb()'s around it: they are not necessary since wbinvd() is a serializing instruction, but they were intended to make sure the compiler didn't do any funny loop optimizations. Reported-by: Asit Mallick Signed-off-by: H. Peter Anvin Cc: Arjan van de Ven Cc: Len Brown Cc: Venkatesh Pallipadi Cc: Peter Zijlstra LKML-Reference: --- arch/x86/kernel/smpboot.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 07bf4233441..55c80ffb871 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1422,9 +1422,9 @@ static inline void mwait_play_dead(void) (highest_subcstate - 1); } + wbinvd(); + while (1) { - mb(); - wbinvd(); __monitor(¤t_thread_info()->flags, 0, 0); mb(); __mwait(eax, 0); @@ -1433,11 +1433,10 @@ static inline void mwait_play_dead(void) static inline void hlt_play_dead(void) { + if (current_cpu_data.x86 >= 4) + wbinvd(); + while (1) { - mb(); - if (current_cpu_data.x86 >= 4) - wbinvd(); - mb(); native_halt(); } } -- cgit v1.2.3-18-g5258 From 995bd3bb5c78f3ff71339803c0b8337ed36d64fb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 15 Sep 2010 15:11:57 +0200 Subject: x86: Hpet: Avoid the comparator readback penalty Due to the overly intelligent design of HPETs, we need to workaround the problem that the compare value which we write is already behind the actual counter value at the point where the value hits the real compare register. This happens for two reasons: 1) We read out the counter, add the delta and write the result to the compare register. When a NMI or SMI hits between the read out and the write then the counter can be ahead of the event already 2) The write to the compare register is delayed by up to two HPET cycles in certain chipsets. We worked around this by reading back the compare register to make sure that the written value has hit the hardware. 
For certain ICH9+ chipsets this can require two readouts, as the first one can return the previous compare register value. That's bad performance wise for the normal case where the event is far enough in the future. As we already know that the write can be delayed by up to two cycles we can avoid the read back of the compare register completely if we make the decision whether the delta has elapsed already or not based on the following calculation: cmp = event - actual_count; If cmp is less than 8 HPET clock cycles, then we decide that the event has happened already and return -ETIME. That covers the above #1 and #2 problems which would cause a wait for HPET wraparound (~306 seconds). Signed-off-by: Thomas Gleixner Tested-by: Nix Tested-by: Artur Skawina Cc: Damien Wyart Tested-by: John Drescher Cc: Venkatesh Pallipadi Cc: Arjan van de Ven Cc: Andreas Herrmann Tested-by: Borislav Petkov Cc: Suresh Siddha LKML-Reference: --- arch/x86/kernel/hpet.c | 51 +++++++++++++++++++++----------------------------- 1 file changed, 21 insertions(+), 30 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 410fdb3f193..0b568b30a4d 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -380,44 +380,35 @@ static int hpet_next_event(unsigned long delta, struct clock_event_device *evt, int timer) { u32 cnt; + s32 res; cnt = hpet_readl(HPET_COUNTER); cnt += (u32) delta; hpet_writel(cnt, HPET_Tn_CMP(timer)); /* - * We need to read back the CMP register on certain HPET - * implementations (ATI chipsets) which seem to delay the - * transfer of the compare register into the internal compare - * logic. With small deltas this might actually be too late as - * the counter could already be higher than the compare value - * at that point and we would wait for the next hpet interrupt - * forever. We found out that reading the CMP register back - * forces the transfer so we can rely on the comparison with - * the counter register below. If the read back from the - * compare register does not match the value we programmed - * then we might have a real hardware problem. We can not do - * much about it here, but at least alert the user/admin with - * a prominent warning. - * - * An erratum on some chipsets (ICH9,..), results in - * comparator read immediately following a write returning old - * value. Workaround for this is to read this value second - * time, when first read returns old value. - * - * In fact the write to the comparator register is delayed up - * to two HPET cycles so the workaround we tried to restrict - * the readback to those known to be borked ATI chipsets - * failed miserably. So we give up on optimizations forever - * and penalize all HPET incarnations unconditionally. + * HPETs are a complete disaster. The compare register is + * based on a equal comparison and neither provides a less + * than or equal functionality (which would require to take + * the wraparound into account) nor a simple count down event + * mode. Further the write to the comparator register is + * delayed internally up to two HPET clock cycles in certain + * chipsets (ATI, ICH9,10). We worked around that by reading + * back the compare register, but that required another + * workaround for ICH9,10 chips where the first readout after + * write can return the old stale value. We already have a + * minimum delta of 5us enforced, but a NMI or SMI hitting + * between the counter readout and the comparator write can + * move us behind that point easily. 
Now instead of reading + * the compare register back several times, we make the ETIME + * decision based on the following: Return ETIME if the + * counter value after the write is less than 8 HPET cycles + * away from the event or if the counter is already ahead of + * the event. */ - if (unlikely((u32)hpet_readl(HPET_Tn_CMP(timer)) != cnt)) { - if (hpet_readl(HPET_Tn_CMP(timer)) != cnt) - printk_once(KERN_WARNING - "hpet: compare register read back failed.\n"); - } + res = (s32)(cnt - hpet_readl(HPET_COUNTER)); - return (s32)(hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0; + return res < 8 ? -ETIME : 0; } static void hpet_legacy_set_mode(enum clock_event_mode mode, -- cgit v1.2.3-18-g5258 From 838a2e55e6a4e9e8a10451ed2ef0f7a08dabdb04 Mon Sep 17 00:00:00 2001 From: Arnaud Lacombe Date: Sat, 4 Sep 2010 17:10:20 -0400 Subject: kbuild: migrate all arch to the kconfig mainmenu upgrade Signed-off-by: Arnaud Lacombe Reviewed-by: Sam Ravnborg Reviewed-by: Michal Marek --- arch/x86/Kconfig | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index dcb0593b4a6..6c30b9e93d8 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1,6 +1,3 @@ -# x86 configuration -mainmenu "Linux Kernel Configuration for x86" - # Select 32 or 64 bit config 64BIT bool "64-bit kernel" if ARCH = "x86" -- cgit v1.2.3-18-g5258 From 23ac4ae827e6264e21b898f2cd3f601450aa02a6 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Fri, 17 Sep 2010 18:03:43 +0200 Subject: x86, k8: Rename k8.[ch] to amd_nb.[ch] and CONFIG_K8_NB to CONFIG_AMD_NB The file names are somehow misleading as the code is not specific to AMD K8 CPUs anymore. The files accomodate code for other AMD CPU northbridges as well. Same is true for the config option which is valid for AMD CPU northbridges in general and not specific to K8. Signed-off-by: Andreas Herrmann LKML-Reference: <20100917160343.GD4958@loge.amd.com> Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 4 +- arch/x86/include/asm/amd_nb.h | 39 +++++++++ arch/x86/include/asm/k8.h | 39 --------- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/amd_nb.c | 145 ++++++++++++++++++++++++++++++++++ arch/x86/kernel/aperture_64.c | 2 +- arch/x86/kernel/cpu/intel_cacheinfo.c | 10 +-- arch/x86/kernel/k8.c | 145 ---------------------------------- arch/x86/kernel/pci-gart_64.c | 2 +- arch/x86/kernel/setup.c | 2 +- arch/x86/mm/k8topology_64.c | 2 +- arch/x86/mm/numa_64.c | 2 +- 12 files changed, 197 insertions(+), 197 deletions(-) create mode 100644 arch/x86/include/asm/amd_nb.h delete mode 100644 arch/x86/include/asm/k8.h create mode 100644 arch/x86/kernel/amd_nb.c delete mode 100644 arch/x86/kernel/k8.c (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cea0cd9a316..7fd41f0d754 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -670,7 +670,7 @@ config GART_IOMMU bool "GART IOMMU support" if EMBEDDED default y select SWIOTLB - depends on X86_64 && PCI && K8_NB + depends on X86_64 && PCI && AMD_NB ---help--- Support for full DMA access of devices with 32bit memory access only on systems with more than 3GB. 
This is usually needed for USB, @@ -2076,7 +2076,7 @@ config OLPC_OPENFIRMWARE endif # X86_32 -config K8_NB +config AMD_NB def_bool y depends on CPU_SUP_AMD && PCI diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h new file mode 100644 index 00000000000..c8517f81b21 --- /dev/null +++ b/arch/x86/include/asm/amd_nb.h @@ -0,0 +1,39 @@ +#ifndef _ASM_X86_AMD_NB_H +#define _ASM_X86_AMD_NB_H + +#include + +extern struct pci_device_id k8_nb_ids[]; +struct bootnode; + +extern int early_is_k8_nb(u32 value); +extern int cache_k8_northbridges(void); +extern void k8_flush_garts(void); +extern int k8_get_nodes(struct bootnode *nodes); +extern int k8_numa_init(unsigned long start_pfn, unsigned long end_pfn); +extern int k8_scan_nodes(void); + +struct k8_northbridge_info { + u16 num; + u8 gart_supported; + struct pci_dev **nb_misc; +}; +extern struct k8_northbridge_info k8_northbridges; + +#ifdef CONFIG_AMD_NB + +static inline struct pci_dev *node_to_k8_nb_misc(int node) +{ + return (node < k8_northbridges.num) ? k8_northbridges.nb_misc[node] : NULL; +} + +#else + +static inline struct pci_dev *node_to_k8_nb_misc(int node) +{ + return NULL; +} +#endif + + +#endif /* _ASM_X86_AMD_NB_H */ diff --git a/arch/x86/include/asm/k8.h b/arch/x86/include/asm/k8.h deleted file mode 100644 index 9cee145dcac..00000000000 --- a/arch/x86/include/asm/k8.h +++ /dev/null @@ -1,39 +0,0 @@ -#ifndef _ASM_X86_K8_H -#define _ASM_X86_K8_H - -#include - -extern struct pci_device_id k8_nb_ids[]; -struct bootnode; - -extern int early_is_k8_nb(u32 value); -extern int cache_k8_northbridges(void); -extern void k8_flush_garts(void); -extern int k8_get_nodes(struct bootnode *nodes); -extern int k8_numa_init(unsigned long start_pfn, unsigned long end_pfn); -extern int k8_scan_nodes(void); - -struct k8_northbridge_info { - u16 num; - u8 gart_supported; - struct pci_dev **nb_misc; -}; -extern struct k8_northbridge_info k8_northbridges; - -#ifdef CONFIG_K8_NB - -static inline struct pci_dev *node_to_k8_nb_misc(int node) -{ - return (node < k8_northbridges.num) ? k8_northbridges.nb_misc[node] : NULL; -} - -#else - -static inline struct pci_dev *node_to_k8_nb_misc(int node) -{ - return NULL; -} -#endif - - -#endif /* _ASM_X86_K8_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 0925676266b..25dc82d3e76 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -87,7 +87,7 @@ obj-$(CONFIG_EARLY_PRINTK) += early_printk.o obj-$(CONFIG_HPET_TIMER) += hpet.o obj-$(CONFIG_APB_TIMER) += apb_timer.o -obj-$(CONFIG_K8_NB) += k8.o +obj-$(CONFIG_AMD_NB) += amd_nb.o obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c new file mode 100644 index 00000000000..4ffc38df7ac --- /dev/null +++ b/arch/x86/kernel/amd_nb.c @@ -0,0 +1,145 @@ +/* + * Shared support code for AMD K8 northbridges and derivates. + * Copyright 2006 Andi Kleen, SUSE Labs. Subject to GPLv2. 
+ */ +#include +#include +#include +#include +#include +#include +#include + +static u32 *flush_words; + +struct pci_device_id k8_nb_ids[] = { + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) }, + {} +}; +EXPORT_SYMBOL(k8_nb_ids); + +struct k8_northbridge_info k8_northbridges; +EXPORT_SYMBOL(k8_northbridges); + +static struct pci_dev *next_k8_northbridge(struct pci_dev *dev) +{ + do { + dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev); + if (!dev) + break; + } while (!pci_match_id(&k8_nb_ids[0], dev)); + return dev; +} + +int cache_k8_northbridges(void) +{ + int i; + struct pci_dev *dev; + + if (k8_northbridges.num) + return 0; + + dev = NULL; + while ((dev = next_k8_northbridge(dev)) != NULL) + k8_northbridges.num++; + + /* some CPU families (e.g. family 0x11) do not support GART */ + if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10) + k8_northbridges.gart_supported = 1; + + k8_northbridges.nb_misc = kmalloc((k8_northbridges.num + 1) * + sizeof(void *), GFP_KERNEL); + if (!k8_northbridges.nb_misc) + return -ENOMEM; + + if (!k8_northbridges.num) { + k8_northbridges.nb_misc[0] = NULL; + return 0; + } + + if (k8_northbridges.gart_supported) { + flush_words = kmalloc(k8_northbridges.num * sizeof(u32), + GFP_KERNEL); + if (!flush_words) { + kfree(k8_northbridges.nb_misc); + return -ENOMEM; + } + } + + dev = NULL; + i = 0; + while ((dev = next_k8_northbridge(dev)) != NULL) { + k8_northbridges.nb_misc[i] = dev; + if (k8_northbridges.gart_supported) + pci_read_config_dword(dev, 0x9c, &flush_words[i++]); + } + k8_northbridges.nb_misc[i] = NULL; + return 0; +} +EXPORT_SYMBOL_GPL(cache_k8_northbridges); + +/* Ignores subdevice/subvendor but as far as I can figure out + they're useless anyways */ +int __init early_is_k8_nb(u32 device) +{ + struct pci_device_id *id; + u32 vendor = device & 0xffff; + device >>= 16; + for (id = k8_nb_ids; id->vendor; id++) + if (vendor == id->vendor && device == id->device) + return 1; + return 0; +} + +void k8_flush_garts(void) +{ + int flushed, i; + unsigned long flags; + static DEFINE_SPINLOCK(gart_lock); + + if (!k8_northbridges.gart_supported) + return; + + /* Avoid races between AGP and IOMMU. In theory it's not needed + but I'm not sure if the hardware won't lose flush requests + when another is pending. This whole thing is so expensive anyways + that it doesn't matter to serialize more. 
-AK */ + spin_lock_irqsave(&gart_lock, flags); + flushed = 0; + for (i = 0; i < k8_northbridges.num; i++) { + pci_write_config_dword(k8_northbridges.nb_misc[i], 0x9c, + flush_words[i]|1); + flushed++; + } + for (i = 0; i < k8_northbridges.num; i++) { + u32 w; + /* Make sure the hardware actually executed the flush*/ + for (;;) { + pci_read_config_dword(k8_northbridges.nb_misc[i], + 0x9c, &w); + if (!(w & 1)) + break; + cpu_relax(); + } + } + spin_unlock_irqrestore(&gart_lock, flags); + if (!flushed) + printk("nothing to flush?\n"); +} +EXPORT_SYMBOL_GPL(k8_flush_garts); + +static __init int init_k8_nbs(void) +{ + int err = 0; + + err = cache_k8_northbridges(); + + if (err < 0) + printk(KERN_NOTICE "K8 NB: Cannot enumerate AMD northbridges.\n"); + + return err; +} + +/* This has to go after the PCI subsystem */ +fs_initcall(init_k8_nbs); diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index a2e0caf26e1..e91e042a5c8 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -27,7 +27,7 @@ #include #include #include -#include +#include #include int gart_iommu_aperture; diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 6fdfb0b20f8..12cd823c8d0 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -17,7 +17,7 @@ #include #include -#include +#include #include #define LVL_1_INST 1 @@ -306,7 +306,7 @@ struct _cache_attr { ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count); }; -#ifdef CONFIG_K8_NB +#ifdef CONFIG_AMD_NB /* * L3 cache descriptors @@ -556,12 +556,12 @@ static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644, static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, show_cache_disable_1, store_cache_disable_1); -#else /* CONFIG_K8_NB */ +#else /* CONFIG_AMD_NB */ static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, int index) { }; -#endif /* CONFIG_K8_NB */ +#endif /* CONFIG_AMD_NB */ static int __cpuinit cpuid4_cache_lookup_regs(int index, @@ -1000,7 +1000,7 @@ static struct attribute *default_attrs[] = { static struct attribute *default_l3_attrs[] = { DEFAULT_SYSFS_CACHE_ATTRS, -#ifdef CONFIG_K8_NB +#ifdef CONFIG_AMD_NB &cache_disable_0.attr, &cache_disable_1.attr, #endif diff --git a/arch/x86/kernel/k8.c b/arch/x86/kernel/k8.c deleted file mode 100644 index 5de1b6b3963..00000000000 --- a/arch/x86/kernel/k8.c +++ /dev/null @@ -1,145 +0,0 @@ -/* - * Shared support code for AMD K8 northbridges and derivates. - * Copyright 2006 Andi Kleen, SUSE Labs. Subject to GPLv2. - */ -#include -#include -#include -#include -#include -#include -#include - -static u32 *flush_words; - -struct pci_device_id k8_nb_ids[] = { - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) }, - {} -}; -EXPORT_SYMBOL(k8_nb_ids); - -struct k8_northbridge_info k8_northbridges; -EXPORT_SYMBOL(k8_northbridges); - -static struct pci_dev *next_k8_northbridge(struct pci_dev *dev) -{ - do { - dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev); - if (!dev) - break; - } while (!pci_match_id(&k8_nb_ids[0], dev)); - return dev; -} - -int cache_k8_northbridges(void) -{ - int i; - struct pci_dev *dev; - - if (k8_northbridges.num) - return 0; - - dev = NULL; - while ((dev = next_k8_northbridge(dev)) != NULL) - k8_northbridges.num++; - - /* some CPU families (e.g. 
family 0x11) do not support GART */ - if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10) - k8_northbridges.gart_supported = 1; - - k8_northbridges.nb_misc = kmalloc((k8_northbridges.num + 1) * - sizeof(void *), GFP_KERNEL); - if (!k8_northbridges.nb_misc) - return -ENOMEM; - - if (!k8_northbridges.num) { - k8_northbridges.nb_misc[0] = NULL; - return 0; - } - - if (k8_northbridges.gart_supported) { - flush_words = kmalloc(k8_northbridges.num * sizeof(u32), - GFP_KERNEL); - if (!flush_words) { - kfree(k8_northbridges.nb_misc); - return -ENOMEM; - } - } - - dev = NULL; - i = 0; - while ((dev = next_k8_northbridge(dev)) != NULL) { - k8_northbridges.nb_misc[i] = dev; - if (k8_northbridges.gart_supported) - pci_read_config_dword(dev, 0x9c, &flush_words[i++]); - } - k8_northbridges.nb_misc[i] = NULL; - return 0; -} -EXPORT_SYMBOL_GPL(cache_k8_northbridges); - -/* Ignores subdevice/subvendor but as far as I can figure out - they're useless anyways */ -int __init early_is_k8_nb(u32 device) -{ - struct pci_device_id *id; - u32 vendor = device & 0xffff; - device >>= 16; - for (id = k8_nb_ids; id->vendor; id++) - if (vendor == id->vendor && device == id->device) - return 1; - return 0; -} - -void k8_flush_garts(void) -{ - int flushed, i; - unsigned long flags; - static DEFINE_SPINLOCK(gart_lock); - - if (!k8_northbridges.gart_supported) - return; - - /* Avoid races between AGP and IOMMU. In theory it's not needed - but I'm not sure if the hardware won't lose flush requests - when another is pending. This whole thing is so expensive anyways - that it doesn't matter to serialize more. -AK */ - spin_lock_irqsave(&gart_lock, flags); - flushed = 0; - for (i = 0; i < k8_northbridges.num; i++) { - pci_write_config_dword(k8_northbridges.nb_misc[i], 0x9c, - flush_words[i]|1); - flushed++; - } - for (i = 0; i < k8_northbridges.num; i++) { - u32 w; - /* Make sure the hardware actually executed the flush*/ - for (;;) { - pci_read_config_dword(k8_northbridges.nb_misc[i], - 0x9c, &w); - if (!(w & 1)) - break; - cpu_relax(); - } - } - spin_unlock_irqrestore(&gart_lock, flags); - if (!flushed) - printk("nothing to flush?\n"); -} -EXPORT_SYMBOL_GPL(k8_flush_garts); - -static __init int init_k8_nbs(void) -{ - int err = 0; - - err = cache_k8_northbridges(); - - if (err < 0) - printk(KERN_NOTICE "K8 NB: Cannot enumerate AMD northbridges.\n"); - - return err; -} - -/* This has to go after the PCI subsystem */ -fs_initcall(init_k8_nbs); diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 8f214a2643f..67e5665ce42 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -39,7 +39,7 @@ #include #include #include -#include +#include #include static unsigned long iommu_bus_base; /* GART remapping area (physical) */ diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index c3a4fbb2b99..77eea0362a4 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -107,7 +107,7 @@ #include #include #include -#include +#include #ifdef CONFIG_X86_64 #include #endif diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c index 970ed579d4e..ab75b181812 100644 --- a/arch/x86/mm/k8topology_64.c +++ b/arch/x86/mm/k8topology_64.c @@ -22,7 +22,7 @@ #include #include #include -#include +#include static struct bootnode __initdata nodes[8]; static nodemask_t __initdata nodes_parsed = NODE_MASK_NONE; diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index a7bcc23ef96..4962f1aeda6 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ 
-18,7 +18,7 @@ #include #include #include -#include +#include struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; EXPORT_SYMBOL(node_data); -- cgit v1.2.3-18-g5258 From f49aa448561fe9215f43405cac6f31eb86317792 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Fri, 17 Sep 2010 11:08:51 -0400 Subject: jump label: Make dynamic no-op selection available outside of ftrace Move Steve's code for finding the best 5-byte no-op from ftrace.c to alternative.c. The idea is that other consumers (in this case jump label) want to make use of that code. Signed-off-by: Jason Baron LKML-Reference: <96259ae74172dcac99c0020c249743c523a92e18.1284733808.git.jbaron@redhat.com> Signed-off-by: Steven Rostedt --- arch/x86/include/asm/alternative.h | 8 +++++ arch/x86/kernel/alternative.c | 64 ++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/ftrace.c | 63 +------------------------------------ arch/x86/kernel/setup.c | 6 ++++ 4 files changed, 79 insertions(+), 62 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index bc6abb7bc7e..27a35b68918 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -180,4 +180,12 @@ static inline void apply_paravirt(struct paravirt_patch_site *start, extern void *text_poke(void *addr, const void *opcode, size_t len); extern void *text_poke_smp(void *addr, const void *opcode, size_t len); +#if defined(CONFIG_DYNAMIC_FTRACE) +#define IDEAL_NOP_SIZE_5 5 +extern unsigned char ideal_nop5[IDEAL_NOP_SIZE_5]; +extern void arch_init_ideal_nop5(void); +#else +static inline void arch_init_ideal_nop5(void) {} +#endif + #endif /* _ASM_X86_ALTERNATIVE_H */ diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index f65ab8b014c..1849d8036ee 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -641,3 +641,67 @@ void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len) return addr; } +#if defined(CONFIG_DYNAMIC_FTRACE) + +unsigned char ideal_nop5[IDEAL_NOP_SIZE_5]; + +void __init arch_init_ideal_nop5(void) +{ + extern const unsigned char ftrace_test_p6nop[]; + extern const unsigned char ftrace_test_nop5[]; + extern const unsigned char ftrace_test_jmp[]; + int faulted = 0; + + /* + * There is no good nop for all x86 archs. + * We will default to using the P6_NOP5, but first we + * will test to make sure that the nop will actually + * work on this CPU. If it faults, we will then + * go to a lesser efficient 5 byte nop. If that fails + * we then just use a jmp as our nop. This isn't the most + * efficient nop, but we can not use a multi part nop + * since we would then risk being preempted in the middle + * of that nop, and if we enabled tracing then, it might + * cause a system crash. + * + * TODO: check the cpuid to determine the best nop. 
+ */ + asm volatile ( + "ftrace_test_jmp:" + "jmp ftrace_test_p6nop\n" + "nop\n" + "nop\n" + "nop\n" /* 2 byte jmp + 3 bytes */ + "ftrace_test_p6nop:" + P6_NOP5 + "jmp 1f\n" + "ftrace_test_nop5:" + ".byte 0x66,0x66,0x66,0x66,0x90\n" + "1:" + ".section .fixup, \"ax\"\n" + "2: movl $1, %0\n" + " jmp ftrace_test_nop5\n" + "3: movl $2, %0\n" + " jmp 1b\n" + ".previous\n" + _ASM_EXTABLE(ftrace_test_p6nop, 2b) + _ASM_EXTABLE(ftrace_test_nop5, 3b) + : "=r"(faulted) : "0" (faulted)); + + switch (faulted) { + case 0: + pr_info("converting mcount calls to 0f 1f 44 00 00\n"); + memcpy(ideal_nop5, ftrace_test_p6nop, IDEAL_NOP_SIZE_5); + break; + case 1: + pr_info("converting mcount calls to 66 66 66 66 90\n"); + memcpy(ideal_nop5, ftrace_test_nop5, IDEAL_NOP_SIZE_5); + break; + case 2: + pr_info("converting mcount calls to jmp . + 5\n"); + memcpy(ideal_nop5, ftrace_test_jmp, IDEAL_NOP_SIZE_5); + break; + } + +} +#endif diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index cd37469b54e..3afb33f14d2 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -257,14 +257,9 @@ do_ftrace_mod_code(unsigned long ip, void *new_code) return mod_code_status; } - - - -static unsigned char ftrace_nop[MCOUNT_INSN_SIZE]; - static unsigned char *ftrace_nop_replace(void) { - return ftrace_nop; + return ideal_nop5; } static int @@ -338,62 +333,6 @@ int ftrace_update_ftrace_func(ftrace_func_t func) int __init ftrace_dyn_arch_init(void *data) { - extern const unsigned char ftrace_test_p6nop[]; - extern const unsigned char ftrace_test_nop5[]; - extern const unsigned char ftrace_test_jmp[]; - int faulted = 0; - - /* - * There is no good nop for all x86 archs. - * We will default to using the P6_NOP5, but first we - * will test to make sure that the nop will actually - * work on this CPU. If it faults, we will then - * go to a lesser efficient 5 byte nop. If that fails - * we then just use a jmp as our nop. This isn't the most - * efficient nop, but we can not use a multi part nop - * since we would then risk being preempted in the middle - * of that nop, and if we enabled tracing then, it might - * cause a system crash. - * - * TODO: check the cpuid to determine the best nop. - */ - asm volatile ( - "ftrace_test_jmp:" - "jmp ftrace_test_p6nop\n" - "nop\n" - "nop\n" - "nop\n" /* 2 byte jmp + 3 bytes */ - "ftrace_test_p6nop:" - P6_NOP5 - "jmp 1f\n" - "ftrace_test_nop5:" - ".byte 0x66,0x66,0x66,0x66,0x90\n" - "1:" - ".section .fixup, \"ax\"\n" - "2: movl $1, %0\n" - " jmp ftrace_test_nop5\n" - "3: movl $2, %0\n" - " jmp 1b\n" - ".previous\n" - _ASM_EXTABLE(ftrace_test_p6nop, 2b) - _ASM_EXTABLE(ftrace_test_nop5, 3b) - : "=r"(faulted) : "0" (faulted)); - - switch (faulted) { - case 0: - pr_info("converting mcount calls to 0f 1f 44 00 00\n"); - memcpy(ftrace_nop, ftrace_test_p6nop, MCOUNT_INSN_SIZE); - break; - case 1: - pr_info("converting mcount calls to 66 66 66 66 90\n"); - memcpy(ftrace_nop, ftrace_test_nop5, MCOUNT_INSN_SIZE); - break; - case 2: - pr_info("converting mcount calls to jmp . + 5\n"); - memcpy(ftrace_nop, ftrace_test_jmp, MCOUNT_INSN_SIZE); - break; - } - /* The return code is retured via data */ *(unsigned long *)data = 0; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index c3a4fbb2b99..00e167870f7 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -112,6 +112,7 @@ #include #endif #include +#include /* * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. 
@@ -726,6 +727,7 @@ void __init setup_arch(char **cmdline_p) { int acpi = 0; int k8 = 0; + unsigned long flags; #ifdef CONFIG_X86_32 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); @@ -1071,6 +1073,10 @@ void __init setup_arch(char **cmdline_p) x86_init.oem.banner(); mcheck_init(); + + local_irq_save(flags); + arch_init_ideal_nop5(); + local_irq_restore(flags); } #ifdef CONFIG_X86_32 -- cgit v1.2.3-18-g5258 From fa6f2cc77081792e4edca9168420a3422299ef15 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Fri, 17 Sep 2010 11:08:56 -0400 Subject: jump label: Make text_poke_early() globally visible Make text_poke_early available outside of alternative.c. The jump label patchset wants to make use of it in order to set up the optimal no-op sequences at run-time. Signed-off-by: Jason Baron LKML-Reference: <04cfddf2ba77bcabfc3e524f1849d871d6a1cf9d.1284733808.git.jbaron@redhat.com> Signed-off-by: Steven Rostedt --- arch/x86/include/asm/alternative.h | 2 ++ arch/x86/kernel/alternative.c | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 27a35b68918..634bf782dca 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -160,6 +160,8 @@ static inline void apply_paravirt(struct paravirt_patch_site *start, #define __parainstructions_end NULL #endif +extern void *text_poke_early(void *addr, const void *opcode, size_t len); + /* * Clear and restore the kernel write-protection flag on the local CPU. * Allows the kernel to edit read-only pages. diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 1849d8036ee..083bd010d92 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -195,7 +195,7 @@ static void __init_or_module add_nops(void *insns, unsigned int len) extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; extern s32 __smp_locks[], __smp_locks_end[]; -static void *text_poke_early(void *addr, const void *opcode, size_t len); +void *text_poke_early(void *addr, const void *opcode, size_t len); /* Replace instructions with better alternatives for this CPU type. This runs before SMP is initialized to avoid SMP problems with @@ -522,7 +522,7 @@ void __init alternative_instructions(void) * instructions. And on the local CPU you need to be protected again NMI or MCE * handlers seeing an inconsistent instruction while you patch. */ -static void *__init_or_module text_poke_early(void *addr, const void *opcode, +void *__init_or_module text_poke_early(void *addr, const void *opcode, size_t len) { unsigned long flags; -- cgit v1.2.3-18-g5258 From ce5f68246bf2385d6174856708d0b746dc378f20 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 20 Sep 2010 13:04:45 -0700 Subject: x86, hotplug: In the MWAIT case of play_dead, CLFLUSH the cache line When we're using MWAIT for play_dead, explicitly CLFLUSH the cache line before executing MONITOR. This is a potential workaround for the Xeon 7400 erratum AAI65 after having a spurious wakeup and returning around the loop. "Potential" here because it is not certain that that erratum could actually trigger; however, the CLFLUSH should be harmless. Signed-off-by: H. 
Peter Anvin Acked-by: Venkatesh Pallipadi Cc: Asit Mallick Cc: Arjan van de Ven Cc: Len Brown --- arch/x86/kernel/smpboot.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 55c80ffb871..fdccfe9dc63 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1394,9 +1394,12 @@ static inline void mwait_play_dead(void) unsigned int highest_cstate = 0; unsigned int highest_subcstate = 0; int i; + void *mwait_ptr; if (!cpu_has(¤t_cpu_data, X86_FEATURE_MWAIT)) return; + if (!cpu_has(¤t_cpu_data, X86_FEATURE_CLFLSH)) + return; if (current_cpu_data.cpuid_level < CPUID_MWAIT_LEAF) return; @@ -1422,10 +1425,25 @@ static inline void mwait_play_dead(void) (highest_subcstate - 1); } + /* + * This should be a memory location in a cache line which is + * unlikely to be touched by other processors. The actual + * content is immaterial as it is not actually modified in any way. + */ + mwait_ptr = ¤t_thread_info()->flags; + wbinvd(); while (1) { - __monitor(¤t_thread_info()->flags, 0, 0); + /* + * The CLFLUSH is a workaround for erratum AAI65 for + * the Xeon 7400 series. It's not clear it is actually + * needed, but it should be harmless in either case. + * The WBINVD is insufficient due to the spurious-wakeup + * case where we return around the loop. + */ + clflush(mwait_ptr); + __monitor(mwait_ptr, 0, 0); mb(); __mwait(eax, 0); } -- cgit v1.2.3-18-g5258 From c2b9ff24a0df649d4d40947878b5b5ac39c7299e Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 20 Sep 2010 18:01:46 -0700 Subject: x86, cpu: Re-run get_cpu_cap() after adjusting the CPUID level At least on Intel, adjusting the max CPUID level can expose new CPUID features, so we need to re-run get_cpu_cap() after changing the CPUID level. Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/common.c | 2 +- arch/x86/kernel/cpu/cpu.h | 1 + arch/x86/kernel/cpu/intel.c | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 490dac63c2d..f2f9ac7da25 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -545,7 +545,7 @@ void __cpuinit cpu_detect(struct cpuinfo_x86 *c) } } -static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c) +void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c) { u32 tfms, xlvl; u32 ebx; diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 3624e8a0f71..c16456bc11a 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -32,6 +32,7 @@ struct cpu_dev { extern const struct cpu_dev *const __x86_cpu_dev_start[], *const __x86_cpu_dev_end[]; +extern void get_cpu_cap(struct cpuinfo_x86 *c); extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c); #endif diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 85f69cdeae1..b4389441efb 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -39,6 +39,7 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) misc_enable &= ~MSR_IA32_MISC_ENABLE_LIMIT_CPUID; wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable); c->cpuid_level = cpuid_eax(0); + get_cpu_cap(c); } } -- cgit v1.2.3-18-g5258 From bf5438fca2950b03c21ad868090cc1a8fcd49536 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Fri, 17 Sep 2010 11:09:00 -0400 Subject: jump label: Base patch for jump label base patch to implement 'jump labeling'. 
Based on a new 'asm goto' inline assembly gcc mechanism, we can now branch to labels from an 'asm goto' statment. This allows us to create a 'no-op' fastpath, which can subsequently be patched with a jump to the slowpath code. This is useful for code which might be rarely used, but which we'd like to be able to call, if needed. Tracepoints are the current usecase that these are being implemented for. Acked-by: David S. Miller Signed-off-by: Jason Baron LKML-Reference: [ cleaned up some formating ] Signed-off-by: Steven Rostedt --- arch/x86/include/asm/alternative.h | 3 ++- arch/x86/kernel/alternative.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 634bf782dca..76561d20ea2 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -4,6 +4,7 @@ #include #include #include +#include #include /* @@ -182,7 +183,7 @@ extern void *text_poke_early(void *addr, const void *opcode, size_t len); extern void *text_poke(void *addr, const void *opcode, size_t len); extern void *text_poke_smp(void *addr, const void *opcode, size_t len); -#if defined(CONFIG_DYNAMIC_FTRACE) +#if defined(CONFIG_DYNAMIC_FTRACE) || defined(HAVE_JUMP_LABEL) #define IDEAL_NOP_SIZE_5 5 extern unsigned char ideal_nop5[IDEAL_NOP_SIZE_5]; extern void arch_init_ideal_nop5(void); diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 083bd010d92..cb0e6d385f6 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -641,7 +641,7 @@ void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len) return addr; } -#if defined(CONFIG_DYNAMIC_FTRACE) +#if defined(CONFIG_DYNAMIC_FTRACE) || defined(HAVE_JUMP_LABEL) unsigned char ideal_nop5[IDEAL_NOP_SIZE_5]; -- cgit v1.2.3-18-g5258 From 4c3ef6d79328c0e23ade60cbfc8d496123a6855c Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Fri, 17 Sep 2010 11:09:08 -0400 Subject: jump label: Add jump_label_text_reserved() to reserve jump points Add a jump_label_text_reserved(void *start, void *end), so that other pieces of code that want to modify kernel text, can first verify that jump label has not reserved the instruction. Acked-by: Masami Hiramatsu Signed-off-by: Jason Baron LKML-Reference: <06236663a3a7b1c1f13576bb9eccb6d9c17b7bfe.1284733808.git.jbaron@redhat.com> Signed-off-by: Steven Rostedt --- arch/x86/kernel/kprobes.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index e05952af5d2..1cbd54c0df9 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -1218,7 +1218,8 @@ static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src) } /* Check whether the address range is reserved */ if (ftrace_text_reserved(src, src + len - 1) || - alternatives_text_reserved(src, src + len - 1)) + alternatives_text_reserved(src, src + len - 1) || + jump_label_text_reserved(src, src + len - 1)) return -EBUSY; return len; -- cgit v1.2.3-18-g5258 From d9f5ab7b1c0a520867af389bab5d5fcdbd0e407e Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Fri, 17 Sep 2010 11:09:22 -0400 Subject: jump label: x86 support add x86 support for jump label. I'm keeping this patch separate so its clear to arch maintainers what was required for x86 support this new feature. Hopefully, it wouldn't be too painful for other archs. 
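For illustration only: a minimal, self-contained sketch of the 'asm goto' pattern these patches build on, assuming x86-64 and a gcc with 'asm goto' support. The macro and section names below are invented for the example and are not taken from the patches; the fast path is a 5-byte no-op whose address is recorded in a table so that it can later be overwritten with a near jmp (0xe9 plus rel32) to the out-of-line label.

/*
 * Illustrative sketch, not kernel code: record a patchable 5-byte nop
 * and the address of the out-of-line label in a private section.
 */
#include <stdio.h>

#define DEMO_STATIC_BRANCH(label)					\
	asm goto("1: .byte 0x0f, 0x1f, 0x44, 0x00, 0x00\n\t"		\
		 ".pushsection .demo_jump_table, \"aw\"\n\t"		\
		 ".quad 1b, %l[" #label "]\n\t"				\
		 ".popsection\n\t"					\
		 : : : : label)

int main(void)
{
	DEMO_STATIC_BRANCH(slow_path);
	puts("fast path: the 5-byte nop fell through");
	return 0;
slow_path:
	puts("slow path: reached only once the nop is patched to a jmp");
	return 0;
}

By default the branch is never taken; a runtime patcher (text_poke_smp() in the kernel case) rewrites the recorded nop into a jump when the key is enabled.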
Signed-off-by: Jason Baron LKML-Reference: [ cleaned up some formatting ] Signed-off-by: Steven Rostedt --- arch/x86/Kconfig | 1 + arch/x86/include/asm/jump_label.h | 47 ++++++++++++++++++++++++++++++++++++ arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/jump_label.c | 50 +++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/module.c | 3 +++ 5 files changed, 102 insertions(+), 1 deletion(-) create mode 100644 arch/x86/include/asm/jump_label.h create mode 100644 arch/x86/kernel/jump_label.c (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cea0cd9a316..afcd6632c94 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -59,6 +59,7 @@ config X86 select ANON_INODES select HAVE_ARCH_KMEMCHECK select HAVE_USER_RETURN_NOTIFIER + select HAVE_ARCH_JUMP_LABEL if !CC_OPTIMIZE_FOR_SIZE config INSTRUCTION_DECODER def_bool (KPROBES || PERF_EVENTS) diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h new file mode 100644 index 00000000000..b4a2cb40337 --- /dev/null +++ b/arch/x86/include/asm/jump_label.h @@ -0,0 +1,47 @@ +#ifndef _ASM_X86_JUMP_LABEL_H +#define _ASM_X86_JUMP_LABEL_H + +#ifdef __KERNEL__ + +#include +#include + +#define JUMP_LABEL_NOP_SIZE 5 + +# define JUMP_LABEL_INITIAL_NOP ".byte 0xe9 \n\t .long 0\n\t" + +# define JUMP_LABEL(key, label) \ + do { \ + asm goto("1:" \ + JUMP_LABEL_INITIAL_NOP \ + ".pushsection __jump_table, \"a\" \n\t"\ + _ASM_PTR "1b, %l[" #label "], %c0 \n\t" \ + ".popsection \n\t" \ + : : "i" (key) : : label); \ + } while (0) + +#endif /* __KERNEL__ */ + +#ifdef CONFIG_X86_64 + +typedef u64 jump_label_t; + +struct jump_entry { + jump_label_t code; + jump_label_t target; + jump_label_t key; +}; + +#else + +typedef u32 jump_label_t; + +struct jump_entry { + jump_label_t code; + jump_label_t target; + jump_label_t key; +}; + +#endif + +#endif diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 0925676266b..24fa1718ddb 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -32,7 +32,7 @@ GCOV_PROFILE_paravirt.o := n obj-y := process_$(BITS).o signal.o entry_$(BITS).o obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o obj-y += time.o ioport.o ldt.o dumpstack.o -obj-y += setup.o x86_init.o i8259.o irqinit.o +obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o obj-$(CONFIG_X86_VISWS) += visws_quirks.o obj-$(CONFIG_X86_32) += probe_roms_32.o obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c new file mode 100644 index 00000000000..961b6b30ba9 --- /dev/null +++ b/arch/x86/kernel/jump_label.c @@ -0,0 +1,50 @@ +/* + * jump label x86 support + * + * Copyright (C) 2009 Jason Baron + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef HAVE_JUMP_LABEL + +union jump_code_union { + char code[JUMP_LABEL_NOP_SIZE]; + struct { + char jump; + int offset; + } __attribute__((packed)); +}; + +void arch_jump_label_transform(struct jump_entry *entry, + enum jump_label_type type) +{ + union jump_code_union code; + + if (type == JUMP_LABEL_ENABLE) { + code.jump = 0xe9; + code.offset = entry->target - + (entry->code + JUMP_LABEL_NOP_SIZE); + } else + memcpy(&code, ideal_nop5, JUMP_LABEL_NOP_SIZE); + get_online_cpus(); + mutex_lock(&text_mutex); + text_poke_smp((void *)entry->code, &code, JUMP_LABEL_NOP_SIZE); + mutex_unlock(&text_mutex); + put_online_cpus(); +} + +void arch_jump_label_text_poke_early(jump_label_t addr) +{ + text_poke_early((void *)addr, 
ideal_nop5, JUMP_LABEL_NOP_SIZE); +} + +#endif diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index e0bc186d750..5399f58de7e 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -239,6 +239,9 @@ int module_finalize(const Elf_Ehdr *hdr, apply_paravirt(pseg, pseg + para->sh_size); } + /* make jump label nops */ + jump_label_apply_nops(me); + return module_bug_finalize(hdr, sechdrs, me); } -- cgit v1.2.3-18-g5258 From 95fccd465eefb3d6bf80dae0496607b534d38313 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 22 Sep 2010 17:37:43 -0400 Subject: jump label: Remove duplicate structure for x86 The structure in the x86 jump label code uses the typedef jump_label_t, which is defined by the #ifdef arch type. The structure does not need to be duplicated there. Signed-off-by: Steven Rostedt --- arch/x86/include/asm/jump_label.h | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h index b4a2cb40337..f52d42e8058 100644 --- a/arch/x86/include/asm/jump_label.h +++ b/arch/x86/include/asm/jump_label.h @@ -23,18 +23,10 @@ #endif /* __KERNEL__ */ #ifdef CONFIG_X86_64 - typedef u64 jump_label_t; - -struct jump_entry { - jump_label_t code; - jump_label_t target; - jump_label_t key; -}; - #else - typedef u32 jump_label_t; +#endif struct jump_entry { jump_label_t code; @@ -43,5 +35,3 @@ struct jump_entry { }; #endif - -#endif -- cgit v1.2.3-18-g5258 From 234bb549eea16ec7d5207ba747fb8dbf489e64c1 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 2 Sep 2010 13:46:34 +0100 Subject: x86, cleanups: Use clear_page/copy_page rather than memset/memcpy When operating on whole pages, use clear_page() and copy_page() in favor of memset() and memcpy(); after all that's what they are intended for. Signed-off-by: Jan Beulich LKML-Reference: <4C7FB8CA0200007800013F51@vpn.id2.novell.com> Cc: Andrew Morton Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/machine_kexec_64.c | 4 ++-- arch/x86/kvm/lapic.c | 3 +-- arch/x86/mm/init_32.c | 4 ++-- arch/x86/mm/init_64.c | 2 +- 4 files changed, 6 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 035c8c52918..b3ea9db39db 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -36,7 +36,7 @@ static int init_one_level2_page(struct kimage *image, pgd_t *pgd, if (!page) goto out; pud = (pud_t *)page_address(page); - memset(pud, 0, PAGE_SIZE); + clear_page(pud); set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); } pud = pud_offset(pgd, addr); @@ -45,7 +45,7 @@ static int init_one_level2_page(struct kimage *image, pgd_t *pgd, if (!page) goto out; pmd = (pmd_t *)page_address(page); - memset(pmd, 0, PAGE_SIZE); + clear_page(pmd); set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); } pmd = pmd_offset(pud, addr); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 77d8c0f4817..22b06f7660f 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1056,14 +1056,13 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu) vcpu->arch.apic = apic; - apic->regs_page = alloc_page(GFP_KERNEL); + apic->regs_page = alloc_page(GFP_KERNEL|__GFP_ZERO); if (apic->regs_page == NULL) { printk(KERN_ERR "malloc apic regs error for vcpu %x\n", vcpu->vcpu_id); goto nomem_free_apic; } apic->regs = page_address(apic->regs_page); - memset(apic->regs, 0, PAGE_SIZE); apic->vcpu = vcpu; hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC, diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index bca79091b9d..558f2d33207 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -67,7 +67,7 @@ static __init void *alloc_low_page(void) panic("alloc_low_page: ran out of memory"); adr = __va(pfn * PAGE_SIZE); - memset(adr, 0, PAGE_SIZE); + clear_page(adr); return adr; } @@ -558,7 +558,7 @@ char swsusp_pg_dir[PAGE_SIZE] static inline void save_pg_dir(void) { - memcpy(swsusp_pg_dir, swapper_pg_dir, PAGE_SIZE); + copy_page(swsusp_pg_dir, swapper_pg_dir); } #else /* !CONFIG_ACPI_SLEEP */ static inline void save_pg_dir(void) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 9a6674689a2..7c48ad4faca 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -293,7 +293,7 @@ static __ref void *alloc_low_page(unsigned long *phys) panic("alloc_low_page: ran out of memory"); adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE); - memset(adr, 0, PAGE_SIZE); + clear_page(adr); *phys = pfn * PAGE_SIZE; return adr; } -- cgit v1.2.3-18-g5258 From 46eb3b64dddd20f44e76b08676fa642dd374bf1d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 22 Sep 2010 23:10:23 -0400 Subject: jump label/x86/sparc64: Remove !CC_OPTIMIZE_FOR_SIZE config conditions The !CC_OPTIMIZE_FOR_SIZE was added to enable the jump label functionality because Jason noticed that the gcc option would not optimize the labels and may even hurt performance. But this is a gcc problem not a kernel one. Removing this condition should add motivation to the gcc developers to actually fix it. Cc: Jason Baron Acked-by: David S. 
Miller Signed-off-by: Steven Rostedt --- arch/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index afcd6632c94..b431a0824a9 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -59,7 +59,7 @@ config X86 select ANON_INODES select HAVE_ARCH_KMEMCHECK select HAVE_USER_RETURN_NOTIFIER - select HAVE_ARCH_JUMP_LABEL if !CC_OPTIMIZE_FOR_SIZE + select HAVE_ARCH_JUMP_LABEL config INSTRUCTION_DECODER def_bool (KPROBES || PERF_EVENTS) -- cgit v1.2.3-18-g5258 From 6554287b1de0448f1e02e200d02b43914e997d15 Mon Sep 17 00:00:00 2001 From: Bart Oldeman Date: Thu, 23 Sep 2010 13:16:58 -0400 Subject: x86, vm86: Fix preemption bug for int1 debug and int3 breakpoint handlers. Impact: fix kernel bug such as: BUG: scheduling while atomic: dosemu.bin/19680/0x00000004 See also Ubuntu bug 455067 at https://bugs.launchpad.net/ubuntu/+source/linux/+bug/455067 Commits 4915a35e35a037254550a2ba9f367a812bc37d40 ("Use preempt_conditional_sti/cli in do_int3, like on x86_64.") and 3d2a71a596bd9c761c8487a2178e95f8a61da083 ("x86, traps: converge do_debug handlers") started disabling preemption in int1 and int3 handlers on i386. The problem with vm86 is that the call to handle_vm86_trap() may jump straight to entry_32.S and never returns so preempt is never enabled again, and there is an imbalance in the preempt count. Commit be716615fe596ee117292dc615e95f707fb67fd1 ("x86, vm86: fix preemption bug"), which was later (accidentally?) reverted by commit 08d68323d1f0c34452e614263b212ca556dae47f ("hw-breakpoints: modifying generic debug exception to use thread-specific debug registers") fixed the problem for debug exceptions but not for breakpoints. There are three solutions to this problem. 1. Reenable preemption before calling handle_vm86_trap(). This was the approach that was later reverted. 2. Do not disable preemption for i386 in breakpoint and debug handlers. This was the situation before October 2008. As far as I understand preemption only needs to be disabled on x86_64 because a seperate stack is used, but it's nice to have things work the same way on i386 and x86_64. 3. Let handle_vm86_trap() return instead of jumping to assembly code. By setting a flag in _TIF_WORK_MASK, either TIF_IRET or TIF_NOTIFY_RESUME, the code in entry_32.S is instructed to return to 32 bit mode from V86 mode. The logic in entry_32.S was already present to handle signals. (I chose TIF_IRET because it's slightly more efficient in do_notify_resume() in signal.c, but in fact TIF_IRET can probably be replaced by TIF_NOTIFY_RESUME everywhere.) I'm submitting approach 3, because I believe it is the most elegant and prevents future confusion. Still, an obvious preempt_conditional_cli(regs); is necessary in traps.c to correct the bug. [ hpa: This is technically a regression, but because: 1. the regression is so old, 2. the patch seems relatively high risk, justifying more testing, and 3. we're late in the 2.6.36-rc cycle, I'm queuing it up for the 2.6.37 merge window. It might, however, justify as a -stable backport at a latter time, hence Cc: stable. ] Signed-off-by: Bart Oldeman LKML-Reference: Cc: Frederic Weisbecker Cc: K.Prasad Cc: Alan Stern Cc: Alexander van Heukelum Cc: Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/traps.c | 1 + arch/x86/kernel/vm86_32.c | 10 ++++++++-- 2 files changed, 9 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 60788dee0f8..9f4edeb2132 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -575,6 +575,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) if (regs->flags & X86_VM_MASK) { handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1); + preempt_conditional_cli(regs); return; } diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 5ffb5622f79..61fb9851962 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -551,8 +551,14 @@ cannot_handle: int handle_vm86_trap(struct kernel_vm86_regs *regs, long error_code, int trapno) { if (VMPI.is_vm86pus) { - if ((trapno == 3) || (trapno == 1)) - return_to_32bit(regs, VM86_TRAP + (trapno << 8)); + if ((trapno == 3) || (trapno == 1)) { + KVM86->regs32->ax = VM86_TRAP + (trapno << 8); + /* setting this flag forces the code in entry_32.S to + call save_v86_state() and change the stack pointer + to KVM86->regs32 */ + set_thread_flag(TIF_IRET); + return 0; + } do_int(regs, trapno, (unsigned char __user *) (regs->pt.ss << 4), SP(regs)); return 0; } -- cgit v1.2.3-18-g5258 From 76fb657017588a0912f0d1d140cb807446e4ef05 Mon Sep 17 00:00:00 2001 From: Daniel Drake Date: Thu, 23 Sep 2010 17:28:04 +0100 Subject: x86, olpc: Only enable PCI configuration type override on XO-1 This configuration type override is for XO-1 only and must not happen on XO-1.5. Signed-off-by: Daniel Drake LKML-Reference: <20100923162805.0F6549D401B@zog.reactivated.net> Cc: Andres Solomon Cc: Grant Likely Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 2 +- arch/x86/kernel/olpc.c | 6 ++++-- arch/x86/pci/olpc.c | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cea0cd9a316..0ed4c9bfcd1 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1900,7 +1900,7 @@ config PCI_GODIRECT bool "Direct" config PCI_GOOLPC - bool "OLPC" + bool "OLPC XO-1" depends on OLPC config PCI_GOANY diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c index 0e0cdde519b..635888cf050 100644 --- a/arch/x86/kernel/olpc.c +++ b/arch/x86/kernel/olpc.c @@ -242,8 +242,10 @@ static int __init olpc_init(void) (unsigned char *) &olpc_platform_info.ecver, 1); #ifdef CONFIG_PCI_OLPC - /* If the VSA exists let it emulate PCI, if not emulate in kernel */ - if (!cs5535_has_vsa2()) + /* If the VSA exists let it emulate PCI, if not emulate in kernel. + * XO-1 only. */ + if (olpc_platform_info.boardrev < olpc_board_pre(0xd0) && + !cs5535_has_vsa2()) x86_init.pci.arch_init = pci_olpc_init; #endif diff --git a/arch/x86/pci/olpc.c b/arch/x86/pci/olpc.c index b34815408f5..13700ec8e2e 100644 --- a/arch/x86/pci/olpc.c +++ b/arch/x86/pci/olpc.c @@ -304,7 +304,7 @@ static struct pci_raw_ops pci_olpc_conf = { int __init pci_olpc_init(void) { - printk(KERN_INFO "PCI: Using configuration type OLPC\n"); + printk(KERN_INFO "PCI: Using configuration type OLPC XO-1\n"); raw_pci_ops = &pci_olpc_conf; is_lx = is_geode_lx(); return 0; -- cgit v1.2.3-18-g5258 From 3e3c486012a3dbb113c0ca15ee265d309d77aea9 Mon Sep 17 00:00:00 2001 From: Daniel Drake Date: Thu, 23 Sep 2010 17:28:46 +0100 Subject: x86, olpc: Rework BIOS signature check The XO-1.5 laptop is not currently detected as an OLPC machine because it fails this XO-1-centric check. 
Now that we have OLPC OFW support in the kernel, a more sensible check is to see if we found OFW during boot and check the architecture property. Also remove a now-meaningless codepath, as we're always going to have OFW support with OLPC. Signed-off-by: Daniel Drake LKML-Reference: <20100923162846.D8D409D401B@zog.reactivated.net> Cc: Andres Salomon Cc: Grant Likely Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 3 ++- arch/x86/include/asm/olpc_ofw.h | 4 +++ arch/x86/kernel/olpc.c | 58 ++++++++++++++++++----------------------- arch/x86/kernel/olpc_ofw.c | 6 +++++ 4 files changed, 37 insertions(+), 34 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 0ed4c9bfcd1..c2552557134 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2061,6 +2061,7 @@ config SCx200HR_TIMER config OLPC bool "One Laptop Per Child support" select GPIOLIB + select OLPC_OPENFIRMWARE ---help--- Add support for detecting the unique features of the OLPC XO hardware. @@ -2068,7 +2069,7 @@ config OLPC config OLPC_OPENFIRMWARE bool "Support for OLPC's Open Firmware" depends on !X86_64 && !X86_PAE - default y if OLPC + default n help This option adds support for the implementation of Open Firmware that is used on the OLPC XO-1 Children's Machine. diff --git a/arch/x86/include/asm/olpc_ofw.h b/arch/x86/include/asm/olpc_ofw.h index 08fde475cb3..2a8478140bb 100644 --- a/arch/x86/include/asm/olpc_ofw.h +++ b/arch/x86/include/asm/olpc_ofw.h @@ -21,10 +21,14 @@ extern void olpc_ofw_detect(void); /* install OFW's pde permanently into the kernel's pgtable */ extern void setup_olpc_ofw_pgd(void); +/* check if OFW was detected during boot */ +extern bool olpc_ofw_present(void); + #else /* !CONFIG_OLPC_OPENFIRMWARE */ static inline void olpc_ofw_detect(void) { } static inline void setup_olpc_ofw_pgd(void) { } +static inline bool olpc_ofw_present(void) { return false; } #endif /* !CONFIG_OLPC_OPENFIRMWARE */ diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c index 635888cf050..37c49934c78 100644 --- a/arch/x86/kernel/olpc.c +++ b/arch/x86/kernel/olpc.c @@ -183,8 +183,21 @@ err: } EXPORT_SYMBOL_GPL(olpc_ec_cmd); -#ifdef CONFIG_OLPC_OPENFIRMWARE -static void __init platform_detect(void) +static bool __init check_ofw_architecture(void) +{ + size_t propsize; + char olpc_arch[5]; + const void *args[] = { NULL, "architecture", olpc_arch, (void *)5 }; + void *res[] = { &propsize }; + + if (olpc_ofw("getprop", args, res)) { + printk(KERN_ERR "ofw: getprop call failed!\n"); + return false; + } + return propsize == 5 && strncmp("OLPC", olpc_arch, 5) == 0; +} + +static u32 __init get_board_revision(void) { size_t propsize; __be32 rev; @@ -193,46 +206,27 @@ static void __init platform_detect(void) if (olpc_ofw("getprop", args, res) || propsize != 4) { printk(KERN_ERR "ofw: getprop call failed!\n"); - rev = cpu_to_be32(0); + return cpu_to_be32(0); } - olpc_platform_info.boardrev = be32_to_cpu(rev); + return be32_to_cpu(rev); } -#else -static void __init platform_detect(void) + +static bool __init platform_detect(void) { - /* stopgap until OFW support is added to the kernel */ - olpc_platform_info.boardrev = olpc_board(0xc2); + if (!check_ofw_architecture()) + return false; + olpc_platform_info.flags |= OLPC_F_PRESENT; + olpc_platform_info.boardrev = get_board_revision(); + return true; } -#endif static int __init olpc_init(void) { - unsigned char *romsig; - - /* The ioremap check is dangerous; limit what we run it on */ - if (!is_geode() || cs5535_has_vsa2()) + if (!olpc_ofw_present() || 
!platform_detect()) return 0; spin_lock_init(&ec_lock); - romsig = ioremap(0xffffffc0, 16); - if (!romsig) - return 0; - - if (strncmp(romsig, "CL1 Q", 7)) - goto unmap; - if (strncmp(romsig+6, romsig+13, 3)) { - printk(KERN_INFO "OLPC BIOS signature looks invalid. " - "Assuming not OLPC\n"); - goto unmap; - } - - printk(KERN_INFO "OLPC board with OpenFirmware %.16s\n", romsig); - olpc_platform_info.flags |= OLPC_F_PRESENT; - - /* get the platform revision */ - platform_detect(); - /* assume B1 and above models always have a DCON */ if (olpc_board_at_least(olpc_board(0xb1))) olpc_platform_info.flags |= OLPC_F_DCON; @@ -254,8 +248,6 @@ static int __init olpc_init(void) olpc_platform_info.boardrev >> 4, olpc_platform_info.ecver); -unmap: - iounmap(romsig); return 0; } diff --git a/arch/x86/kernel/olpc_ofw.c b/arch/x86/kernel/olpc_ofw.c index 3218aa71ab5..78732046437 100644 --- a/arch/x86/kernel/olpc_ofw.c +++ b/arch/x86/kernel/olpc_ofw.c @@ -74,6 +74,12 @@ int __olpc_ofw(const char *name, int nr_args, const void **args, int nr_res, } EXPORT_SYMBOL_GPL(__olpc_ofw); +bool olpc_ofw_present(void) +{ + return olpc_ofw_cif != NULL; +} +EXPORT_SYMBOL_GPL(olpc_ofw_present); + /* OFW cif _should_ be above this address */ #define OFW_MIN 0xff000000 -- cgit v1.2.3-18-g5258 From 3b4b682becdfa9f42321aa024d5cc84f71f06d8c Mon Sep 17 00:00:00 2001 From: Ma Ling Date: Fri, 17 Sep 2010 03:12:40 +0800 Subject: x86, mem: Optimize memmove for small size and unaligned cases movs instruction will combine data to accelerate moving data, however we need to concern two cases about it. 1. movs instruction need long lantency to startup, so here we use general mov instruction to copy data. 2. movs instruction is not good for unaligned case, even if src offset is 0x10, dest offset is 0x0, we avoid and handle the case by general mov instruction. Signed-off-by: Ma Ling LKML-Reference: <1284664360-6138-1-git-send-email-ling.ma@intel.com> Signed-off-by: H. Peter Anvin --- arch/x86/lib/memcpy_32.c | 213 ++++++++++++++++++++++++++++++++++++------- arch/x86/lib/memmove_64.c | 225 ++++++++++++++++++++++++++++++++++++---------- 2 files changed, 362 insertions(+), 76 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/memcpy_32.c b/arch/x86/lib/memcpy_32.c index 81130d477ee..b908a59eccf 100644 --- a/arch/x86/lib/memcpy_32.c +++ b/arch/x86/lib/memcpy_32.c @@ -22,36 +22,187 @@ EXPORT_SYMBOL(memset); void *memmove(void *dest, const void *src, size_t n) { - int d0, d1, d2; - - if (dest < src) { - if ((dest + n) < src) - return memcpy(dest, src, n); - else - __asm__ __volatile__( - "rep\n\t" - "movsb\n\t" - : "=&c" (d0), "=&S" (d1), "=&D" (d2) - :"0" (n), - "1" (src), - "2" (dest) - :"memory"); - } else { - if((src + n) < dest) - return memcpy(dest, src, n); - else - __asm__ __volatile__( - "std\n\t" - "rep\n\t" - "movsb\n\t" - "cld" - : "=&c" (d0), "=&S" (d1), "=&D" (d2) - :"0" (n), - "1" (n-1+src), - "2" (n-1+dest) - :"memory"); - } - - return dest; + int d0,d1,d2,d3,d4,d5; + char *ret = dest; + + __asm__ __volatile__( + /* Handle more 16bytes in loop */ + "cmp $0x10, %0\n\t" + "jb 1f\n\t" + + /* Decide forward/backward copy mode */ + "cmp %2, %1\n\t" + "jb 2f\n\t" + + /* + * movs instruction have many startup latency + * so we handle small size by general register. + */ + "cmp $680, %0\n\t" + "jb 3f\n\t" + /* + * movs instruction is only good for aligned case. + */ + "mov %1, %3\n\t" + "xor %2, %3\n\t" + "and $0xff, %3\n\t" + "jz 4f\n\t" + "3:\n\t" + "sub $0x10, %0\n\t" + + /* + * We gobble 16byts forward in each loop. 
+ */ + "3:\n\t" + "sub $0x10, %0\n\t" + "mov 0*4(%1), %3\n\t" + "mov 1*4(%1), %4\n\t" + "mov %3, 0*4(%2)\n\t" + "mov %4, 1*4(%2)\n\t" + "mov 2*4(%1), %3\n\t" + "mov 3*4(%1), %4\n\t" + "mov %3, 2*4(%2)\n\t" + "mov %4, 3*4(%2)\n\t" + "lea 0x10(%1), %1\n\t" + "lea 0x10(%2), %2\n\t" + "jae 3b\n\t" + "add $0x10, %0\n\t" + "jmp 1f\n\t" + + /* + * Handle data forward by movs. + */ + ".p2align 4\n\t" + "4:\n\t" + "mov -4(%1, %0), %3\n\t" + "lea -4(%2, %0), %4\n\t" + "shr $2, %0\n\t" + "rep movsl\n\t" + "mov %3, (%4)\n\t" + "jmp 11f\n\t" + /* + * Handle data backward by movs. + */ + ".p2align 4\n\t" + "6:\n\t" + "mov (%1), %3\n\t" + "mov %2, %4\n\t" + "lea -4(%1, %0), %1\n\t" + "lea -4(%2, %0), %2\n\t" + "shr $2, %0\n\t" + "std\n\t" + "rep movsl\n\t" + "mov %3,(%4)\n\t" + "cld\n\t" + "jmp 11f\n\t" + + /* + * Start to prepare for backward copy. + */ + ".p2align 4\n\t" + "2:\n\t" + "cmp $680, %0\n\t" + "jb 5f\n\t" + "mov %1, %3\n\t" + "xor %2, %3\n\t" + "and $0xff, %3\n\t" + "jz 6b\n\t" + + /* + * Calculate copy position to tail. + */ + "5:\n\t" + "add %0, %1\n\t" + "add %0, %2\n\t" + "sub $0x10, %0\n\t" + + /* + * We gobble 16byts backward in each loop. + */ + "7:\n\t" + "sub $0x10, %0\n\t" + + "mov -1*4(%1), %3\n\t" + "mov -2*4(%1), %4\n\t" + "mov %3, -1*4(%2)\n\t" + "mov %4, -2*4(%2)\n\t" + "mov -3*4(%1), %3\n\t" + "mov -4*4(%1), %4\n\t" + "mov %3, -3*4(%2)\n\t" + "mov %4, -4*4(%2)\n\t" + "lea -0x10(%1), %1\n\t" + "lea -0x10(%2), %2\n\t" + "jae 7b\n\t" + /* + * Calculate copy position to head. + */ + "add $0x10, %0\n\t" + "sub %0, %1\n\t" + "sub %0, %2\n\t" + + /* + * Move data from 8 bytes to 15 bytes. + */ + ".p2align 4\n\t" + "1:\n\t" + "cmp $8, %0\n\t" + "jb 8f\n\t" + "mov 0*4(%1), %3\n\t" + "mov 1*4(%1), %4\n\t" + "mov -2*4(%1, %0), %5\n\t" + "mov -1*4(%1, %0), %1\n\t" + + "mov %3, 0*4(%2)\n\t" + "mov %4, 1*4(%2)\n\t" + "mov %5, -2*4(%2, %0)\n\t" + "mov %1, -1*4(%2, %0)\n\t" + "jmp 11f\n\t" + + /* + * Move data from 4 bytes to 7 bytes. + */ + ".p2align 4\n\t" + "8:\n\t" + "cmp $4, %0\n\t" + "jb 9f\n\t" + "mov 0*4(%1), %3\n\t" + "mov -1*4(%1, %0), %4\n\t" + "mov %3, 0*4(%2)\n\t" + "mov %4, -1*4(%2, %0)\n\t" + "jmp 11f\n\t" + + /* + * Move data from 2 bytes to 3 bytes. + */ + ".p2align 4\n\t" + "9:\n\t" + "cmp $2, %0\n\t" + "jb 10f\n\t" + "movw 0*2(%1), %%dx\n\t" + "movw -1*2(%1, %0), %%bx\n\t" + "movw %%dx, 0*2(%2)\n\t" + "movw %%bx, -1*2(%2, %0)\n\t" + "jmp 11f\n\t" + + /* + * Move data for 1 byte. 
+ */ + ".p2align 4\n\t" + "10:\n\t" + "cmp $1, %0\n\t" + "jb 11f\n\t" + "movb (%1), %%cl\n\t" + "movb %%cl, (%2)\n\t" + ".p2align 4\n\t" + "11:" + : "=&c" (d0), "=&S" (d1), "=&D" (d2), + "=r" (d3),"=r" (d4), "=r"(d5) + :"0" (n), + "1" (src), + "2" (dest) + :"memory"); + + return ret; + } EXPORT_SYMBOL(memmove); diff --git a/arch/x86/lib/memmove_64.c b/arch/x86/lib/memmove_64.c index ecacc4b3d9e..6d0f0ec41b3 100644 --- a/arch/x86/lib/memmove_64.c +++ b/arch/x86/lib/memmove_64.c @@ -8,50 +8,185 @@ #undef memmove void *memmove(void *dest, const void *src, size_t count) { - unsigned long d0, d1, d2, d3; - if (dest < src) { - if ((dest + count) < src) - return memcpy(dest, src, count); - else - __asm__ __volatile__( - "movq %0, %3\n\t" - "shr $3, %0\n\t" - "andq $7, %3\n\t" - "rep\n\t" - "movsq\n\t" - "movq %3, %0\n\t" - "rep\n\t" - "movsb" - : "=&c" (d0), "=&S" (d1), "=&D" (d2), "=r" (d3) - :"0" (count), - "1" (src), - "2" (dest) - :"memory"); - } else { - if((src + count) < dest) - return memcpy(dest, src, count); - else - __asm__ __volatile__( - "movq %0, %3\n\t" - "lea -8(%1, %0), %1\n\t" - "lea -8(%2, %0), %2\n\t" - "shr $3, %0\n\t" - "andq $7, %3\n\t" - "std\n\t" - "rep\n\t" - "movsq\n\t" - "lea 7(%1), %1\n\t" - "lea 7(%2), %2\n\t" - "movq %3, %0\n\t" - "rep\n\t" - "movsb\n\t" - "cld" - : "=&c" (d0), "=&S" (d1), "=&D" (d2), "=r" (d3) - :"0" (count), - "1" (src), - "2" (dest) - :"memory"); - } - return dest; + unsigned long d0,d1,d2,d3,d4,d5,d6,d7; + char *ret; + + __asm__ __volatile__( + /* Handle more 32bytes in loop */ + "mov %2, %3\n\t" + "cmp $0x20, %0\n\t" + "jb 1f\n\t" + + /* Decide forward/backward copy mode */ + "cmp %2, %1\n\t" + "jb 2f\n\t" + + /* + * movsq instruction have many startup latency + * so we handle small size by general register. + */ + "cmp $680, %0\n\t" + "jb 3f\n\t" + /* + * movsq instruction is only good for aligned case. + */ + "cmpb %%dil, %%sil\n\t" + "je 4f\n\t" + "3:\n\t" + "sub $0x20, %0\n\t" + /* + * We gobble 32byts forward in each loop. + */ + "5:\n\t" + "sub $0x20, %0\n\t" + "movq 0*8(%1), %4\n\t" + "movq 1*8(%1), %5\n\t" + "movq 2*8(%1), %6\n\t" + "movq 3*8(%1), %7\n\t" + "leaq 4*8(%1), %1\n\t" + + "movq %4, 0*8(%2)\n\t" + "movq %5, 1*8(%2)\n\t" + "movq %6, 2*8(%2)\n\t" + "movq %7, 3*8(%2)\n\t" + "leaq 4*8(%2), %2\n\t" + "jae 5b\n\t" + "addq $0x20, %0\n\t" + "jmp 1f\n\t" + /* + * Handle data forward by movsq. + */ + ".p2align 4\n\t" + "4:\n\t" + "movq %0, %8\n\t" + "movq -8(%1, %0), %4\n\t" + "lea -8(%2, %0), %5\n\t" + "shrq $3, %8\n\t" + "rep movsq\n\t" + "movq %4, (%5)\n\t" + "jmp 13f\n\t" + /* + * Handle data backward by movsq. + */ + ".p2align 4\n\t" + "7:\n\t" + "movq %0, %8\n\t" + "movq (%1), %4\n\t" + "movq %2, %5\n\t" + "leaq -8(%1, %0), %1\n\t" + "leaq -8(%2, %0), %2\n\t" + "shrq $3, %8\n\t" + "std\n\t" + "rep movsq\n\t" + "cld\n\t" + "movq %4, (%5)\n\t" + "jmp 13f\n\t" + + /* + * Start to prepare for backward copy. + */ + ".p2align 4\n\t" + "2:\n\t" + "cmp $680, %0\n\t" + "jb 6f \n\t" + "cmp %%dil, %%sil\n\t" + "je 7b \n\t" + "6:\n\t" + /* + * Calculate copy position to tail. + */ + "addq %0, %1\n\t" + "addq %0, %2\n\t" + "subq $0x20, %0\n\t" + /* + * We gobble 32byts backward in each loop. + */ + "8:\n\t" + "subq $0x20, %0\n\t" + "movq -1*8(%1), %4\n\t" + "movq -2*8(%1), %5\n\t" + "movq -3*8(%1), %6\n\t" + "movq -4*8(%1), %7\n\t" + "leaq -4*8(%1), %1\n\t" + + "movq %4, -1*8(%2)\n\t" + "movq %5, -2*8(%2)\n\t" + "movq %6, -3*8(%2)\n\t" + "movq %7, -4*8(%2)\n\t" + "leaq -4*8(%2), %2\n\t" + "jae 8b\n\t" + /* + * Calculate copy position to head. 
+ */ + "addq $0x20, %0\n\t" + "subq %0, %1\n\t" + "subq %0, %2\n\t" + "1:\n\t" + "cmpq $16, %0\n\t" + "jb 9f\n\t" + /* + * Move data from 16 bytes to 31 bytes. + */ + "movq 0*8(%1), %4\n\t" + "movq 1*8(%1), %5\n\t" + "movq -2*8(%1, %0), %6\n\t" + "movq -1*8(%1, %0), %7\n\t" + "movq %4, 0*8(%2)\n\t" + "movq %5, 1*8(%2)\n\t" + "movq %6, -2*8(%2, %0)\n\t" + "movq %7, -1*8(%2, %0)\n\t" + "jmp 13f\n\t" + ".p2align 4\n\t" + "9:\n\t" + "cmpq $8, %0\n\t" + "jb 10f\n\t" + /* + * Move data from 8 bytes to 15 bytes. + */ + "movq 0*8(%1), %4\n\t" + "movq -1*8(%1, %0), %5\n\t" + "movq %4, 0*8(%2)\n\t" + "movq %5, -1*8(%2, %0)\n\t" + "jmp 13f\n\t" + "10:\n\t" + "cmpq $4, %0\n\t" + "jb 11f\n\t" + /* + * Move data from 4 bytes to 7 bytes. + */ + "movl (%1), %4d\n\t" + "movl -4(%1, %0), %5d\n\t" + "movl %4d, (%2)\n\t" + "movl %5d, -4(%2, %0)\n\t" + "jmp 13f\n\t" + "11:\n\t" + "cmp $2, %0\n\t" + "jb 12f\n\t" + /* + * Move data from 2 bytes to 3 bytes. + */ + "movw (%1), %4w\n\t" + "movw -2(%1, %0), %5w\n\t" + "movw %4w, (%2)\n\t" + "movw %5w, -2(%2, %0)\n\t" + "jmp 13f\n\t" + "12:\n\t" + "cmp $1, %0\n\t" + "jb 13f\n\t" + /* + * Move data for 1 byte. + */ + "movb (%1), %4b\n\t" + "movb %4b, (%2)\n\t" + "13:\n\t" + : "=&d" (d0), "=&S" (d1), "=&D" (d2), "=&a" (ret) , + "=r"(d3), "=r"(d4), "=r"(d5), "=r"(d6), "=&c" (d7) + :"0" (count), + "1" (src), + "2" (dest) + :"memory"); + + return ret; + } EXPORT_SYMBOL(memmove); -- cgit v1.2.3-18-g5258 From b365a85c68161ea5db5476eb8845a91ceb1777ea Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 29 Sep 2010 10:41:05 +0200 Subject: x86, UV: Use allocated buffer in tlb_uv.c:tunables_read() The original code didn't check that the value returned from snprintf() was less than the size of the buffer. Although it didn't cause a runtime bug in this case, it makes the static checkers complain. Andrew Morton suggested a dynamically sized buffer would be cleaner. 
Suggested-by: Andrew Morton Signed-off-by: Dan Carpenter Cc: Cliff Wickman Cc: Jack Steiner Cc: Robin Holt LKML-Reference: <20100929083118.GA6376@bicker> Signed-off-by: Ingo Molnar --- arch/x86/kernel/tlb_uv.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index 312ef029281..33e77e4f7c8 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -1001,10 +1001,10 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data) static ssize_t tunables_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { - char buf[300]; + char *buf; int ret; - ret = snprintf(buf, 300, "%s %s %s\n%d %d %d %d %d %d %d %d %d\n", + buf = kasprintf(GFP_KERNEL, "%s %s %s\n%d %d %d %d %d %d %d %d %d\n", "max_bau_concurrent plugged_delay plugsb4reset", "timeoutsb4reset ipi_reset_limit complete_threshold", "congested_response_us congested_reps congested_period", @@ -1012,7 +1012,12 @@ static ssize_t tunables_read(struct file *file, char __user *userbuf, timeoutsb4reset, ipi_reset_limit, complete_threshold, congested_response_us, congested_reps, congested_period); - return simple_read_from_buffer(userbuf, count, ppos, buf, ret); + if (!buf) + return -ENOMEM; + + ret = simple_read_from_buffer(userbuf, count, ppos, buf, strlen(buf)); + kfree(buf); + return ret; } /* -- cgit v1.2.3-18-g5258 From 40c6b3cb64cd1d02322df5f729cca25084580f40 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 29 Sep 2010 10:46:46 -0400 Subject: oprofile, x86: Using struct stack_frame for 64bit processes dump Removing unnecessary struct frame_head and replacing it with struct stack_frame. The struct stack_frame is already defined and used in other places in kernel, so there's no reason to define new structure. 
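As background, not part of the patch: the { next_frame, return_address } layout that struct stack_frame describes is the ordinary frame-pointer chain, and the same walk can be sketched in a few lines of user-space C, assuming x86-64 code built with frame pointers (-fno-omit-frame-pointer):

#include <stdio.h>
#include <stdint.h>

/* Same shape as the kernel's struct stack_frame: the saved frame
 * pointer, then the return address pushed by the call instruction. */
struct frame {
	struct frame *next_frame;
	uintptr_t return_address;
};

static __attribute__((noinline)) void backtrace_here(void)
{
	struct frame *fp = __builtin_frame_address(0);

	/* Frame pointers should strictly progress back up the stack
	 * (towards higher addresses), the same check the kernel applies. */
	while (fp && fp->next_frame > fp) {
		printf("return address: %#lx\n",
		       (unsigned long)fp->return_address);
		fp = fp->next_frame;
	}
}

static __attribute__((noinline)) void level2(void) { backtrace_here(); }
static __attribute__((noinline)) void level1(void) { level2(); }

int main(void)
{
	level1();
	return 0;
}

The kernel version differs in that it reads the frames from user memory, so each step goes through access_ok() and __copy_from_user_inatomic() as in the diff below.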
Signed-off-by: Jiri Olsa Signed-off-by: Robert Richter --- arch/x86/oprofile/backtrace.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c index 3855096c59b..d640a86198b 100644 --- a/arch/x86/oprofile/backtrace.c +++ b/arch/x86/oprofile/backtrace.c @@ -48,35 +48,30 @@ static struct stacktrace_ops backtrace_ops = { .walk_stack = print_context_stack, }; -struct frame_head { - struct frame_head *bp; - unsigned long ret; -} __attribute__((packed)); - -static struct frame_head *dump_user_backtrace(struct frame_head *head) +static struct stack_frame *dump_user_backtrace(struct stack_frame *head) { - struct frame_head bufhead[2]; + struct stack_frame bufhead[2]; - /* Also check accessibility of one struct frame_head beyond */ + /* Also check accessibility of one struct stack_frame beyond */ if (!access_ok(VERIFY_READ, head, sizeof(bufhead))) return NULL; if (__copy_from_user_inatomic(bufhead, head, sizeof(bufhead))) return NULL; - oprofile_add_trace(bufhead[0].ret); + oprofile_add_trace(bufhead[0].return_address); /* frame pointers should strictly progress back up the stack * (towards higher addresses) */ - if (head >= bufhead[0].bp) + if (head >= bufhead[0].next_frame) return NULL; - return bufhead[0].bp; + return bufhead[0].next_frame; } void x86_backtrace(struct pt_regs * const regs, unsigned int depth) { - struct frame_head *head = (struct frame_head *)frame_pointer(regs); + struct stack_frame *head = (struct stack_frame *)frame_pointer(regs); if (!user_mode_vm(regs)) { unsigned long stack = kernel_stack_pointer(regs); -- cgit v1.2.3-18-g5258 From f6dedecc37164a58bb80ae2ed9d204669ffc4850 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 29 Sep 2010 10:46:47 -0400 Subject: oprofile, x86: Adding backtrace dump for 32bit process in compat mode This patch implements the oprofile backtrace generation for 32 bit applications running in the 64bit environment (compat mode). With this change it's possible to get backtrace for 32bits applications under the 64bits environment using oprofile's callgraph options. opcontrol --setup -c ... opreport -l -cg ... 
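As a rough sketch of what the compat walker has to deal with (illustration only; the field widths follow the kernel's struct stack_frame_ia32, the addresses are invented): a 32-bit process stores 4-byte frame pointers and return addresses, so the 64-bit kernel reads u32 fields and widens the pointer before following it.

#include <stdint.h>
#include <stdio.h>

/* layout of one saved frame in a 32-bit (compat) process */
struct stack_frame_ia32 {
	uint32_t next_frame;		/* 32-bit user frame pointer */
	uint32_t return_address;	/* 32-bit user return address */
};

int main(void)
{
	struct stack_frame_ia32 frame = { 0x0804f000u, 0x08048123u };

	/* a 64-bit kernel widens the 32-bit pointer (compat_ptr() in the
	 * patch) before following it to the next frame */
	uint64_t next = (uint64_t)frame.next_frame;

	printf("return address %#x, next frame at %#llx\n",
	       frame.return_address, (unsigned long long)next);
	return 0;
}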
Signed-off-by: Jiri Olsa Signed-off-by: Robert Richter --- arch/x86/oprofile/backtrace.c | 53 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c index d640a86198b..2d49d4e19a3 100644 --- a/arch/x86/oprofile/backtrace.c +++ b/arch/x86/oprofile/backtrace.c @@ -14,6 +14,7 @@ #include #include #include +#include static void backtrace_warning_symbol(void *data, char *msg, unsigned long symbol) @@ -48,6 +49,55 @@ static struct stacktrace_ops backtrace_ops = { .walk_stack = print_context_stack, }; +#ifdef CONFIG_COMPAT +static struct stack_frame_ia32 * +dump_user_backtrace_32(struct stack_frame_ia32 *head) +{ + struct stack_frame_ia32 bufhead[2]; + struct stack_frame_ia32 *fp; + + /* Also check accessibility of one struct frame_head beyond */ + if (!access_ok(VERIFY_READ, head, sizeof(bufhead))) + return NULL; + if (__copy_from_user_inatomic(bufhead, head, sizeof(bufhead))) + return NULL; + + fp = (struct stack_frame_ia32 *) compat_ptr(bufhead[0].next_frame); + + oprofile_add_trace(bufhead[0].return_address); + + /* frame pointers should strictly progress back up the stack + * (towards higher addresses) */ + if (head >= fp) + return NULL; + + return fp; +} + +static inline int +x86_backtrace_32(struct pt_regs * const regs, unsigned int depth) +{ + struct stack_frame_ia32 *head; + + /* User process is 32-bit */ + if (!current || !test_thread_flag(TIF_IA32)) + return 0; + + head = (struct stack_frame_ia32 *) regs->bp; + while (depth-- && head) + head = dump_user_backtrace_32(head); + + return 1; +} + +#else +static inline int +x86_backtrace_32(struct pt_regs * const regs, unsigned int depth) +{ + return 0; +} +#endif /* CONFIG_COMPAT */ + static struct stack_frame *dump_user_backtrace(struct stack_frame *head) { struct stack_frame bufhead[2]; @@ -81,6 +131,9 @@ x86_backtrace(struct pt_regs * const regs, unsigned int depth) return; } + if (x86_backtrace_32(regs, depth)) + return; + while (depth-- && head) head = dump_user_backtrace(head); } -- cgit v1.2.3-18-g5258 From 5140434d5f82f2e2119926272ada2e9731ec04f1 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 30 Sep 2010 18:55:47 +0200 Subject: oprofile, x86: Simplify init/exit functions Now, that we only call the exit function if init succeeds with commit: 979048e oprofile: don't call arch exit code from init code on failure we can simplify the x86 init/exit functions too. Variable using_nmi becomes obsolete. 
Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index f1575c9a257..bd1489c3ce0 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -695,9 +695,6 @@ static int __init ppro_init(char **cpu_type) return 1; } -/* in order to get sysfs right */ -static int using_nmi; - int __init op_nmi_init(struct oprofile_operations *ops) { __u8 vendor = boot_cpu_data.x86_vendor; @@ -705,8 +702,6 @@ int __init op_nmi_init(struct oprofile_operations *ops) char *cpu_type = NULL; int ret = 0; - using_nmi = 0; - if (!cpu_has_apic) return -ENODEV; @@ -790,13 +785,11 @@ int __init op_nmi_init(struct oprofile_operations *ops) if (ret) return ret; - using_nmi = 1; printk(KERN_INFO "oprofile: using NMI interrupt.\n"); return 0; } void op_nmi_exit(void) { - if (using_nmi) - exit_sysfs(); + exit_sysfs(); } -- cgit v1.2.3-18-g5258 From 3fdbf004c1706480a7c7fac3c9d836fa6df20d7d Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Thu, 30 Sep 2010 14:32:35 +0200 Subject: x86, mtrr: Assume SYS_CFG[Tom2ForceMemTypeWB] exists on all future AMD CPUs Instead of adapting the CPU family check in amd_special_default_mtrr() for each new CPU family assume that all new AMD CPUs support the necessary bits in SYS_CFG MSR. Tom2Enabled is architectural (defined in APM Vol.2). Tom2ForceMemTypeWB is defined in all BKDGs starting with K8 NPT. In pre K8-NPT BKDG this bit is reserved (read as zero). W/o this adaption Linux would unnecessarily complain about bad MTRR settings on every new AMD CPU family, e.g. [ 0.000000] WARNING: BIOS bug: CPU MTRRs don't cover all of memory, losing 4863MB of RAM. Cc: stable@kernel.org # .32.x, .35.x Signed-off-by: Andreas Herrmann LKML-Reference: <20100930123235.GB20545@loge.amd.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mtrr/cleanup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index c5f59d07142..ac140c7be39 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -827,7 +827,7 @@ int __init amd_special_default_mtrr(void) if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) return 0; - if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11) + if (boot_cpu_data.x86 < 0xf) return 0; /* In case some hypervisor doesn't pass SYSCFG through: */ if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0) -- cgit v1.2.3-18-g5258 From 420b13b60a3e5c5dcc6ec290e131cf5fbc603d94 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Thu, 30 Sep 2010 14:33:58 +0200 Subject: x86, nmi: Support NMI watchdog on newer AMD CPU families CPU families 0x12, 0x14 and 0x15 support this functionality. Signed-off-by: Andreas Herrmann LKML-Reference: <20100930123357.GC20545@loge.amd.com> Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/cpu/perfctr-watchdog.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index fb329e9f849..d9f4ff8fcd6 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -700,11 +700,10 @@ static void probe_nmi_watchdog(void) { switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_AMD: - if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15 && - boot_cpu_data.x86 != 16 && boot_cpu_data.x86 != 17) - return; - wd_ops = &k7_wd_ops; - break; + if (boot_cpu_data.x86 == 6 || + (boot_cpu_data.x86 >= 0xf && boot_cpu_data.x86 <= 0x15)) + wd_ops = &k7_wd_ops; + return; case X86_VENDOR_INTEL: /* Work around where perfctr1 doesn't have a working enable * bit as described in the following errata: -- cgit v1.2.3-18-g5258 From 23588c38a84c9175c6668789b64ffba4651e5c6a Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Thu, 30 Sep 2010 14:36:28 +0200 Subject: x86, amd: Add support for CPUID topology extension of AMD CPUs Node information (ID, number of internal nodes) is provided via CPUID Fn8000_001e_ECX. See AMD CPUID Specification (Publication # 25481, Revision 2.34, September 2010). Signed-off-by: Andreas Herrmann LKML-Reference: <20100930123628.GD20545@loge.amd.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/amd.c | 50 ++++++++++++++++++++++++----------------------- 1 file changed, 26 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 0f0ace5d7db..7e6a37d2425 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -253,37 +253,41 @@ static int __cpuinit nearby_node(int apicid) #endif /* - * Fixup core topology information for AMD multi-node processors. - * Assumption: Number of cores in each internal node is the same. + * Fixup core topology information for + * (1) AMD multi-node processors + * Assumption: Number of cores in each internal node is the same. 
*/ #ifdef CONFIG_X86_HT -static void __cpuinit amd_fixup_dcm(struct cpuinfo_x86 *c) +static void __cpuinit amd_get_topology(struct cpuinfo_x86 *c) { - unsigned long long value; u32 nodes, cores_per_node; + u8 node_id; + unsigned long long value; int cpu = smp_processor_id(); - if (!cpu_has(c, X86_FEATURE_NODEID_MSR)) - return; - - /* fixup topology information only once for a core */ - if (cpu_has(c, X86_FEATURE_AMD_DCM)) + /* get information required for multi-node processors */ + if (cpu_has(c, X86_FEATURE_TOPOEXT)) { + value = cpuid_ecx(0x8000001e); + nodes = ((value >> 8) & 7) + 1; + node_id = value & 7; + } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) { + rdmsrl(MSR_FAM10H_NODE_ID, value); + nodes = ((value >> 3) & 7) + 1; + node_id = value & 7; + } else return; - rdmsrl(MSR_FAM10H_NODE_ID, value); + /* fixup multi-node processor information */ + if (nodes > 1) { + set_cpu_cap(c, X86_FEATURE_AMD_DCM); + cores_per_node = c->x86_max_cores / nodes; - nodes = ((value >> 3) & 7) + 1; - if (nodes == 1) - return; - - set_cpu_cap(c, X86_FEATURE_AMD_DCM); - cores_per_node = c->x86_max_cores / nodes; + /* store NodeID, use llc_shared_map to store sibling info */ + per_cpu(cpu_llc_id, cpu) = node_id; - /* store NodeID, use llc_shared_map to store sibling info */ - per_cpu(cpu_llc_id, cpu) = value & 7; - - /* fixup core id to be in range from 0 to (cores_per_node - 1) */ - c->cpu_core_id = c->cpu_core_id % cores_per_node; + /* core id to be in range from 0 to (cores_per_node - 1) */ + c->cpu_core_id = c->cpu_core_id % cores_per_node; + } } #endif @@ -304,9 +308,7 @@ static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c) c->phys_proc_id = c->initial_apicid >> bits; /* use socket ID also for last level cache */ per_cpu(cpu_llc_id, cpu) = c->phys_proc_id; - /* fixup topology information on multi-node processors */ - if ((c->x86 == 0x10) && (c->x86_model == 9)) - amd_fixup_dcm(c); + amd_get_topology(c); #endif } -- cgit v1.2.3-18-g5258 From 6057b4d331f19a3ea51aec463ea7839c128b3227 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Thu, 30 Sep 2010 14:38:57 +0200 Subject: x86, amd: Extract compute unit information for AMD CPUs Get compute unit information from CPUID Fn8000_001E_EBX. (See AMD CPUID Specification - publication # 25481, revision 2.34, September 2010.) Note that each core on a compute unit still has a core_id of its own. Signed-off-by: Andreas Herrmann LKML-Reference: <20100930123857.GE20545@loge.amd.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/processor.h | 2 ++ arch/x86/kernel/cpu/amd.c | 20 +++++++++++++++----- 2 files changed, 17 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 325b7bdbeba..69e80c2ec6c 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -110,6 +110,8 @@ struct cpuinfo_x86 { u16 phys_proc_id; /* Core id: */ u16 cpu_core_id; + /* Compute unit id */ + u8 compute_unit_id; /* Index into per_cpu list: */ u16 cpu_index; #endif diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 7e6a37d2425..70168ab88b7 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -256,21 +256,29 @@ static int __cpuinit nearby_node(int apicid) * Fixup core topology information for * (1) AMD multi-node processors * Assumption: Number of cores in each internal node is the same. 
+ * (2) AMD processors supporting compute units */ #ifdef CONFIG_X86_HT static void __cpuinit amd_get_topology(struct cpuinfo_x86 *c) { - u32 nodes, cores_per_node; + u32 nodes; u8 node_id; - unsigned long long value; int cpu = smp_processor_id(); /* get information required for multi-node processors */ if (cpu_has(c, X86_FEATURE_TOPOEXT)) { - value = cpuid_ecx(0x8000001e); - nodes = ((value >> 8) & 7) + 1; - node_id = value & 7; + u32 eax, ebx, ecx, edx; + + cpuid(0x8000001e, &eax, &ebx, &ecx, &edx); + nodes = ((ecx >> 8) & 7) + 1; + node_id = ecx & 7; + + /* get compute unit information */ + smp_num_siblings = ((ebx >> 8) & 3) + 1; + c->compute_unit_id = ebx & 0xff; } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) { + u64 value; + rdmsrl(MSR_FAM10H_NODE_ID, value); nodes = ((value >> 3) & 7) + 1; node_id = value & 7; @@ -279,6 +287,8 @@ static void __cpuinit amd_get_topology(struct cpuinfo_x86 *c) /* fixup multi-node processor information */ if (nodes > 1) { + u32 cores_per_node; + set_cpu_cap(c, X86_FEATURE_AMD_DCM); cores_per_node = c->x86_max_cores / nodes; -- cgit v1.2.3-18-g5258 From d4fbe4f03557e1fd4d9bbb3a1957aad560f39e96 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Thu, 30 Sep 2010 14:41:56 +0200 Subject: x86, amd: Use compute unit information to determine thread siblings This information is vital for different load balancing policies. Signed-off-by: Andreas Herrmann LKML-Reference: <20100930124156.GF20545@loge.amd.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/smpboot.c | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 8b3bfc4dd70..bc2cc444844 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -397,6 +397,19 @@ void __cpuinit smp_store_cpu_info(int id) identify_secondary_cpu(c); } +static void __cpuinit link_thread_siblings(int cpu1, int cpu2) +{ + struct cpuinfo_x86 *c1 = &cpu_data(cpu1); + struct cpuinfo_x86 *c2 = &cpu_data(cpu2); + + cpumask_set_cpu(cpu1, cpu_sibling_mask(cpu2)); + cpumask_set_cpu(cpu2, cpu_sibling_mask(cpu1)); + cpumask_set_cpu(cpu1, cpu_core_mask(cpu2)); + cpumask_set_cpu(cpu2, cpu_core_mask(cpu1)); + cpumask_set_cpu(cpu1, c2->llc_shared_map); + cpumask_set_cpu(cpu2, c1->llc_shared_map); +} + void __cpuinit set_cpu_sibling_map(int cpu) { @@ -409,14 +422,13 @@ void __cpuinit set_cpu_sibling_map(int cpu) for_each_cpu(i, cpu_sibling_setup_mask) { struct cpuinfo_x86 *o = &cpu_data(i); - if (c->phys_proc_id == o->phys_proc_id && - c->cpu_core_id == o->cpu_core_id) { - cpumask_set_cpu(i, cpu_sibling_mask(cpu)); - cpumask_set_cpu(cpu, cpu_sibling_mask(i)); - cpumask_set_cpu(i, cpu_core_mask(cpu)); - cpumask_set_cpu(cpu, cpu_core_mask(i)); - cpumask_set_cpu(i, c->llc_shared_map); - cpumask_set_cpu(cpu, o->llc_shared_map); + if (cpu_has(c, X86_FEATURE_TOPOEXT)) { + if (c->phys_proc_id == o->phys_proc_id && + c->compute_unit_id == o->compute_unit_id) + link_thread_siblings(cpu, i); + } else if (c->phys_proc_id == o->phys_proc_id && + c->cpu_core_id == o->cpu_core_id) { + link_thread_siblings(cpu, i); } } } else { -- cgit v1.2.3-18-g5258 From 5c80cc78de46aef6cd5e714208da05c3f7f548f8 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Thu, 30 Sep 2010 14:43:16 +0200 Subject: x86, amd_nb: Enable GART support for AMD family 0x15 CPUs AMD CPU family 0x15 still supports GART for compatibility reasons. Signed-off-by: Andreas Herrmann LKML-Reference: <20100930124316.GG20545@loge.amd.com> Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/amd_nb.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 4ffc38df7ac..8f6463d8ed0 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -15,6 +15,7 @@ static u32 *flush_words; struct pci_device_id k8_nb_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_MISC) }, {} }; EXPORT_SYMBOL(k8_nb_ids); @@ -45,7 +46,8 @@ int cache_k8_northbridges(void) k8_northbridges.num++; /* some CPU families (e.g. family 0x11) do not support GART */ - if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10) + if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 || + boot_cpu_data.x86 == 0x15) k8_northbridges.gart_supported = 1; k8_northbridges.nb_misc = kmalloc((k8_northbridges.num + 1) * -- cgit v1.2.3-18-g5258 From 3bb9808e99bcc36eecb8e082bf70efb2a0bcdcb7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Sep 2010 12:46:02 +0000 Subject: x86: Use genirq Kconfig Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra LKML-Reference: <20100927121843.314600915@linutronix.de> Reviewed-by: H. Peter Anvin Reviewed-by: Ingo Molnar --- arch/x86/Kconfig | 36 +++++------------------------------- 1 file changed, 5 insertions(+), 31 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cea0cd9a316..3ec657f7ee7 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -59,6 +59,11 @@ config X86 select ANON_INODES select HAVE_ARCH_KMEMCHECK select HAVE_USER_RETURN_NOTIFIER + select HAVE_GENERIC_HARDIRQS + select HAVE_SPARSE_IRQ + select NUMA_IRQ_DESC if (SPARSE_IRQ && NUMA) + select GENERIC_IRQ_PROBE + select GENERIC_PENDING_IRQ if SMP config INSTRUCTION_DECODER def_bool (KPROBES || PERF_EVENTS) @@ -200,20 +205,6 @@ config HAVE_INTEL_TXT def_bool y depends on EXPERIMENTAL && DMAR && ACPI -# Use the generic interrupt handling code in kernel/irq/: -config GENERIC_HARDIRQS - def_bool y - -config GENERIC_HARDIRQS_NO__DO_IRQ - def_bool y - -config GENERIC_IRQ_PROBE - def_bool y - -config GENERIC_PENDING_IRQ - def_bool y - depends on GENERIC_HARDIRQS && SMP - config USE_GENERIC_SMP_HELPERS def_bool y depends on SMP @@ -296,23 +287,6 @@ config X86_X2APIC If you don't know what to do here, say N. -config SPARSE_IRQ - bool "Support sparse irq numbering" - depends on PCI_MSI || HT_IRQ - ---help--- - This enables support for sparse irqs. This is useful for distro - kernels that want to define a high CONFIG_NR_CPUS value but still - want to have low kernel memory footprint on smaller machines. - - ( Sparse IRQs can also be beneficial on NUMA boxes, as they spread - out the irq_desc[] array in a more NUMA-friendly way. ) - - If you don't know what to do here, say N. - -config NUMA_IRQ_DESC - def_bool y - depends on SPARSE_IRQ && NUMA - config X86_MPPARSE bool "Enable MPS table" if ACPI default y -- cgit v1.2.3-18-g5258 From 366d4a43b1a7a861c356a0e407c4f03f454d42ea Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 4 Oct 2010 09:31:27 +0200 Subject: x86, cpu: Fix X86_FEATURE_NOPL ba0593bf553c450a03dbc5f8c1f0ff58b778a0c8 cleared the aforementioned cpuid bit only on 32-bit due to various problems with Virtual PC. This somehow got lost during the 32- + 64-bit merge so restore the feature bit on 64-bit. For that, set it explicitly for non-constant arguments of cpu_has(). 
Update comment for future reference. Signed-off-by: Borislav Petkov LKML-Reference: <20101004073127.GA20305@liondog.tnic> Cc: Ryan O'Neill Cc: Linus Torvalds Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/common.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index f2f9ac7da25..927e630aeaf 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -704,16 +704,21 @@ void __init early_cpu_init(void) } /* - * The NOPL instruction is supposed to exist on all CPUs with - * family >= 6; unfortunately, that's not true in practice because - * of early VIA chips and (more importantly) broken virtualizers that - * are not easy to detect. In the latter case it doesn't even *fail* - * reliably, so probing for it doesn't even work. Disable it completely + * The NOPL instruction is supposed to exist on all CPUs of family >= 6; + * unfortunately, that's not true in practice because of early VIA + * chips and (more importantly) broken virtualizers that are not easy + * to detect. In the latter case it doesn't even *fail* reliably, so + * probing for it doesn't even work. Disable it completely on 32-bit * unless we can find a reliable way to detect all the broken cases. + * Enable it explicitly on 64-bit for non-constant inputs of cpu_has(). */ static void __cpuinit detect_nopl(struct cpuinfo_x86 *c) { +#ifdef CONFIG_X86_32 clear_cpu_cap(c, X86_FEATURE_NOPL); +#else + set_cpu_cap(c, X86_FEATURE_NOPL); +#endif } static void __cpuinit generic_identify(struct cpuinfo_x86 *c) -- cgit v1.2.3-18-g5258 From c62f981f9309486ba5546edbb5925f71e441fa65 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 1 Oct 2010 14:18:47 -0700 Subject: perf, gcc-4.6: Fix set but unused variable Just dead code I believe. Signed-off-by: Andi Kleen Signed-off-by: Andrew Morton Cc: andi@firstfloor.org Cc: Peter Zijlstra Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 6526a86616a..e2513f26ba8 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1152,7 +1152,6 @@ static int x86_pmu_handle_irq(struct pt_regs *regs) struct perf_sample_data data; struct cpu_hw_events *cpuc; struct perf_event *event; - struct hw_perf_event *hwc; int idx, handled = 0; u64 val; @@ -1173,7 +1172,6 @@ static int x86_pmu_handle_irq(struct pt_regs *regs) } event = cpuc->events[idx]; - hwc = &event->hw; val = x86_perf_event_update(event); if (val & (1ULL << (x86_pmu.cntval_bits - 1))) -- cgit v1.2.3-18-g5258 From 9f4c13964b58608fbce05540743281ea3146c0e8 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 5 Oct 2010 16:05:14 -0700 Subject: x86, memblock: Fix crashkernel allocation Cai Qian found crashkernel is broken with the x86 memblock changes. 1. crashkernel=128M@32M always reported that range is used, even if the first kernel is small and does not usethat range 2. we always got following report when using "kexec -p" Could not find a free area of memory of a000 bytes... locate_hole failed The root cause is that generic memblock_find_in_range() will try to allocate from the top of the range, whereas the kexec code was written assuming that allocation was always near the bottom and that it could blindly extend memory upward. 
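A toy model of that difference (illustration only, with a simplified allocator and made-up region bounds; the real memblock_find_in_range() is more involved):

#include <stdio.h>

static unsigned long find_top_down(unsigned long start, unsigned long end,
				   unsigned long size, unsigned long align)
{
	unsigned long base = (end - size) & ~(align - 1);

	return base >= start ? base : 0;
}

int main(void)
{
	unsigned long start = 16UL << 20;	/* 16M, as in the patch */
	unsigned long end_hi = 0xffffffffUL;	/* old limit: effectively "anywhere" */
	unsigned long end_lo = 0x37ffffffUL;	/* DEFAULT_BZIMAGE_ADDR_MAX */
	unsigned long size = 128UL << 20;	/* crashkernel=128M */
	unsigned long align = 16UL << 20;

	printf("unbounded search picks %#lx (near the top of memory)\n",
	       find_top_down(start, end_hi, size, align));
	printf("capped search picks    %#lx (reachable by the kexec tools)\n",
	       find_top_down(start, end_lo, size, align));
	return 0;
}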
Unfortunately, the kexec code doesn't have a system for requesting the range that it really needs, so this is subject to probabilistic failures. This patch hacks around the problem by limiting the target range heuristically to below the traditional bzImage max range. This number is arbitrary and not always correct, and a much better result would be obtained by having kexec communicate this number based on the kernel header information and any appropriate command line options. Reported-and-Bisected-by: CAI Qian Signed-off-by: Yinghai Lu LKML-Reference: <4CABAF2A.5090501@kernel.org> Cc: Vivek Goyal Signed-off-by: H. Peter Anvin --- arch/x86/kernel/setup.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index bf89e0a59b8..b11a238b2e3 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -502,6 +502,7 @@ static inline unsigned long long get_total_mem(void) return total << PAGE_SHIFT; } +#define DEFAULT_BZIMAGE_ADDR_MAX 0x37FFFFFF static void __init reserve_crashkernel(void) { unsigned long long total_mem; @@ -519,8 +520,12 @@ static void __init reserve_crashkernel(void) if (crash_base <= 0) { const unsigned long long alignment = 16<<20; /* 16M */ - crash_base = memblock_find_in_range(alignment, ULONG_MAX, crash_size, - alignment); + /* + * kexec want bzImage is below DEFAULT_BZIMAGE_ADDR_MAX + */ + crash_base = memblock_find_in_range(alignment, + DEFAULT_BZIMAGE_ADDR_MAX, crash_size, alignment); + if (crash_base == MEMBLOCK_ERROR) { pr_info("crashkernel reservation failed - No suitable area found.\n"); return; @@ -528,8 +533,8 @@ static void __init reserve_crashkernel(void) } else { unsigned long long start; - start = memblock_find_in_range(crash_base, ULONG_MAX, crash_size, - 1<<20); + start = memblock_find_in_range(crash_base, + crash_base + crash_size, crash_size, 1<<20); if (start != crash_base) { pr_info("crashkernel reservation failed - memory is in use.\n"); return; -- cgit v1.2.3-18-g5258 From 1d931264af0f10649b35afa8fbd2e169da51ac08 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 5 Oct 2010 16:15:15 -0700 Subject: x86-32, memblock: Make add_highpages honor early reserved ranges Originally the only early reserved range that overlapped with high pages was "KVA RAM", and we already remove that from the active ranges. However, it turns out Xen can have that kind of overlap to support memory ballooning. So we need to make add_highpages_with_active_regions() subtract the memblock reserved ranges, just as is done for low RAM; this is the proper design anyway. In this patch, get_free_all_memory_range() is refactored so that it can be used by add_highpages_with_active_regions(). Also, we no longer need to remove "KVA RAM" from the active ranges. Signed-off-by: Yinghai Lu LKML-Reference: <4CABB183.1040607@kernel.org> Signed-off-by: H.
Peter Anvin --- arch/x86/include/asm/memblock.h | 2 ++ arch/x86/mm/init_32.c | 53 +++++++++++++---------------------------- arch/x86/mm/memblock.c | 19 +++++++++++---- arch/x86/mm/numa_32.c | 2 -- 4 files changed, 33 insertions(+), 43 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/memblock.h b/arch/x86/include/asm/memblock.h index 2c304bb6e07..19ae14ba697 100644 --- a/arch/x86/include/asm/memblock.h +++ b/arch/x86/include/asm/memblock.h @@ -9,6 +9,8 @@ void memblock_x86_to_bootmem(u64 start, u64 end); void memblock_x86_reserve_range(u64 start, u64 end, char *name); void memblock_x86_free_range(u64 start, u64 end); struct range; +int __get_free_all_memory_range(struct range **range, int nodeid, + unsigned long start_pfn, unsigned long end_pfn); int get_free_all_memory_range(struct range **rangep, int nodeid); void memblock_x86_register_active_regions(int nid, unsigned long start_pfn, diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index c2385d7ae31..85467099d6d 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -423,49 +423,28 @@ static void __init add_one_highpage_init(struct page *page) totalhigh_pages++; } -struct add_highpages_data { - unsigned long start_pfn; - unsigned long end_pfn; -}; - -static int __init add_highpages_work_fn(unsigned long start_pfn, - unsigned long end_pfn, void *datax) +void __init add_highpages_with_active_regions(int nid, + unsigned long start_pfn, unsigned long end_pfn) { - int node_pfn; - struct page *page; - unsigned long final_start_pfn, final_end_pfn; - struct add_highpages_data *data; + struct range *range; + int nr_range; + int i; - data = (struct add_highpages_data *)datax; + nr_range = __get_free_all_memory_range(&range, nid, start_pfn, end_pfn); - final_start_pfn = max(start_pfn, data->start_pfn); - final_end_pfn = min(end_pfn, data->end_pfn); - if (final_start_pfn >= final_end_pfn) - return 0; + for (i = 0; i < nr_range; i++) { + struct page *page; + int node_pfn; - for (node_pfn = final_start_pfn; node_pfn < final_end_pfn; - node_pfn++) { - if (!pfn_valid(node_pfn)) - continue; - page = pfn_to_page(node_pfn); - add_one_highpage_init(page); + for (node_pfn = range[i].start; node_pfn < range[i].end; + node_pfn++) { + if (!pfn_valid(node_pfn)) + continue; + page = pfn_to_page(node_pfn); + add_one_highpage_init(page); + } } - - return 0; - } - -void __init add_highpages_with_active_regions(int nid, unsigned long start_pfn, - unsigned long end_pfn) -{ - struct add_highpages_data data; - - data.start_pfn = start_pfn; - data.end_pfn = end_pfn; - - work_with_active_regions(nid, add_highpages_work_fn, &data); -} - #else static inline void permanent_kmaps_init(pgd_t *pgd_base) { diff --git a/arch/x86/mm/memblock.c b/arch/x86/mm/memblock.c index 50ecbc59757..fd7a0404945 100644 --- a/arch/x86/mm/memblock.c +++ b/arch/x86/mm/memblock.c @@ -156,7 +156,8 @@ static int __init count_early_node_map(int nodeid) return data.nr; } -int __init get_free_all_memory_range(struct range **rangep, int nodeid) +int __init __get_free_all_memory_range(struct range **rangep, int nodeid, + unsigned long start_pfn, unsigned long end_pfn) { int count; struct range *range; @@ -172,9 +173,9 @@ int __init get_free_all_memory_range(struct range **rangep, int nodeid) * at first */ nr_range = add_from_early_node_map(range, count, nr_range, nodeid); -#ifdef CONFIG_X86_32 - subtract_range(range, count, max_low_pfn, -1ULL); -#endif + subtract_range(range, count, 0, start_pfn); + subtract_range(range, count, end_pfn, -1ULL); + 
memblock_x86_subtract_reserved(range, count); nr_range = clean_sort_range(range, count); @@ -182,6 +183,16 @@ int __init get_free_all_memory_range(struct range **rangep, int nodeid) return nr_range; } +int __init get_free_all_memory_range(struct range **rangep, int nodeid) +{ + unsigned long end_pfn = -1UL; + +#ifdef CONFIG_X86_32 + end_pfn = max_low_pfn; +#endif + return __get_free_all_memory_range(rangep, nodeid, 0, end_pfn); +} + static u64 __init __memblock_x86_memory_in_range(u64 addr, u64 limit, bool get_free) { int i, count; diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 70ddeb75ba2..84a3e4c9f27 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -326,8 +326,6 @@ static __init unsigned long calculate_numa_remap_pages(void) "KVA RAM"); node_remap_start_pfn[nid] = node_kva_final>>PAGE_SHIFT; - remove_active_range(nid, node_remap_start_pfn[nid], - node_remap_start_pfn[nid] + size); } printk(KERN_INFO "Reserving total of %lx pages for numa KVA remap\n", reserve_pages); -- cgit v1.2.3-18-g5258 From 16c36f743bf8481d0ba40a6de0af11736095d7cf Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 4 Oct 2010 14:58:04 -0700 Subject: x86, memblock: Remove __memblock_x86_find_in_range_size() Fold it into memblock_x86_find_in_range(), and change bad_addr_size() to check_reserve_memblock(). So whole memblock_x86_find_in_range_size() code is more readable. Signed-off-by: Yinghai Lu LKML-Reference: <4CAA4DEC.4000401@kernel.org> Signed-off-by: H. Peter Anvin --- arch/x86/mm/memblock.c | 39 +++++++++++---------------------------- 1 file changed, 11 insertions(+), 28 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/memblock.c b/arch/x86/mm/memblock.c index fd7a0404945..aa1169392b8 100644 --- a/arch/x86/mm/memblock.c +++ b/arch/x86/mm/memblock.c @@ -8,7 +8,7 @@ #include /* Check for already reserved areas */ -static inline bool __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align) +static bool __init check_with_memblock_reserved_size(u64 *addrp, u64 *sizep, u64 align) { struct memblock_region *r; u64 addr = *addrp, last; @@ -30,7 +30,7 @@ again: goto again; } if (last <= (r->base + r->size) && addr >= r->base) { - (*sizep)++; + *sizep = 0; return false; } } @@ -41,29 +41,6 @@ again: return changed; } -static u64 __init __memblock_x86_find_in_range_size(u64 ei_start, u64 ei_last, u64 start, - u64 *sizep, u64 align) -{ - u64 addr, last; - - addr = round_up(ei_start, align); - if (addr < start) - addr = round_up(start, align); - if (addr >= ei_last) - goto out; - *sizep = ei_last - addr; - while (bad_addr_size(&addr, sizep, align) && addr + *sizep <= ei_last) - ; - last = addr + *sizep; - if (last > ei_last) - goto out; - - return addr; - -out: - return MEMBLOCK_ERROR; -} - /* * Find next free range after start, and size is returned in *sizep */ @@ -76,10 +53,16 @@ u64 __init memblock_x86_find_in_range_size(u64 start, u64 *sizep, u64 align) u64 ei_last = ei_start + r->size; u64 addr; - addr = __memblock_x86_find_in_range_size(ei_start, ei_last, start, - sizep, align); + addr = round_up(ei_start, align); + if (addr < start) + addr = round_up(start, align); + if (addr >= ei_last) + continue; + *sizep = ei_last - addr; + while (check_with_memblock_reserved_size(&addr, sizep, align)) + ; - if (addr != MEMBLOCK_ERROR) + if (*sizep) return addr; } -- cgit v1.2.3-18-g5258 From 161b0275e2311b8bd9609d5f32e2b703cf5d70a8 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 2 Sep 2010 19:35:22 -0700 Subject: x86, mm: Add RESERVE_BRK_ARRAY() helper This is useful 
when converting static arrays into boot-time brk allocated objects. Signed-off-by: Jeremy Fitzhardinge LKML-Reference: <4C805EEA.1080205@goop.org> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/setup.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index ef292c792d7..d6763b139a8 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -93,6 +93,11 @@ void *extend_brk(size_t size, size_t align); : : "i" (sz)); \ } +/* Helper for reserving space for arrays of things */ +#define RESERVE_BRK_ARRAY(type, name, entries) \ + type *name; \ + RESERVE_BRK(name, sizeof(type) * entries) + #ifdef __i386__ void __init i386_start_kernel(void); -- cgit v1.2.3-18-g5258 From df9ee29270c11dba7d0fe0b83ce47a4d8e8d2101 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 7 Oct 2010 14:08:55 +0100 Subject: Fix IRQ flag handling naming Fix the IRQ flag handling naming. In linux/irqflags.h under one configuration, it maps: local_irq_enable() -> raw_local_irq_enable() local_irq_disable() -> raw_local_irq_disable() local_irq_save() -> raw_local_irq_save() ... and under the other configuration, it maps: raw_local_irq_enable() -> local_irq_enable() raw_local_irq_disable() -> local_irq_disable() raw_local_irq_save() -> local_irq_save() ... This is quite confusing. There should be one set of names expected of the arch, and this should be wrapped to give another set of names that are expected by users of this facility. Change this to have the arch provide: flags = arch_local_save_flags() flags = arch_local_irq_save() arch_local_irq_restore(flags) arch_local_irq_disable() arch_local_irq_enable() arch_irqs_disabled_flags(flags) arch_irqs_disabled() arch_safe_halt() Then linux/irqflags.h wraps these to provide: raw_local_save_flags(flags) raw_local_irq_save(flags) raw_local_irq_restore(flags) raw_local_irq_disable() raw_local_irq_enable() raw_irqs_disabled_flags(flags) raw_irqs_disabled() raw_safe_halt() with type checking on the flags 'arguments', and then wraps those to provide: local_save_flags(flags) local_irq_save(flags) local_irq_restore(flags) local_irq_disable() local_irq_enable() irqs_disabled_flags(flags) irqs_disabled() safe_halt() with tracing included if enabled. The arch functions can now all be inline functions rather than some of them having to be macros. Signed-off-by: David Howells [X86, FRV, MN10300] Signed-off-by: Chris Metcalf [Tile] Signed-off-by: Michal Simek [Microblaze] Tested-by: Catalin Marinas [ARM] Acked-by: Thomas Gleixner Acked-by: Haavard Skinnemoen [AVR] Acked-by: Tony Luck [IA-64] Acked-by: Hirokazu Takata [M32R] Acked-by: Greg Ungerer [M68K/M68KNOMMU] Acked-by: Ralf Baechle [MIPS] Acked-by: Kyle McMartin [PA-RISC] Acked-by: Paul Mackerras [PowerPC] Acked-by: Martin Schwidefsky [S390] Acked-by: Chen Liqin [Score] Acked-by: Matt Fleming [SH] Acked-by: David S. 
Miller [Sparc] Acked-by: Chris Zankel [Xtensa] Reviewed-by: Richard Henderson [Alpha] Reviewed-by: Yoshinori Sato [H8300] Cc: starvik@axis.com [CRIS] Cc: jesper.nilsson@axis.com [CRIS] Cc: linux-cris-kernel@axis.com --- arch/x86/include/asm/irqflags.h | 32 ++++++++++++-------------------- arch/x86/include/asm/paravirt.h | 16 ++++++++-------- arch/x86/xen/spinlock.c | 2 +- 3 files changed, 21 insertions(+), 29 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h index 9e2b952f810..5745ce8bf10 100644 --- a/arch/x86/include/asm/irqflags.h +++ b/arch/x86/include/asm/irqflags.h @@ -61,22 +61,22 @@ static inline void native_halt(void) #else #ifndef __ASSEMBLY__ -static inline unsigned long __raw_local_save_flags(void) +static inline unsigned long arch_local_save_flags(void) { return native_save_fl(); } -static inline void raw_local_irq_restore(unsigned long flags) +static inline void arch_local_irq_restore(unsigned long flags) { native_restore_fl(flags); } -static inline void raw_local_irq_disable(void) +static inline void arch_local_irq_disable(void) { native_irq_disable(); } -static inline void raw_local_irq_enable(void) +static inline void arch_local_irq_enable(void) { native_irq_enable(); } @@ -85,7 +85,7 @@ static inline void raw_local_irq_enable(void) * Used in the idle loop; sti takes one instruction cycle * to complete: */ -static inline void raw_safe_halt(void) +static inline void arch_safe_halt(void) { native_safe_halt(); } @@ -102,12 +102,10 @@ static inline void halt(void) /* * For spinlocks, etc: */ -static inline unsigned long __raw_local_irq_save(void) +static inline unsigned long arch_local_irq_save(void) { - unsigned long flags = __raw_local_save_flags(); - - raw_local_irq_disable(); - + unsigned long flags = arch_local_save_flags(); + arch_local_irq_disable(); return flags; } #else @@ -153,22 +151,16 @@ static inline unsigned long __raw_local_irq_save(void) #endif /* CONFIG_PARAVIRT */ #ifndef __ASSEMBLY__ -#define raw_local_save_flags(flags) \ - do { (flags) = __raw_local_save_flags(); } while (0) - -#define raw_local_irq_save(flags) \ - do { (flags) = __raw_local_irq_save(); } while (0) - -static inline int raw_irqs_disabled_flags(unsigned long flags) +static inline int arch_irqs_disabled_flags(unsigned long flags) { return !(flags & X86_EFLAGS_IF); } -static inline int raw_irqs_disabled(void) +static inline int arch_irqs_disabled(void) { - unsigned long flags = __raw_local_save_flags(); + unsigned long flags = arch_local_save_flags(); - return raw_irqs_disabled_flags(flags); + return arch_irqs_disabled_flags(flags); } #else diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 5653f43d90e..499954c530d 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -105,7 +105,7 @@ static inline void write_cr8(unsigned long x) } #endif -static inline void raw_safe_halt(void) +static inline void arch_safe_halt(void) { PVOP_VCALL0(pv_irq_ops.safe_halt); } @@ -829,32 +829,32 @@ static __always_inline void arch_spin_unlock(struct arch_spinlock *lock) #define __PV_IS_CALLEE_SAVE(func) \ ((struct paravirt_callee_save) { func }) -static inline unsigned long __raw_local_save_flags(void) +static inline unsigned long arch_local_save_flags(void) { return PVOP_CALLEE0(unsigned long, pv_irq_ops.save_fl); } -static inline void raw_local_irq_restore(unsigned long f) +static inline void arch_local_irq_restore(unsigned long f) { PVOP_VCALLEE1(pv_irq_ops.restore_fl, f); } -static 
inline void raw_local_irq_disable(void) +static inline void arch_local_irq_disable(void) { PVOP_VCALLEE0(pv_irq_ops.irq_disable); } -static inline void raw_local_irq_enable(void) +static inline void arch_local_irq_enable(void) { PVOP_VCALLEE0(pv_irq_ops.irq_enable); } -static inline unsigned long __raw_local_irq_save(void) +static inline unsigned long arch_local_irq_save(void) { unsigned long f; - f = __raw_local_save_flags(); - raw_local_irq_disable(); + f = arch_local_save_flags(); + arch_local_irq_disable(); return f; } diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index e0500646585..23e061b9327 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -224,7 +224,7 @@ static noinline int xen_spin_lock_slow(struct arch_spinlock *lock, bool irq_enab goto out; } - flags = __raw_local_save_flags(); + flags = arch_local_save_flags(); if (irq_enable) { ADD_STATS(taken_slow_irqenable, 1); raw_local_irq_enable(); -- cgit v1.2.3-18-g5258 From a416e9e1dde0fbcf20cda59df284cc0dcf2aadc4 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 29 Sep 2010 23:29:48 +0900 Subject: x86-32: Fix sparse warning for the __PHYSICAL_MASK calculation On 32-bit non-PAE system, cast to 'phys_addr_t' truncates value before subtraction. Subtracting before cast produce same result but remove following warnings from sparse: arch/x86/include/asm/pgtable_types.h:255:38: warning: cast truncates bits from constant value (100000000 becomes 0) arch/x86/include/asm/pgtable_types.h:270:38: warning: cast truncates bits from constant value (100000000 becomes 0) arch/x86/include/asm/pgtable.h:127:32: warning: cast truncates bits from constant value (100000000 becomes 0) arch/x86/include/asm/pgtable.h:132:32: warning: cast truncates bits from constant value (100000000 becomes 0) arch/x86/include/asm/pgtable.h:344:31: warning: cast truncates bits from constant value (100000000 becomes 0) 64-bit or PAE machines will not be affected by this change. Signed-off-by: Namhyung Kim LKML-Reference: <1285770588-14065-1-git-send-email-namhyung@gmail.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/page_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index a667f24c725..1df66211fd1 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -8,7 +8,7 @@ #define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT) #define PAGE_MASK (~(PAGE_SIZE-1)) -#define __PHYSICAL_MASK ((phys_addr_t)(1ULL << __PHYSICAL_MASK_SHIFT) - 1) +#define __PHYSICAL_MASK ((phys_addr_t)((1ULL << __PHYSICAL_MASK_SHIFT) - 1)) #define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1) /* Cast PAGE_MASK to a signed type so that it is sign-extended if -- cgit v1.2.3-18-g5258 From 55572b293b3a5929e8c54bc91d14ae6264186bf6 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 7 Oct 2010 16:42:54 -0700 Subject: x86, mrst: A function in a header file needs to be marked "inline" A function in a header file needs to be explicitly marked "inline", or gcc will complain if it is not used. Signed-off-by: H. 
Peter Anvin Cc: Jacob Pan Cc: v2.6.36 LKML-Reference: <1274295685-6774-3-git-send-email-jacob.jun.pan@linux.intel.com> --- arch/x86/include/asm/mrst.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mrst.h b/arch/x86/include/asm/mrst.h index 16350740edf..33fc2966beb 100644 --- a/arch/x86/include/asm/mrst.h +++ b/arch/x86/include/asm/mrst.h @@ -26,7 +26,7 @@ enum mrst_cpu_type { }; extern enum mrst_cpu_type __mrst_cpu_chip; -static enum mrst_cpu_type mrst_identify_cpu(void) +static inline enum mrst_cpu_type mrst_identify_cpu(void) { return __mrst_cpu_chip; } -- cgit v1.2.3-18-g5258 From 68f4d5a00adaab33b136fce2c72d5c377b39b0b0 Mon Sep 17 00:00:00 2001 From: Zhao Yakui Date: Fri, 8 Oct 2010 09:47:33 +0800 Subject: x86, setup: Use string copy operation to optimze copy in kernel compression The kernel decompression code parses the ELF header and then copies the segment to the corresponding destination. Currently it uses slow byte-copy code. This patch makes it use the string copy operations instead. In the test the copy performance can be improved very significantly after using the string copy operation mechanism. 1. The copy time can be reduced from 150ms to 20ms on one Atom machine 2. The copy time can be reduced about 80% on another machine The time is reduced from 7ms to 1.5ms when using 32-bit kernel. The time is reduced from 10ms to 2ms when using 64-bit kernel. Signed-off-by: Zhao Yakui LKML-Reference: <1286502453-7043-1-git-send-email-yakui.zhao@intel.com> Signed-off-by: H. Peter Anvin --- arch/x86/boot/compressed/misc.c | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 8f7bef8e9ff..23f315c9f21 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -229,18 +229,35 @@ void *memset(void *s, int c, size_t n) ss[i] = c; return s; } - +#ifdef CONFIG_X86_32 void *memcpy(void *dest, const void *src, size_t n) { - int i; - const char *s = src; - char *d = dest; + int d0, d1, d2; + asm volatile( + "rep ; movsl\n\t" + "movl %4,%%ecx\n\t" + "rep ; movsb\n\t" + : "=&c" (d0), "=&D" (d1), "=&S" (d2) + : "0" (n >> 2), "g" (n & 3), "1" (dest), "2" (src) + : "memory"); - for (i = 0; i < n; i++) - d[i] = s[i]; return dest; } +#else +void *memcpy(void *dest, const void *src, size_t n) +{ + long d0, d1, d2; + asm volatile( + "rep ; movsq\n\t" + "movq %4,%%rcx\n\t" + "rep ; movsb\n\t" + : "=&c" (d0), "=&D" (d1), "=&S" (d2) + : "0" (n >> 3), "g" (n & 7), "1" (dest), "2" (src) + : "memory"); + return dest; +} +#endif static void error(char *x) { -- cgit v1.2.3-18-g5258 From f672b49b07a4a152fc4251f2aec6b4d05164c4cd Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 27 Sep 2010 22:05:55 +0200 Subject: x86: HWPOISON: Report correct address granuality for huge hwpoison faults An earlier patch fixed the hwpoison fault handling to encode the huge page size in the fault code of the page fault handler. This is needed to report this information in SIGBUS to user space. 
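On the receiving side, a process can pull both values out of the siginfo it gets; this is an illustrative userspace sketch only, assuming a glibc that exposes si_addr_lsb and BUS_MCEERR_AR, and nothing in it actually injects poison:

#define _GNU_SOURCE
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static void sigbus_handler(int sig, siginfo_t *si, void *ctx)
{
	(void)sig;
	(void)ctx;
	/* fprintf() is not async-signal-safe; good enough for a sketch */
	if (si->si_code == BUS_MCEERR_AR)
		fprintf(stderr, "poison at %p, 2^%d bytes affected\n",
			si->si_addr, si->si_addr_lsb);
	_exit(1);
}

int main(void)
{
	struct sigaction sa;

	memset(&sa, 0, sizeof(sa));
	sa.sa_sigaction = sigbus_handler;
	sa.sa_flags = SA_SIGINFO;
	sigaction(SIGBUS, &sa, NULL);

	pause();	/* wait; nothing here actually triggers SIGBUS */
	return 0;
}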
This is a straightforward patch to pass this information through to the signal handling in the x86-specific fault.c. Cc: x86@kernel.org Cc: Naoya Horiguchi Cc: fengguang.wu@intel.com Signed-off-by: Andi Kleen --- arch/x86/mm/fault.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 4c4508e8a20..1d15a27dd6f 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -11,6 +11,7 @@ #include /* __kprobes, ... */ #include /* kmmio_handler, ... */ #include /* perf_sw_event */ +#include /* hstate_index_to_shift */ #include /* dotraplinkage, ... */ #include /* pgd_*(), ... */ @@ -160,15 +161,20 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) static void force_sig_info_fault(int si_signo, int si_code, unsigned long address, - struct task_struct *tsk) + struct task_struct *tsk, int fault) { + unsigned lsb = 0; siginfo_t info; info.si_signo = si_signo; info.si_errno = 0; info.si_code = si_code; info.si_addr = (void __user *)address; - info.si_addr_lsb = si_code == BUS_MCEERR_AR ? PAGE_SHIFT : 0; + if (fault & VM_FAULT_HWPOISON_LARGE) + lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault)); + if (fault & VM_FAULT_HWPOISON) + lsb = PAGE_SHIFT; + info.si_addr_lsb = lsb; force_sig_info(si_signo, &info, tsk); } @@ -731,7 +737,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, tsk->thread.error_code = error_code | (address >= TASK_SIZE); tsk->thread.trap_no = 14; - force_sig_info_fault(SIGSEGV, si_code, address, tsk); + force_sig_info_fault(SIGSEGV, si_code, address, tsk, 0); return; } @@ -816,14 +822,14 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, tsk->thread.trap_no = 14; #ifdef CONFIG_MEMORY_FAILURE - if (fault & VM_FAULT_HWPOISON) { + if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) { printk(KERN_ERR "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n", tsk->comm, tsk->pid, address); code = BUS_MCEERR_AR; } #endif - force_sig_info_fault(SIGBUS, code, address, tsk); + force_sig_info_fault(SIGBUS, code, address, tsk, fault); } static noinline void @@ -833,7 +839,8 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code, if (fault & VM_FAULT_OOM) { out_of_memory(regs, error_code, address); } else { - if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON)) + if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON| + VM_FAULT_HWPOISON_LARGE)) do_sigbus(regs, error_code, address, fault); else BUG(); -- cgit v1.2.3-18-g5258 From 5a47c7dae861c3ca3edf178546641909851bf715 Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Mon, 13 Sep 2010 15:08:54 +0800 Subject: x86: Add two helper macros for fixed address mapping Sometimes fixmap will be used to map a physical address that is not PAGE aligned, so to use it we need to first map it and then add the address offset to the mapped fixed address. These two new helpers were suggested by Ingo Molnar to make the process simpler. For a physical address like "phys", a directly usable virtual address can be obtained with virt = (void *)set_fixmap_offset(fixed_idx, phys); or virt = (void *)set_fixmap_offset_nocache(fixed_idx, phys); (depending on whether the physical address is cacheable or not).
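The arithmetic behind the helpers, shown standalone with made-up numbers (illustration only; in the kernel the virtual base comes from fix_to_virt(idx)):

#include <stdio.h>

#define PAGE_SIZE 4096UL

int main(void)
{
	unsigned long phys = 0xff128abcUL;	/* not page aligned */
	unsigned long slot_virt = 0xffffc000UL;	/* stand-in for fix_to_virt(idx) */

	/* map the containing page, then add the sub-page offset back in */
	unsigned long virt = slot_virt + (phys & (PAGE_SIZE - 1));

	printf("phys %#lx -> virt %#lx (sub-page offset %#lx)\n",
	       phys, virt, phys & (PAGE_SIZE - 1));
	return 0;
}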
Signed-off-by: Feng Tang Cc: alan@linux.intel.com Cc: greg@kroah.com Cc: x86@kernel.org LKML-Reference: <1284361736-23011-3-git-send-email-feng.tang@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fixmap.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index d07b44f7d1d..4d293dced62 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -214,5 +214,20 @@ static inline unsigned long virt_to_fix(const unsigned long vaddr) BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START); return __virt_to_fix(vaddr); } + +/* Return an pointer with offset calculated */ +static inline unsigned long __set_fixmap_offset(enum fixed_addresses idx, + phys_addr_t phys, pgprot_t flags) +{ + __set_fixmap(idx, phys, flags); + return fix_to_virt(idx) + (phys & (PAGE_SIZE - 1)); +} + +#define set_fixmap_offset(idx, phys) \ + __set_fixmap_offset(idx, phys, PAGE_KERNEL) + +#define set_fixmap_offset_nocache(idx, phys) \ + __set_fixmap_offset(idx, phys, PAGE_KERNEL_NOCACHE) + #endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_FIXMAP_H */ -- cgit v1.2.3-18-g5258 From c20b5c3318fe45e4f33f01a91ccead645dfdf619 Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Mon, 13 Sep 2010 15:08:55 +0800 Subject: x86, earlyprintk: Add earlyprintk for Intel Moorestown platform The Intel Moorestown platform has a spi-uart device (Maxim3110) connected to a Designware spi core controller. This patch adds an early console based on it. As it will be used long before the Linux spi subsystem gets initialised, we simply manipulate the spi controller's registers directly to achieve the early console functionality. This is safe as it will be disabled when the device subsystem gets initialised. To use it, enable CONFIG_X86_MRST_EARLY_PRINTK in the kernel config and add "earlyprintk=mrst" to the kernel command line. Signed-off-by: Feng Tang Acked-by: Alan Cox Cc: greg@kroah.com LKML-Reference: <1284361736-23011-4-git-send-email-feng.tang@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.debug | 4 + arch/x86/include/asm/mrst.h | 5 + arch/x86/kernel/Makefile | 1 + arch/x86/kernel/early_printk.c | 7 ++ arch/x86/kernel/early_printk_mrst.c | 232 ++++++++++++++++++++++++++++++++++++ 5 files changed, 249 insertions(+) create mode 100644 arch/x86/kernel/early_printk_mrst.c (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 75085080b63..e5bb96b10f1 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -43,6 +43,10 @@ config EARLY_PRINTK with klogd/syslogd or the X server. You should normally N here, unless you want to debug such a crash.
+config EARLY_PRINTK_MRST + bool "Early printk for MRST platform support" + depends on EARLY_PRINTK && X86_MRST + config EARLY_PRINTK_DBGP bool "Early printk via EHCI debug port" depends on EARLY_PRINTK && PCI diff --git a/arch/x86/include/asm/mrst.h b/arch/x86/include/asm/mrst.h index 33fc2966beb..d0ea5bc505a 100644 --- a/arch/x86/include/asm/mrst.h +++ b/arch/x86/include/asm/mrst.h @@ -10,6 +10,9 @@ */ #ifndef _ASM_X86_MRST_H #define _ASM_X86_MRST_H + +#include + extern int pci_mrst_init(void); int __init sfi_parse_mrtc(struct sfi_table_header *table); @@ -42,4 +45,6 @@ extern enum mrst_timer_options mrst_timer_options; #define SFI_MTMR_MAX_NUM 8 #define SFI_MRTC_MAX 8 +extern struct console early_mrst_console; +extern void mrst_early_console_init(void); #endif /* _ASM_X86_MRST_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index fedf32a8c3e..3cd01d04613 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -85,6 +85,7 @@ obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o obj-$(CONFIG_KGDB) += kgdb.o obj-$(CONFIG_VM86) += vm86_32.o obj-$(CONFIG_EARLY_PRINTK) += early_printk.o +obj-$(CONFIG_EARLY_PRINTK_MRST) += early_printk_mrst.o obj-$(CONFIG_HPET_TIMER) += hpet.o obj-$(CONFIG_APB_TIMER) += apb_timer.o diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index fa99bae75ac..6082463768a 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -238,6 +239,12 @@ static int __init setup_early_printk(char *buf) #ifdef CONFIG_HVC_XEN if (!strncmp(buf, "xen", 3)) early_console_register(&xenboot_console, keep); +#endif +#ifdef CONFIG_X86_MRST_EARLY_PRINTK + if (!strncmp(buf, "mrst", 4)) { + mrst_early_console_init(); + early_console_register(&early_mrst_console, keep); + } #endif buf++; } diff --git a/arch/x86/kernel/early_printk_mrst.c b/arch/x86/kernel/early_printk_mrst.c new file mode 100644 index 00000000000..05d27e1e2b6 --- /dev/null +++ b/arch/x86/kernel/early_printk_mrst.c @@ -0,0 +1,232 @@ +/* + * early_printk_mrst.c - spi-uart early printk for Intel Moorestown platform + * + * Copyright (c) 2008-2010, Intel Corporation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. 
+ */ + +#include +#include +#include +#include +#include + +#include +#include +#include + +#define MRST_SPI_TIMEOUT 0x200000 +#define MRST_REGBASE_SPI0 0xff128000 +#define MRST_REGBASE_SPI1 0xff128400 +#define MRST_CLK_SPI0_REG 0xff11d86c + +/* Bit fields in CTRLR0 */ +#define SPI_DFS_OFFSET 0 + +#define SPI_FRF_OFFSET 4 +#define SPI_FRF_SPI 0x0 +#define SPI_FRF_SSP 0x1 +#define SPI_FRF_MICROWIRE 0x2 +#define SPI_FRF_RESV 0x3 + +#define SPI_MODE_OFFSET 6 +#define SPI_SCPH_OFFSET 6 +#define SPI_SCOL_OFFSET 7 +#define SPI_TMOD_OFFSET 8 +#define SPI_TMOD_TR 0x0 /* xmit & recv */ +#define SPI_TMOD_TO 0x1 /* xmit only */ +#define SPI_TMOD_RO 0x2 /* recv only */ +#define SPI_TMOD_EPROMREAD 0x3 /* eeprom read mode */ + +#define SPI_SLVOE_OFFSET 10 +#define SPI_SRL_OFFSET 11 +#define SPI_CFS_OFFSET 12 + +/* Bit fields in SR, 7 bits */ +#define SR_MASK 0x7f /* cover 7 bits */ +#define SR_BUSY (1 << 0) +#define SR_TF_NOT_FULL (1 << 1) +#define SR_TF_EMPT (1 << 2) +#define SR_RF_NOT_EMPT (1 << 3) +#define SR_RF_FULL (1 << 4) +#define SR_TX_ERR (1 << 5) +#define SR_DCOL (1 << 6) + +struct dw_spi_reg { + u32 ctrl0; + u32 ctrl1; + u32 ssienr; + u32 mwcr; + u32 ser; + u32 baudr; + u32 txfltr; + u32 rxfltr; + u32 txflr; + u32 rxflr; + u32 sr; + u32 imr; + u32 isr; + u32 risr; + u32 txoicr; + u32 rxoicr; + u32 rxuicr; + u32 msticr; + u32 icr; + u32 dmacr; + u32 dmatdlr; + u32 dmardlr; + u32 idr; + u32 version; + + /* Currently operates as 32 bits, though only the low 16 bits matter */ + u32 dr; +} __packed; + +#define dw_readl(dw, name) __raw_readl(&(dw)->name) +#define dw_writel(dw, name, val) __raw_writel((val), &(dw)->name) + +/* Default use SPI0 register for mrst, we will detect Penwell and use SPI1 */ +static unsigned long mrst_spi_paddr = MRST_REGBASE_SPI0; + +static u32 *pclk_spi0; +/* Always contains an accessable address, start with 0 */ +static struct dw_spi_reg *pspi; + +static struct kmsg_dumper dw_dumper; +static int dumper_registered; + +static void dw_kmsg_dump(struct kmsg_dumper *dumper, + enum kmsg_dump_reason reason, + const char *s1, unsigned long l1, + const char *s2, unsigned long l2) +{ + int i; + + /* When run to this, we'd better re-init the HW */ + mrst_early_console_init(); + + for (i = 0; i < l1; i++) + early_mrst_console.write(&early_mrst_console, s1 + i, 1); + for (i = 0; i < l2; i++) + early_mrst_console.write(&early_mrst_console, s2 + i, 1); +} + +/* Set the ratio rate to 115200, 8n1, IRQ disabled */ +static void max3110_write_config(void) +{ + u16 config; + + config = 0xc001; + dw_writel(pspi, dr, config); +} + +/* Translate char to a eligible word and send to max3110 */ +static void max3110_write_data(char c) +{ + u16 data; + + data = 0x8000 | c; + dw_writel(pspi, dr, data); +} + +void mrst_early_console_init(void) +{ + u32 ctrlr0 = 0; + u32 spi0_cdiv; + u32 freq; /* Freqency info only need be searched once */ + + /* Base clk is 100 MHz, the actual clk = 100M / (clk_divider + 1) */ + pclk_spi0 = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE, + MRST_CLK_SPI0_REG); + spi0_cdiv = ((*pclk_spi0) & 0xe00) >> 9; + freq = 100000000 / (spi0_cdiv + 1); + + if (mrst_identify_cpu() == MRST_CPU_CHIP_PENWELL) + mrst_spi_paddr = MRST_REGBASE_SPI1; + + pspi = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE, + mrst_spi_paddr); + + /* Disable SPI controller */ + dw_writel(pspi, ssienr, 0); + + /* Set control param, 8 bits, transmit only mode */ + ctrlr0 = dw_readl(pspi, ctrl0); + + ctrlr0 &= 0xfcc0; + ctrlr0 |= 0xf | (SPI_FRF_SPI << SPI_FRF_OFFSET) + | (SPI_TMOD_TO << 
SPI_TMOD_OFFSET); + dw_writel(pspi, ctrl0, ctrlr0); + + /* + * Change the spi0 clk to comply with 115200 bps, use 100000 to + * calculate the clk dividor to make the clock a little slower + * than real baud rate. + */ + dw_writel(pspi, baudr, freq/100000); + + /* Disable all INT for early phase */ + dw_writel(pspi, imr, 0x0); + + /* Set the cs to spi-uart */ + dw_writel(pspi, ser, 0x2); + + /* Enable the HW, the last step for HW init */ + dw_writel(pspi, ssienr, 0x1); + + /* Set the default configuration */ + max3110_write_config(); + + /* Register the kmsg dumper */ + if (!dumper_registered) { + dw_dumper.dump = dw_kmsg_dump; + kmsg_dump_register(&dw_dumper); + dumper_registered = 1; + } +} + +/* Slave select should be called in the read/write function */ +static void early_mrst_spi_putc(char c) +{ + unsigned int timeout; + u32 sr; + + timeout = MRST_SPI_TIMEOUT; + /* Early putc needs to make sure the TX FIFO is not full */ + while (--timeout) { + sr = dw_readl(pspi, sr); + if (!(sr & SR_TF_NOT_FULL)) + cpu_relax(); + else + break; + } + + if (!timeout) + pr_warning("MRST earlycon: timed out\n"); + else + max3110_write_data(c); +} + +/* Early SPI only uses polling mode */ +static void early_mrst_spi_write(struct console *con, const char *str, unsigned n) +{ + int i; + + for (i = 0; i < n && *str; i++) { + if (*str == '\n') + early_mrst_spi_putc('\r'); + early_mrst_spi_putc(*str); + str++; + } +} + +struct console early_mrst_console = { + .name = "earlymrst", + .write = early_mrst_spi_write, + .flags = CON_PRINTBUFFER, + .index = -1, +}; -- cgit v1.2.3-18-g5258 From 4d033556f1bfaa7604b951bfadd17aaf1b74e4c5 Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Mon, 13 Sep 2010 15:08:56 +0800 Subject: x86, earlyprintk: Add hsu early console for Intel Medfield platform Intel Medfield platform has a high speed UART device, which could act as a early console. To enable early printk of HSU console, simply add "earlyprintk=hsu" in kernel command line. 
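For illustration only, a complete boot line using this option might look roughly like the following; the console= and root= values are made-up placeholders and not part of this series:

    earlyprintk=hsu,keep console=ttyS0 root=/dev/mmcblk0p1

The optional ",keep" suffix is the existing setup_early_printk() flag that keeps the early console registered after the regular console has come up.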
Currently we put the code in the early_printk_mrst.c as it is also for Intel MID platforms like the mrst early console Signed-off-by: Feng Tang Acked-by: Alan Cox Cc: greg@kroah.com LKML-Reference: <1284361736-23011-5-git-send-email-feng.tang@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mrst.h | 3 ++ arch/x86/kernel/early_printk.c | 6 +++ arch/x86/kernel/early_printk_mrst.c | 89 ++++++++++++++++++++++++++++++++++++- 3 files changed, 97 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mrst.h b/arch/x86/include/asm/mrst.h index d0ea5bc505a..4a711a684b1 100644 --- a/arch/x86/include/asm/mrst.h +++ b/arch/x86/include/asm/mrst.h @@ -47,4 +47,7 @@ extern enum mrst_timer_options mrst_timer_options; extern struct console early_mrst_console; extern void mrst_early_console_init(void); + +extern struct console early_hsu_console; +extern void hsu_early_console_init(void); #endif /* _ASM_X86_MRST_H */ diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index 6082463768a..4572f25f932 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -245,6 +245,12 @@ static int __init setup_early_printk(char *buf) mrst_early_console_init(); early_console_register(&early_mrst_console, keep); } + + if (!strncmp(buf, "hsu", 3)) { + hsu_early_console_init(); + early_console_register(&early_hsu_console, keep); + } + #endif buf++; } diff --git a/arch/x86/kernel/early_printk_mrst.c b/arch/x86/kernel/early_printk_mrst.c index 05d27e1e2b6..65df603622b 100644 --- a/arch/x86/kernel/early_printk_mrst.c +++ b/arch/x86/kernel/early_printk_mrst.c @@ -1,5 +1,5 @@ /* - * early_printk_mrst.c - spi-uart early printk for Intel Moorestown platform + * early_printk_mrst.c - early consoles for Intel MID platforms * * Copyright (c) 2008-2010, Intel Corporation * @@ -9,9 +9,19 @@ * of the License. */ +/* + * This file implements two early consoles named mrst and hsu. + * mrst is based on Maxim3110 spi-uart device, it exists in both + * Moorestown and Medfield platforms, while hsu is based on a High + * Speed UART device which only exists in the Medfield platform + */ + +#include +#include #include #include #include +#include #include #include @@ -230,3 +240,80 @@ struct console early_mrst_console = { .flags = CON_PRINTBUFFER, .index = -1, }; + +/* + * Following is the early console based on Medfield HSU (High + * Speed UART) device. 
+ */ +#define HSU_PORT2_PADDR 0xffa28180 + +static void __iomem *phsu; + +void hsu_early_console_init(void) +{ + u8 lcr; + + phsu = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE, + HSU_PORT2_PADDR); + + /* Disable FIFO */ + writeb(0x0, phsu + UART_FCR); + + /* Set to default 115200 bps, 8n1 */ + lcr = readb(phsu + UART_LCR); + writeb((0x80 | lcr), phsu + UART_LCR); + writeb(0x18, phsu + UART_DLL); + writeb(lcr, phsu + UART_LCR); + writel(0x3600, phsu + UART_MUL*4); + + writeb(0x8, phsu + UART_MCR); + writeb(0x7, phsu + UART_FCR); + writeb(0x3, phsu + UART_LCR); + + /* Clear IRQ status */ + readb(phsu + UART_LSR); + readb(phsu + UART_RX); + readb(phsu + UART_IIR); + readb(phsu + UART_MSR); + + /* Enable FIFO */ + writeb(0x7, phsu + UART_FCR); +} + +#define BOTH_EMPTY (UART_LSR_TEMT | UART_LSR_THRE) + +static void early_hsu_putc(char ch) +{ + unsigned int timeout = 10000; /* 10ms */ + u8 status; + + while (--timeout) { + status = readb(phsu + UART_LSR); + if (status & BOTH_EMPTY) + break; + udelay(1); + } + + /* Only write the char when there was no timeout */ + if (timeout) + writeb(ch, phsu + UART_TX); +} + +static void early_hsu_write(struct console *con, const char *str, unsigned n) +{ + int i; + + for (i = 0; i < n && *str; i++) { + if (*str == '\n') + early_hsu_putc('\r'); + early_hsu_putc(*str); + str++; + } +} + +struct console early_hsu_console = { + .name = "earlyhsu", + .write = early_hsu_write, + .flags = CON_PRINTBUFFER, + .index = -1, +}; -- cgit v1.2.3-18-g5258 From 286e5b97eb22baab9d9a41ca76c6b933a484252c Mon Sep 17 00:00:00 2001 From: Paul Fox Date: Fri, 1 Oct 2010 18:17:19 +0100 Subject: x86, olpc: Don't retry EC commands forever Avoids a potential infinite loop. It was observed once, during an EC hacking/debugging session - not in regular operation. Signed-off-by: Daniel Drake Cc: dilinger@queued.net Cc: Signed-off-by: Ingo Molnar --- arch/x86/kernel/olpc.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c index 37c49934c78..74726130259 100644 --- a/arch/x86/kernel/olpc.c +++ b/arch/x86/kernel/olpc.c @@ -114,6 +114,7 @@ int olpc_ec_cmd(unsigned char cmd, unsigned char *inbuf, size_t inlen, unsigned long flags; int ret = -EIO; int i; + int restarts = 0; spin_lock_irqsave(&ec_lock, flags); @@ -169,7 +170,9 @@ restart: if (wait_on_obf(0x6c, 1)) { printk(KERN_ERR "olpc-ec: timeout waiting for" " EC to provide data!\n"); - goto restart; + if (restarts++ < 10) + goto restart; + goto err; } outbuf[i] = inb(0x68); pr_devel("olpc-ec: received 0x%x\n", outbuf[i]); -- cgit v1.2.3-18-g5258 From 6e9636693373d938aa3b13427be3d212f172ac06 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 8 Oct 2010 14:53:48 -0400 Subject: x86, iommu: Update header comments with appropriate naming The header comments diverged a bit from the implementation. Lets re-sync them. Signed-off-by: Konrad Rzeszutek Wilk LKML-Reference: <1286564028-2352-3-git-send-email-konrad.wilk@oracle.com> Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/iommu_table.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/iommu_table.h b/arch/x86/include/asm/iommu_table.h index df55a78888e..f229b13a5f3 100644 --- a/arch/x86/include/asm/iommu_table.h +++ b/arch/x86/include/asm/iommu_table.h @@ -1,4 +1,3 @@ - #ifndef _ASM_X86_IOMMU_TABLE_H #define _ASM_X86_IOMMU_TABLE_H @@ -60,7 +59,7 @@ struct iommu_table_entry { * and it will be run after the SWIOTLB and the other IOMMUs * that utilize this macro. If the IOMMU is detected (ie, the * detect routine returns a positive value), the other IOMMUs - * are also checked. You can use IOMMU_INIT_FINISH if you prefer + * are also checked. You can use IOMMU_INIT_POST_FINISH if you prefer * to stop detecting the other IOMMUs after yours has been detected. */ #define IOMMU_INIT_POST(_detect) \ @@ -80,9 +79,9 @@ struct iommu_table_entry { * d). Similar to the 'init', except that this gets called from pci_iommu_init * where we do have a memory allocator. * - * The _CONT vs the _EXIT differs in that the _CONT variant will + * The standard vs the _FINISH differs in that the _FINISH variant will * continue detecting other IOMMUs in the call list after the - * the detection routine returns a positive number. The _EXIT will + * the detection routine returns a positive number. The _FINISH will * stop the execution chain. Both will still call the 'init' and * 'late_init' functions if they are set. */ -- cgit v1.2.3-18-g5258 From 708ff2a0097b02d32d375b66996661f36cd4d6d1 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Wed, 29 Sep 2010 18:08:50 +0900 Subject: bitops: make asm-generic/bitops/find.h more generic asm-generic/bitops/find.h has the extern declarations of find_next_bit() and find_next_zero_bit() and the macro definitions of find_first_bit() and find_first_zero_bit(). It is only usable by the architectures which enables CONFIG_GENERIC_FIND_NEXT_BIT and disables CONFIG_GENERIC_FIND_FIRST_BIT. x86 and tile enable both CONFIG_GENERIC_FIND_NEXT_BIT and CONFIG_GENERIC_FIND_FIRST_BIT. These architectures cannot include asm-generic/bitops/find.h in their asm/bitops.h. So ifdefed extern declarations of find_first_bit and find_first_zero_bit() are put in linux/bitops.h. This makes asm-generic/bitops/find.h usable by these architectures and use it. Also this change is needed for the forthcoming duplicated extern declarations cleanup. Signed-off-by: Akinobu Mita Signed-off-by: Arnd Bergmann Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: x86@kernel.org Cc: Chris Metcalf --- arch/x86/include/asm/bitops.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index bafd80defa4..903683b07e4 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -440,6 +440,8 @@ static inline int fls(int x) #ifdef __KERNEL__ +#include + #include #define ARCH_HAS_FAST_MULTIPLIER 1 -- cgit v1.2.3-18-g5258 From 236260b90dd94516982ad67aa6f5449c4c37db7b Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 6 Oct 2010 15:52:29 -0700 Subject: memblock: Allow memblock_init to be called early The Xen setup code needs to call memblock_x86_reserve_range() very early, so allow it to initialize the memblock subsystem before doing so. The second memblock_init() is ignored. Signed-off-by: Jeremy Fitzhardinge Cc: Yinghai Lu Cc: Benjamin Herrenschmidt LKML-Reference: <4CACFDAD.3090900@goop.org> Signed-off-by: H. 
Peter Anvin --- arch/x86/xen/enlighten.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 7d46c844141..63b83ceebd1 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -1183,6 +1184,8 @@ asmlinkage void __init xen_start_kernel(void) local_irq_disable(); early_boot_irqs_off(); + memblock_init(); + xen_raw_console_write("mapping kernel into physical memory\n"); pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages); -- cgit v1.2.3-18-g5258 From 50f2d7f682f9c0ed58191d0982fe77888d59d162 Mon Sep 17 00:00:00 2001 From: Nikanth Karthikesan Date: Thu, 30 Sep 2010 17:34:10 +0530 Subject: x86, numa: Assign CPUs to nodes in round-robin manner on fake NUMA commit d9c2d5ac6af87b4491bff107113aaf16f6c2b2d9 "x86, numa: Use near(er) online node instead of roundrobin for NUMA" changed NUMA initialization on Intel to choose the nearest online node or first node. Fake NUMA would be better of with round-robin initialization, instead of the all CPUS on first node. Change the choice of first node, back to round-robin. For testing NUMA kernel behaviour without cpusets and NUMA aware applications, it would be better to have cpus in different nodes, rather than all in a single node. With cpusets migration of tasks scenarios cannot not be tested. I guess having it round-robin shouldn't affect the use cases for all cpus on the first node. The code comments in arch/x86/mm/numa_64.c:759 indicate that this used to be the case, which was changed by commit d9c2d5ac6. It changed from roundrobin to nearer or first node. And I couldn't find any reason for this change in its changelog. Signed-off-by: Nikanth Karthikesan Cc: David Rientjes Signed-off-by: Andrew Morton --- arch/x86/kernel/cpu/intel.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index b4389441efb..6d61786c2dd 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -284,9 +284,7 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) /* Don't do the funky fallback heuristics the AMD version employs for now. */ node = apicid_to_node[apicid]; - if (node == NUMA_NO_NODE) - node = first_node(node_online_map); - else if (!node_online(node)) { + if (node == NUMA_NO_NODE || !node_online(node)) { /* reuse the value from init_cpu_to_node() */ node = cpu_to_node(cpu); } -- cgit v1.2.3-18-g5258 From dab5fff14df2cd16eb1ad4c02e83915e1063fece Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Tue, 12 Oct 2010 09:09:37 +0800 Subject: acpi-cpufreq: fix a memleak when unloading driver We didn't free per_cpu(acfreq_data, cpu)->freq_table when acpi_freq driver is unloaded. Resulting in the following messages in /sys/kernel/debug/kmemleak: unreferenced object 0xf6450e80 (size 64): comm "modprobe", pid 1066, jiffies 4294677317 (age 19290.453s) hex dump (first 32 bytes): 00 00 00 00 e8 a2 24 00 01 00 00 00 00 9f 24 00 ......$.......$. 02 00 00 00 00 6a 18 00 03 00 00 00 00 35 0c 00 .....j.......5.. 
backtrace: [] kmemleak_alloc+0x27/0x50 [] __kmalloc+0xcf/0x110 [] acpi_cpufreq_cpu_init+0x1ee/0x4e4 [acpi_cpufreq] [] cpufreq_add_dev+0x142/0x3a0 [] sysdev_driver_register+0x97/0x110 [] cpufreq_register_driver+0x86/0x140 [] 0xf9dad080 [] do_one_initcall+0x30/0x160 [] sys_init_module+0x99/0x1e0 [] sysenter_do_call+0x12/0x26 [] 0xffffffff https://bugzilla.kernel.org/show_bug.cgi?id=15807#c21 Tested-by: Toralf Forster Signed-off-by: Zhang Rui Signed-off-by: Len Brown --- arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index cd8da247dda..a2baafb2fe6 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -701,6 +701,7 @@ static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy) per_cpu(acfreq_data, policy->cpu) = NULL; acpi_processor_unregister_performance(data->acpi_data, policy->cpu); + kfree(data->freq_table); kfree(data); } -- cgit v1.2.3-18-g5258 From b683de2b3cb17bb10fa6fd4af614dc75b5749fe0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Sep 2010 20:55:03 +0200 Subject: genirq: Query arch for number of early descriptors sparse irq sets up NR_IRQS_LEGACY irq descriptors and archs then go ahead and allocate more. Use the unused return value of arch_probe_nr_irqs() to let the architecture return the number of early allocations. Fix up all users. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index f1efebaf551..5aee1d1a306 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3880,7 +3880,7 @@ int __init arch_probe_nr_irqs(void) if (nr < nr_irqs) nr_irqs = nr; - return 0; + return NR_IRQS_LEGACY; } #endif -- cgit v1.2.3-18-g5258 From 1c9db52534a2c0e9776788cd34ccc193289fc18c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Sep 2010 16:46:51 +0200 Subject: pci: Convert msi to new irq_chip functions Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Acked-by: Jesse Barnes Cc: Benjamin Herrenschmidt Cc: "David S. 
Miller" Cc: Tony Luck Cc: Russell King --- arch/x86/kernel/apic/io_apic.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 7556eb7a1a4..b79938ff9bd 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3441,8 +3441,8 @@ ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) */ static struct irq_chip msi_chip = { .name = "PCI-MSI", - .unmask = unmask_msi_irq, - .mask = mask_msi_irq, + .irq_unmask = unmask_msi_irq, + .irq_mask = mask_msi_irq, .ack = ack_apic_edge, #ifdef CONFIG_SMP .set_affinity = set_msi_irq_affinity, @@ -3452,8 +3452,8 @@ static struct irq_chip msi_chip = { static struct irq_chip msi_ir_chip = { .name = "IR-PCI-MSI", - .unmask = unmask_msi_irq, - .mask = mask_msi_irq, + .irq_unmask = unmask_msi_irq, + .irq_mask = mask_msi_irq, #ifdef CONFIG_INTR_REMAP .ack = ir_ack_apic_edge, #ifdef CONFIG_SMP -- cgit v1.2.3-18-g5258 From 39431acb1a4c464e62471cb3058b8ffffb9244db Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Sep 2010 19:09:51 +0200 Subject: pci: Cleanup the irq_desc mess in msi Handing down irq_desc to msi just so that msi can access irq_desc.irq_data.msi_desc is a pretty stupid idea. The calling code can hand down a pointer to msi_desc so msi code does not need to know about the irq descriptor at all. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Acked-by: Jesse Barnes --- arch/x86/kernel/apic/io_apic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index b79938ff9bd..74bb027b517 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3383,14 +3383,14 @@ static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) cfg = desc->chip_data; - get_cached_msi_msg_desc(desc, &msg); + __get_cached_msi_msg(desc->irq_data.msi_desc, &msg); msg.data &= ~MSI_DATA_VECTOR_MASK; msg.data |= MSI_DATA_VECTOR(cfg->vector); msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; msg.address_lo |= MSI_ADDR_DEST_ID(dest); - write_msi_msg_desc(desc, &msg); + __write_msi_msg(desc->irq_data.msi_desc, &msg); return 0; } -- cgit v1.2.3-18-g5258 From 011d578fdadb64bcc1deedbb02216bfee6a9b4dc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Sep 2010 00:15:31 +0200 Subject: x86: Remove useless reinitialization of irq descriptors The descriptors are already initialized in exactly this way. 
Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- arch/x86/kernel/irqinit.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 990ae7cfc57..a91ab503e24 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -100,6 +100,8 @@ int vector_used_by_percpu_irq(unsigned int vector) void __init init_ISA_irqs(void) { + struct irq_chip *chip = legacy_pic->chip; + const char *name = chip->name; int i; #if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) @@ -107,19 +109,8 @@ void __init init_ISA_irqs(void) #endif legacy_pic->init(0); - /* - * 16 old-style INTA-cycle interrupts: - */ - for (i = 0; i < legacy_pic->nr_legacy_irqs; i++) { - struct irq_desc *desc = irq_to_desc(i); - - desc->status = IRQ_DISABLED; - desc->action = NULL; - desc->depth = 1; - - set_irq_chip_and_handler_name(i, &i8259A_chip, - handle_level_irq, "XT"); - } + for (i = 0; i < legacy_pic->nr_legacy_irqs; i++) + set_irq_chip_and_handler_name(i, chip, handle_level_irq, name); } void __init init_IRQ(void) -- cgit v1.2.3-18-g5258 From a3c08e5d80c54e32423efbba113b02942c91f726 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 8 Oct 2010 20:24:58 +0200 Subject: x86: Convert irq_chip access to new functions Before moving the irq chips to the new functions, fixup direct callers. The cpu offline irq fixup code needs to become generic and archs need to honour the "force" flag as an indicator, but that's for later. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- arch/x86/kernel/irq.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 91fd0c70a18..d765bdc4807 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -159,7 +159,7 @@ int show_interrupts(struct seq_file *p, void *v) seq_printf(p, "%*d: ", prec, i); for_each_online_cpu(j) seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); - seq_printf(p, " %8s", desc->chip->name); + seq_printf(p, " %8s", desc->irq_data.chip->name); seq_printf(p, "-%-8s", desc->name); if (action) { @@ -282,6 +282,7 @@ void fixup_irqs(void) unsigned int irq, vector; static int warned; struct irq_desc *desc; + struct irq_data *data; for_each_irq_desc(irq, desc) { int break_affinity = 0; @@ -296,7 +297,8 @@ void fixup_irqs(void) /* interrupt's are disabled at this point */ raw_spin_lock(&desc->lock); - affinity = desc->affinity; + data = &desc->irq_data; + affinity = data->affinity; if (!irq_has_action(irq) || cpumask_equal(affinity, cpu_online_mask)) { raw_spin_unlock(&desc->lock); @@ -315,16 +317,16 @@ void fixup_irqs(void) affinity = cpu_all_mask; } - if (!(desc->status & IRQ_MOVE_PCNTXT) && desc->chip->mask) - desc->chip->mask(irq); + if (!(desc->status & IRQ_MOVE_PCNTXT) && data->chip->irq_mask) + data->chip->irq_mask(data); - if (desc->chip->set_affinity) - desc->chip->set_affinity(irq, affinity); + if (data->chip->irq_set_affinity) + data->chip->irq_set_affinity(data, affinity, true); else if (!(warned++)) set_affinity = 0; - if (!(desc->status & IRQ_MOVE_PCNTXT) && desc->chip->unmask) - desc->chip->unmask(irq); + if (!(desc->status & IRQ_MOVE_PCNTXT) && data->chip->irq_unmask) + data->chip->irq_unmask(data); raw_spin_unlock(&desc->lock); @@ -355,10 +357,10 @@ void fixup_irqs(void) if (irr & (1 << (vector % 32))) { irq = __get_cpu_var(vector_irq)[vector]; - desc = irq_to_desc(irq); + data = irq_get_irq_data(irq); 
raw_spin_lock(&desc->lock); - if (desc->chip->retrigger) - desc->chip->retrigger(irq); + if (data->chip->irq_retrigger) + data->chip->irq_retrigger(data); raw_spin_unlock(&desc->lock); } } -- cgit v1.2.3-18-g5258 From a5ef2e70405c8a9ee380b5ff33a008c75454791f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Sep 2010 11:11:10 +0200 Subject: x86: Sanitize apb timer interrupt handling Disable the interrupt in CPU_DEAD where it belongs. Remove the open coded irq_desc manipulation. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Jacob Pan --- arch/x86/kernel/apb_timer.c | 54 ++++++++++++++++++++------------------------- 1 file changed, 24 insertions(+), 30 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c index 08f75fb4f50..42a70a2accc 100644 --- a/arch/x86/kernel/apb_timer.c +++ b/arch/x86/kernel/apb_timer.c @@ -231,34 +231,6 @@ static void apbt_restart_clocksource(struct clocksource *cs) apbt_start_counter(phy_cs_timer_id); } -/* Setup IRQ routing via IOAPIC */ -#ifdef CONFIG_SMP -static void apbt_setup_irq(struct apbt_dev *adev) -{ - struct irq_chip *chip; - struct irq_desc *desc; - - /* timer0 irq has been setup early */ - if (adev->irq == 0) - return; - desc = irq_to_desc(adev->irq); - chip = get_irq_chip(adev->irq); - disable_irq(adev->irq); - desc->status |= IRQ_MOVE_PCNTXT; - irq_set_affinity(adev->irq, cpumask_of(adev->cpu)); - /* APB timer irqs are set up as mp_irqs, timer is edge triggerred */ - set_irq_chip_and_handler_name(adev->irq, chip, handle_edge_irq, "edge"); - enable_irq(adev->irq); - if (system_state == SYSTEM_BOOTING) - if (request_irq(adev->irq, apbt_interrupt_handler, - IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING, - adev->name, adev)) { - printk(KERN_ERR "Failed request IRQ for APBT%d\n", - adev->num); - } -} -#endif - static void apbt_enable_int(int n) { unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL); @@ -334,6 +306,27 @@ static int __init apbt_clockevent_register(void) } #ifdef CONFIG_SMP + +static void apbt_setup_irq(struct apbt_dev *adev) +{ + /* timer0 irq has been setup early */ + if (adev->irq == 0) + return; + + if (system_state == SYSTEM_BOOTING) { + irq_modify_status(adev->irq, 0, IRQ_MOVE_PCNTXT); + /* APB timer irqs are set up as mp_irqs, timer is edge type */ + __set_irq_handler(adev->irq, handle_edge_irq, 0, "edge"); + if (request_irq(adev->irq, apbt_interrupt_handler, + IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING, + adev->name, adev)) { + printk(KERN_ERR "Failed request IRQ for APBT%d\n", + adev->num); + } + } else + enable_irq(adev->irq); +} + /* Should be called with per cpu */ void apbt_setup_secondary_clock(void) { @@ -389,10 +382,11 @@ static int apbt_cpuhp_notify(struct notifier_block *n, switch (action & 0xf) { case CPU_DEAD: + disable_irq(adev->irq); apbt_disable_int(cpu); - if (system_state == SYSTEM_RUNNING) + if (system_state == SYSTEM_RUNNING) { pr_debug("skipping APBT CPU %lu offline\n", cpu); - else if (adev) { + } else if (adev) { pr_debug("APBT clockevent for cpu %lu offline\n", cpu); free_irq(adev->irq, adev); } -- cgit v1.2.3-18-g5258 From fe25c7fc2e036e1569faac8715a8aa5496cda78d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Sep 2010 14:57:24 +0200 Subject: x86: lguest: Convert to new irq chip functions Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Rusty Russell --- arch/x86/lguest/boot.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lguest/boot.c 
b/arch/x86/lguest/boot.c index 9d5f5584845..2d4e6fcac83 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -791,22 +791,22 @@ static void lguest_flush_tlb_kernel(void) * simple as setting a bit. We don't actually "ack" interrupts as such, we * just mask and unmask them. I wonder if we should be cleverer? */ -static void disable_lguest_irq(unsigned int irq) +static void disable_lguest_irq(struct irq_data *data) { - set_bit(irq, lguest_data.blocked_interrupts); + set_bit(data->irq, lguest_data.blocked_interrupts); } -static void enable_lguest_irq(unsigned int irq) +static void enable_lguest_irq(struct irq_data *data) { - clear_bit(irq, lguest_data.blocked_interrupts); + clear_bit(data->irq, lguest_data.blocked_interrupts); } /* This structure describes the lguest IRQ controller. */ static struct irq_chip lguest_irq_controller = { .name = "lguest", - .mask = disable_lguest_irq, - .mask_ack = disable_lguest_irq, - .unmask = enable_lguest_irq, + .irq_mask = disable_lguest_irq, + .irq_mask_ack = disable_lguest_irq, + .irq_unmask = enable_lguest_irq, }; /* -- cgit v1.2.3-18-g5258 From 020dd984d7c0792e8234f2e4b4fb0534fe750f9d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Sep 2010 14:59:58 +0200 Subject: x86: Cleanup visws interrupt handling Remove the open coded access to irq_desc and convert to the new irq chip functions. Change the mask function of piix4_virtual_irq_type so we can use the generic irq handling function for the virtual interrupt instead of open coding it. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- arch/x86/kernel/visws_quirks.c | 140 +++++++++++++---------------------------- 1 file changed, 44 insertions(+), 96 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c index e680ea52db9..3371bd053b8 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/kernel/visws_quirks.c @@ -66,10 +66,7 @@ static void __init visws_time_init(void) } /* Replaces the default init_ISA_irqs in the generic setup */ -static void __init visws_pre_intr_init(void) -{ - init_VISWS_APIC_irqs(); -} +static void __init visws_pre_intr_init(void); /* Quirk for machine specific memory setup. */ @@ -429,67 +426,34 @@ static int is_co_apic(unsigned int irq) /* * This is the SGI Cobalt (IO-)APIC: */ - -static void enable_cobalt_irq(unsigned int irq) +static void enable_cobalt_irq(struct irq_data *data) { - co_apic_set(is_co_apic(irq), irq); + co_apic_set(is_co_apic(data->irq), data->irq); } -static void disable_cobalt_irq(unsigned int irq) +static void disable_cobalt_irq(struct irq_data *data) { - int entry = is_co_apic(irq); + int entry = is_co_apic(data->irq); co_apic_write(CO_APIC_LO(entry), CO_APIC_MASK); co_apic_read(CO_APIC_LO(entry)); } -/* - * "irq" really just serves to identify the device. Here is where we - * map this to the Cobalt APIC entry where it's physically wired. 
- * This is called via request_irq -> setup_irq -> irq_desc->startup() - */ -static unsigned int startup_cobalt_irq(unsigned int irq) +static void ack_cobalt_irq(struct irq_data *data) { unsigned long flags; - struct irq_desc *desc = irq_to_desc(irq); spin_lock_irqsave(&cobalt_lock, flags); - if ((desc->status & (IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING))) - desc->status &= ~(IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING); - enable_cobalt_irq(irq); - spin_unlock_irqrestore(&cobalt_lock, flags); - return 0; -} - -static void ack_cobalt_irq(unsigned int irq) -{ - unsigned long flags; - - spin_lock_irqsave(&cobalt_lock, flags); - disable_cobalt_irq(irq); + disable_cobalt_irq(data); apic_write(APIC_EOI, APIC_EIO_ACK); spin_unlock_irqrestore(&cobalt_lock, flags); } -static void end_cobalt_irq(unsigned int irq) -{ - unsigned long flags; - struct irq_desc *desc = irq_to_desc(irq); - - spin_lock_irqsave(&cobalt_lock, flags); - if (!(desc->status & (IRQ_DISABLED | IRQ_INPROGRESS))) - enable_cobalt_irq(irq); - spin_unlock_irqrestore(&cobalt_lock, flags); -} - static struct irq_chip cobalt_irq_type = { - .name = "Cobalt-APIC", - .startup = startup_cobalt_irq, - .shutdown = disable_cobalt_irq, - .enable = enable_cobalt_irq, - .disable = disable_cobalt_irq, - .ack = ack_cobalt_irq, - .end = end_cobalt_irq, + .name = "Cobalt-APIC", + .irq_enable = enable_cobalt_irq, + .irq_disable = disable_cobalt_irq, + .irq_ack = ack_cobalt_irq, }; @@ -503,35 +467,34 @@ static struct irq_chip cobalt_irq_type = { * interrupt controller type, and through a special virtual interrupt- * controller. Device drivers only see the virtual interrupt sources. */ -static unsigned int startup_piix4_master_irq(unsigned int irq) +static unsigned int startup_piix4_master_irq(struct irq_data *data) { legacy_pic->init(0); - - return startup_cobalt_irq(irq); + enable_cobalt_irq(data); } -static void end_piix4_master_irq(unsigned int irq) +static void end_piix4_master_irq(struct irq_data *data) { unsigned long flags; spin_lock_irqsave(&cobalt_lock, flags); - enable_cobalt_irq(irq); + enable_cobalt_irq(data); spin_unlock_irqrestore(&cobalt_lock, flags); } static struct irq_chip piix4_master_irq_type = { - .name = "PIIX4-master", - .startup = startup_piix4_master_irq, - .ack = ack_cobalt_irq, - .end = end_piix4_master_irq, + .name = "PIIX4-master", + .irq_startup = startup_piix4_master_irq, + .irq_ack = ack_cobalt_irq, }; +static void pii4_mask(struct irq_data *data) { } static struct irq_chip piix4_virtual_irq_type = { - .name = "PIIX4-virtual", + .name = "PIIX4-virtual", + .mask = pii4_mask, }; - /* * PIIX4-8259 master/virtual functions to handle interrupt requests * from legacy devices: floppy, parallel, serial, rtc. @@ -549,9 +512,8 @@ static struct irq_chip piix4_virtual_irq_type = { */ static irqreturn_t piix4_master_intr(int irq, void *dev_id) { - int realirq; - struct irq_desc *desc; unsigned long flags; + int realirq; raw_spin_lock_irqsave(&i8259A_lock, flags); @@ -592,18 +554,10 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id) raw_spin_unlock_irqrestore(&i8259A_lock, flags); - desc = irq_to_desc(realirq); - /* * handle this 'virtual interrupt' as a Cobalt one now. 
*/ - kstat_incr_irqs_this_cpu(realirq, desc); - - if (likely(desc->action != NULL)) - handle_IRQ_event(realirq, desc->action); - - if (!(desc->status & IRQ_DISABLED)) - legacy_pic->chip->unmask(realirq); + generic_handle_irq(realirq); return IRQ_HANDLED; @@ -624,41 +578,35 @@ static struct irqaction cascade_action = { static inline void set_piix4_virtual_irq_type(void) { - piix4_virtual_irq_type.shutdown = i8259A_chip.mask; piix4_virtual_irq_type.enable = i8259A_chip.unmask; piix4_virtual_irq_type.disable = i8259A_chip.mask; + piix4_virtual_irq_type.unmask = i8259A_chip.unmask; } -void init_VISWS_APIC_irqs(void) +static void __init visws_pre_intr_init(void) { int i; - for (i = 0; i < CO_IRQ_APIC0 + CO_APIC_LAST + 1; i++) { - struct irq_desc *desc = irq_to_desc(i); - - desc->status = IRQ_DISABLED; - desc->action = 0; - desc->depth = 1; + set_piix4_virtual_irq_type(); - if (i == 0) { - desc->chip = &cobalt_irq_type; - } - else if (i == CO_IRQ_IDE0) { - desc->chip = &cobalt_irq_type; - } - else if (i == CO_IRQ_IDE1) { - desc->chip = &cobalt_irq_type; - } - else if (i == CO_IRQ_8259) { - desc->chip = &piix4_master_irq_type; - } - else if (i < CO_IRQ_APIC0) { - set_piix4_virtual_irq_type(); - desc->chip = &piix4_virtual_irq_type; - } - else if (IS_CO_APIC(i)) { - desc->chip = &cobalt_irq_type; - } + for (i = 0; i < CO_IRQ_APIC0 + CO_APIC_LAST + 1; i++) { + struct irq_chip *chip = NULL; + + if (i == 0) + chip = &cobalt_irq_type; + else if (i == CO_IRQ_IDE0) + chip = &cobalt_irq_type; + else if (i == CO_IRQ_IDE1) + >chip = &cobalt_irq_type; + else if (i == CO_IRQ_8259) + chip = &piix4_master_irq_type; + else if (i < CO_IRQ_APIC0) + chip = &piix4_virtual_irq_type; + else if (IS_CO_APIC(i)) + chip = &cobalt_irq_type; + + if (chip) + set_irq_chip(i, chip); } setup_irq(CO_IRQ_8259, &master_action); -- cgit v1.2.3-18-g5258 From 4305df947ca1fd52867c8d56837a4e6b1e33167c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Sep 2010 15:01:33 +0200 Subject: x86: i8259: Convert to new irq_chip functions Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- arch/x86/include/asm/i8259.h | 2 ++ arch/x86/kernel/apic/io_apic.c | 20 +++++++------- arch/x86/kernel/apic/nmi.c | 2 +- arch/x86/kernel/i8259.c | 63 +++++++++++++++++++++--------------------- arch/x86/kernel/smpboot.c | 4 +-- 5 files changed, 47 insertions(+), 44 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h index 1655147646a..a20365953bf 100644 --- a/arch/x86/include/asm/i8259.h +++ b/arch/x86/include/asm/i8259.h @@ -55,6 +55,8 @@ extern struct irq_chip i8259A_chip; struct legacy_pic { int nr_legacy_irqs; struct irq_chip *chip; + void (*mask)(unsigned int irq); + void (*unmask)(unsigned int irq); void (*mask_all)(void); void (*restore_mask)(void); void (*init)(int auto_eoi); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 74bb027b517..e5ae2a22262 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1459,7 +1459,7 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq ioapic_register_intr(irq, desc, trigger); if (irq < legacy_pic->nr_legacy_irqs) - legacy_pic->chip->mask(irq); + legacy_pic->mask(irq); ioapic_write_entry(apic_id, pin, entry); } @@ -2233,7 +2233,7 @@ static unsigned int startup_ioapic_irq(unsigned int irq) raw_spin_lock_irqsave(&ioapic_lock, flags); if (irq < legacy_pic->nr_legacy_irqs) { - legacy_pic->chip->mask(irq); + legacy_pic->mask(irq); if 
(legacy_pic->irq_pending(irq)) was_pending = 1; } @@ -2928,7 +2928,7 @@ static inline void __init check_timer(void) /* * get/set the timer IRQ vector: */ - legacy_pic->chip->mask(0); + legacy_pic->mask(0); assign_irq_vector(0, cfg, apic->target_cpus()); /* @@ -3000,7 +3000,7 @@ static inline void __init check_timer(void) if (timer_irq_works()) { if (nmi_watchdog == NMI_IO_APIC) { setup_nmi(); - legacy_pic->chip->unmask(0); + legacy_pic->unmask(0); } if (disable_timer_pin_1 > 0) clear_IO_APIC_pin(0, pin1); @@ -3023,14 +3023,14 @@ static inline void __init check_timer(void) */ replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2); setup_timer_IRQ0_pin(apic2, pin2, cfg->vector); - legacy_pic->chip->unmask(0); + legacy_pic->unmask(0); if (timer_irq_works()) { apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); timer_through_8259 = 1; if (nmi_watchdog == NMI_IO_APIC) { - legacy_pic->chip->mask(0); + legacy_pic->mask(0); setup_nmi(); - legacy_pic->chip->unmask(0); + legacy_pic->unmask(0); } goto out; } @@ -3038,7 +3038,7 @@ static inline void __init check_timer(void) * Cleanup, just in case ... */ local_irq_disable(); - legacy_pic->chip->mask(0); + legacy_pic->mask(0); clear_IO_APIC_pin(apic2, pin2); apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n"); } @@ -3057,14 +3057,14 @@ static inline void __init check_timer(void) lapic_register_intr(0, desc); apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */ - legacy_pic->chip->unmask(0); + legacy_pic->unmask(0); if (timer_irq_works()) { apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); goto out; } local_irq_disable(); - legacy_pic->chip->mask(0); + legacy_pic->mask(0); apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector); apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n"); diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c index a43f71cb30f..c90041ccb74 100644 --- a/arch/x86/kernel/apic/nmi.c +++ b/arch/x86/kernel/apic/nmi.c @@ -178,7 +178,7 @@ int __init check_nmi_watchdog(void) error: if (nmi_watchdog == NMI_IO_APIC) { if (!timer_through_8259) - legacy_pic->chip->mask(0); + legacy_pic->mask(0); on_each_cpu(__acpi_nmi_disable, NULL, 1); } diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index cafa7c80ac9..20757cb2efa 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -29,24 +29,10 @@ * plus some generic x86 specific things if generic specifics makes * any sense at all. 
*/ +static void init_8259A(int auto_eoi); static int i8259A_auto_eoi; DEFINE_RAW_SPINLOCK(i8259A_lock); -static void mask_and_ack_8259A(unsigned int); -static void mask_8259A(void); -static void unmask_8259A(void); -static void disable_8259A_irq(unsigned int irq); -static void enable_8259A_irq(unsigned int irq); -static void init_8259A(int auto_eoi); -static int i8259A_irq_pending(unsigned int irq); - -struct irq_chip i8259A_chip = { - .name = "XT-PIC", - .mask = disable_8259A_irq, - .disable = disable_8259A_irq, - .unmask = enable_8259A_irq, - .mask_ack = mask_and_ack_8259A, -}; /* * 8259A PIC functions to handle ISA devices: @@ -68,7 +54,7 @@ unsigned int cached_irq_mask = 0xffff; */ unsigned long io_apic_irqs; -static void disable_8259A_irq(unsigned int irq) +static void mask_8259A_irq(unsigned int irq) { unsigned int mask = 1 << irq; unsigned long flags; @@ -82,7 +68,12 @@ static void disable_8259A_irq(unsigned int irq) raw_spin_unlock_irqrestore(&i8259A_lock, flags); } -static void enable_8259A_irq(unsigned int irq) +static void disable_8259A_irq(struct irq_data *data) +{ + mask_8259A_irq(data->irq); +} + +static void unmask_8259A_irq(unsigned int irq) { unsigned int mask = ~(1 << irq); unsigned long flags; @@ -96,6 +87,11 @@ static void enable_8259A_irq(unsigned int irq) raw_spin_unlock_irqrestore(&i8259A_lock, flags); } +static void enable_8259A_irq(struct irq_data *data) +{ + unmask_8259A_irq(data->irq); +} + static int i8259A_irq_pending(unsigned int irq) { unsigned int mask = 1<irq; unsigned int irqmask = 1 << irq; unsigned long flags; @@ -223,6 +220,14 @@ spurious_8259A_irq: } } +struct irq_chip i8259A_chip = { + .name = "XT-PIC", + .irq_mask = disable_8259A_irq, + .irq_disable = disable_8259A_irq, + .irq_unmask = enable_8259A_irq, + .irq_mask_ack = mask_and_ack_8259A, +}; + static char irq_trigger[2]; /** * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ @@ -342,9 +347,9 @@ static void init_8259A(int auto_eoi) * In AEOI mode we just have to mask the interrupt * when acking. 
*/ - i8259A_chip.mask_ack = disable_8259A_irq; + i8259A_chip.irq_mask_ack = disable_8259A_irq; else - i8259A_chip.mask_ack = mask_and_ack_8259A; + i8259A_chip.irq_mask_ack = mask_and_ack_8259A; udelay(100); /* wait for 8259A to initialize */ @@ -363,14 +368,6 @@ static void init_8259A(int auto_eoi) static void legacy_pic_noop(void) { }; static void legacy_pic_uint_noop(unsigned int unused) { }; static void legacy_pic_int_noop(int unused) { }; - -static struct irq_chip dummy_pic_chip = { - .name = "dummy pic", - .mask = legacy_pic_uint_noop, - .unmask = legacy_pic_uint_noop, - .disable = legacy_pic_uint_noop, - .mask_ack = legacy_pic_uint_noop, -}; static int legacy_pic_irq_pending_noop(unsigned int irq) { return 0; @@ -378,7 +375,9 @@ static int legacy_pic_irq_pending_noop(unsigned int irq) struct legacy_pic null_legacy_pic = { .nr_legacy_irqs = 0, - .chip = &dummy_pic_chip, + .chip = &dummy_irq_chip, + .mask = legacy_pic_uint_noop, + .unmask = legacy_pic_uint_noop, .mask_all = legacy_pic_noop, .restore_mask = legacy_pic_noop, .init = legacy_pic_int_noop, @@ -389,7 +388,9 @@ struct legacy_pic null_legacy_pic = { struct legacy_pic default_legacy_pic = { .nr_legacy_irqs = NR_IRQS_LEGACY, .chip = &i8259A_chip, - .mask_all = mask_8259A, + .mask = mask_8259A_irq, + .unmask = unmask_8259A_irq, + .mask_all = mask_8259A, .restore_mask = unmask_8259A, .init = init_8259A, .irq_pending = i8259A_irq_pending, diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 87a8c6b00f8..864b386f6c0 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -324,9 +324,9 @@ notrace static void __cpuinit start_secondary(void *unused) check_tsc_sync_target(); if (nmi_watchdog == NMI_IO_APIC) { - legacy_pic->chip->mask(0); + legacy_pic->mask(0); enable_NMI_through_LVT0(); - legacy_pic->chip->unmask(0); + legacy_pic->unmask(0); } /* This must be done before setting cpu_online_mask */ -- cgit v1.2.3-18-g5258 From d4eba29770244e7cc5e60c0977d73d84148a3d6d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 24 Sep 2010 12:26:18 +0200 Subject: x86: Cleanup access to irq_data Fixup the open coded access to irq_desc->[handler_data|chip_data|msi-desc] Use the macros and inline functions for it. 
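The conversion is mechanical; as a sketch, with identifiers taken from the hunks in this and the following patches, the open coded dereference and its accessor-based replacement look like:

    /* open coded access to the descriptor's chip data */
    struct irq_cfg *cfg = desc->chip_data;

    /* the same lookup via the helpers */
    struct irq_cfg *cfg = get_irq_desc_chip_data(desc);
    struct irq_cfg *cfg = get_irq_chip_data(irq);    /* when only the irq number is at hand */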
Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 63 +++++++++++++++++++++--------------------- arch/x86/kernel/uv_irq.c | 2 +- 2 files changed, 33 insertions(+), 32 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index e5ae2a22262..fa0d92a6db5 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -190,7 +190,7 @@ struct irq_cfg *irq_cfg(unsigned int irq) desc = irq_to_desc(irq); if (desc) - cfg = desc->chip_data; + cfg = get_irq_desc_chip_data(desc); return cfg; } @@ -219,10 +219,11 @@ int arch_init_chip_data(struct irq_desc *desc, int node) { struct irq_cfg *cfg; - cfg = desc->chip_data; + cfg = get_irq_desc_chip_data(desc); if (!cfg) { - desc->chip_data = get_one_free_irq_cfg(node); - if (!desc->chip_data) { + cfg = get_one_free_irq_cfg(node); + desc->chip_data = cfg; + if (!cfg) { printk(KERN_ERR "can not alloc irq_cfg\n"); BUG_ON(1); } @@ -325,8 +326,8 @@ void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc) { struct irq_cfg *old_cfg, *cfg; - old_cfg = old_desc->chip_data; - cfg = desc->chip_data; + old_cfg = get_irq_desc_chip_data(old_desc); + cfg = get_irq_desc_chip_data(desc); if (old_cfg == cfg) return; @@ -594,7 +595,7 @@ static void __mask_IO_APIC_irq(struct irq_cfg *cfg) static void mask_IO_APIC_irq_desc(struct irq_desc *desc) { - struct irq_cfg *cfg = desc->chip_data; + struct irq_cfg *cfg = get_irq_desc_chip_data(desc); unsigned long flags; BUG_ON(!cfg); @@ -606,7 +607,7 @@ static void mask_IO_APIC_irq_desc(struct irq_desc *desc) static void unmask_IO_APIC_irq_desc(struct irq_desc *desc) { - struct irq_cfg *cfg = desc->chip_data; + struct irq_cfg *cfg = get_irq_desc_chip_data(desc); unsigned long flags; raw_spin_lock_irqsave(&ioapic_lock, flags); @@ -1269,7 +1270,7 @@ void __setup_vector_irq(int cpu) raw_spin_lock(&vector_lock); /* Mark the inuse vectors */ for_each_irq_desc(irq, desc) { - cfg = desc->chip_data; + cfg = get_irq_desc_chip_data(desc); /* * If it is a legacy IRQ handled by the legacy PIC, this cpu @@ -1427,7 +1428,7 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq if (!IO_APIC_IRQ(irq)) return; - cfg = desc->chip_data; + cfg = get_irq_desc_chip_data(desc); /* * For legacy irqs, cfg->domain starts with cpu 0 for legacy @@ -1516,7 +1517,7 @@ static void __init setup_IO_APIC_irqs(void) printk(KERN_INFO "can not get irq_desc for %d\n", irq); continue; } - cfg = desc->chip_data; + cfg = get_irq_desc_chip_data(desc); add_pin_to_irq_node(cfg, node, apic_id, pin); /* * don't mark it in pin_programmed, so later acpi could @@ -1567,7 +1568,7 @@ void setup_IO_APIC_irq_extra(u32 gsi) return; } - cfg = desc->chip_data; + cfg = get_irq_desc_chip_data(desc); add_pin_to_irq_node(cfg, node, apic_id, pin); if (test_bit(pin, mp_ioapic_routing[apic_id].pin_programmed)) { @@ -1718,7 +1719,7 @@ __apicdebuginit(void) print_IO_APIC(void) for_each_irq_desc(irq, desc) { struct irq_pin_list *entry; - cfg = desc->chip_data; + cfg = get_irq_desc_chip_data(desc); if (!cfg) continue; entry = cfg->irq_2_pin; @@ -2323,7 +2324,7 @@ set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask, return -1; irq = desc->irq; - cfg = desc->chip_data; + cfg = get_irq_desc_chip_data(desc); if (assign_irq_vector(irq, cfg, mask)) return -1; @@ -2343,7 +2344,7 @@ set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask) int ret = -1; irq = desc->irq; - cfg = desc->chip_data; + cfg = 
get_irq_desc_chip_data(desc); raw_spin_lock_irqsave(&ioapic_lock, flags); ret = set_desc_affinity(desc, mask, &dest); @@ -2396,7 +2397,7 @@ migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask) if (get_irte(irq, &irte)) return ret; - cfg = desc->chip_data; + cfg = get_irq_desc_chip_data(desc); if (assign_irq_vector(irq, cfg, mask)) return ret; @@ -2500,7 +2501,7 @@ unlock: static void __irq_complete_move(struct irq_desc **descp, unsigned vector) { struct irq_desc *desc = *descp; - struct irq_cfg *cfg = desc->chip_data; + struct irq_cfg *cfg = get_irq_desc_chip_data(desc); unsigned me; if (likely(!cfg->move_in_progress)) @@ -2520,7 +2521,7 @@ static void irq_complete_move(struct irq_desc **descp) void irq_force_complete_move(int irq) { struct irq_desc *desc = irq_to_desc(irq); - struct irq_cfg *cfg = desc->chip_data; + struct irq_cfg *cfg = get_irq_desc_chip_data(desc); if (!cfg) return; @@ -2588,7 +2589,7 @@ static void eoi_ioapic_irq(struct irq_desc *desc) unsigned int irq; irq = desc->irq; - cfg = desc->chip_data; + cfg = get_irq_desc_chip_data(desc); raw_spin_lock_irqsave(&ioapic_lock, flags); __eoi_ioapic_irq(irq, cfg); @@ -2644,7 +2645,7 @@ static void ack_apic_level(unsigned int irq) * we use the above logic (mask+edge followed by unmask+level) from * Manfred Spraul to clear the remote IRR. */ - cfg = desc->chip_data; + cfg = get_irq_desc_chip_data(desc); i = cfg->vector; v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); @@ -2695,7 +2696,7 @@ static void ack_apic_level(unsigned int irq) * accurate and is causing problems then it is a hardware bug * and you can go talk to the chipset vendor about it. */ - cfg = desc->chip_data; + cfg = get_irq_desc_chip_data(desc); if (!io_apic_level_ack_pending(cfg)) move_masked_irq(irq); unmask_IO_APIC_irq_desc(desc); @@ -2763,7 +2764,7 @@ static inline void init_IO_APIC_traps(void) * 0x80, because int 0x80 is hm, kind of importantish. ;) */ for_each_irq_desc(irq, desc) { - cfg = desc->chip_data; + cfg = get_irq_desc_chip_data(desc); if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) { /* * Hmm.. 
We don't have an entry for this, @@ -2917,7 +2918,7 @@ int timer_through_8259 __initdata; static inline void __init check_timer(void) { struct irq_desc *desc = irq_to_desc(0); - struct irq_cfg *cfg = desc->chip_data; + struct irq_cfg *cfg = get_irq_desc_chip_data(desc); int node = cpu_to_node(0); int apic1, pin1, apic2, pin2; unsigned long flags; @@ -3250,13 +3251,13 @@ unsigned int create_irq_nr(unsigned int irq_want, int node) printk(KERN_INFO "can not get irq_desc for %d\n", new); continue; } - cfg_new = desc_new->chip_data; + cfg_new = get_irq_desc_chip_data(desc_new); if (cfg_new->vector != 0) continue; desc_new = move_irq_desc(desc_new, node); - cfg_new = desc_new->chip_data; + cfg_new = get_irq_desc_chip_data(desc_new); if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0) irq = new; @@ -3381,7 +3382,7 @@ static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) if (set_desc_affinity(desc, mask, &dest)) return -1; - cfg = desc->chip_data; + cfg = get_irq_desc_chip_data(desc); __get_cached_msi_msg(desc->irq_data.msi_desc, &msg); @@ -3403,7 +3404,7 @@ static int ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) { struct irq_desc *desc = irq_to_desc(irq); - struct irq_cfg *cfg = desc->chip_data; + struct irq_cfg *cfg = get_irq_desc_chip_data(desc); unsigned int dest; struct irte irte; @@ -3595,7 +3596,7 @@ static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) if (set_desc_affinity(desc, mask, &dest)) return -1; - cfg = desc->chip_data; + cfg = get_irq_desc_chip_data(desc); dmar_msi_read(irq, &msg); @@ -3650,7 +3651,7 @@ static int hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask) if (set_desc_affinity(desc, mask, &dest)) return -1; - cfg = desc->chip_data; + cfg = get_irq_desc_chip_data(desc); hpet_msi_read(irq, &msg); @@ -3756,7 +3757,7 @@ static int set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask) if (set_desc_affinity(desc, mask, &dest)) return -1; - cfg = desc->chip_data; + cfg = get_irq_desc_chip_data(desc); target_ht_irq(irq, dest, cfg->vector); @@ -3903,7 +3904,7 @@ static int __io_apic_set_pci_routing(struct device *dev, int irq, * IRQs < 16 are already in the irq_2_pin[] map */ if (irq >= legacy_pic->nr_legacy_irqs) { - cfg = desc->chip_data; + cfg = get_irq_desc_chip_data(desc); if (add_pin_to_irq_node_nopanic(cfg, node, ioapic, pin)) { printk(KERN_INFO "can not add pin %d for irq %d\n", pin, irq); diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/kernel/uv_irq.c index 1132129db79..2233a42fb90 100644 --- a/arch/x86/kernel/uv_irq.c +++ b/arch/x86/kernel/uv_irq.c @@ -209,7 +209,7 @@ static void arch_disable_uv_irq(int mmr_pnode, unsigned long mmr_offset) static int uv_set_irq_affinity(unsigned int irq, const struct cpumask *mask) { struct irq_desc *desc = irq_to_desc(irq); - struct irq_cfg *cfg = desc->chip_data; + struct irq_cfg *cfg = get_irq_desc_chip_data(desc); unsigned int dest; unsigned long mmr_value; struct uv_IO_APIC_route_entry *entry; -- cgit v1.2.3-18-g5258 From dd5f15e5cf104c9170b76ae3274f35b42a3e4161 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Sep 2010 15:18:35 +0200 Subject: x86: Cleanup io_apic Sanitize functions. Remove irq_desc pointer magic. Preparatory patch for further cleanups. 
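One pattern repeats throughout the hunk below: the low-level helpers take the struct irq_cfg pointer directly and the per-irq wrappers merely look it up from the chip data, for example (condensed from the patch itself):

    static void mask_ioapic(struct irq_cfg *cfg)
    {
        unsigned long flags;

        raw_spin_lock_irqsave(&ioapic_lock, flags);
        io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
        raw_spin_unlock_irqrestore(&ioapic_lock, flags);
    }

    static void mask_ioapic_irq(unsigned int irq)
    {
        struct irq_cfg *cfg = get_irq_chip_data(irq);

        mask_ioapic(cfg);
    }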
Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 109 ++++++++++++++++------------------------- 1 file changed, 42 insertions(+), 67 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index fa0d92a6db5..3c4dee8a9ef 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -572,11 +572,6 @@ static void __unmask_and_level_IO_APIC_irq(struct irq_pin_list *entry) IO_APIC_REDIR_LEVEL_TRIGGER, NULL); } -static void __unmask_IO_APIC_irq(struct irq_cfg *cfg) -{ - io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL); -} - static void io_apic_sync(struct irq_pin_list *entry) { /* @@ -588,44 +583,41 @@ static void io_apic_sync(struct irq_pin_list *entry) readl(&io_apic->data); } -static void __mask_IO_APIC_irq(struct irq_cfg *cfg) +static void mask_ioapic(struct irq_cfg *cfg) { + unsigned long flags; + + raw_spin_lock_irqsave(&ioapic_lock, flags); io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); } -static void mask_IO_APIC_irq_desc(struct irq_desc *desc) +static void mask_ioapic_irq(unsigned int irq) { - struct irq_cfg *cfg = get_irq_desc_chip_data(desc); - unsigned long flags; + struct irq_cfg *cfg = get_irq_chip_data(irq); - BUG_ON(!cfg); + mask_ioapic(cfg); +} - raw_spin_lock_irqsave(&ioapic_lock, flags); - __mask_IO_APIC_irq(cfg); - raw_spin_unlock_irqrestore(&ioapic_lock, flags); +static void __unmask_ioapic(struct irq_cfg *cfg) +{ + io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL); } -static void unmask_IO_APIC_irq_desc(struct irq_desc *desc) +static void unmask_ioapic(struct irq_cfg *cfg) { - struct irq_cfg *cfg = get_irq_desc_chip_data(desc); unsigned long flags; raw_spin_lock_irqsave(&ioapic_lock, flags); - __unmask_IO_APIC_irq(cfg); + __unmask_ioapic(cfg); raw_spin_unlock_irqrestore(&ioapic_lock, flags); } -static void mask_IO_APIC_irq(unsigned int irq) +static void unmask_ioapic_irq(unsigned int irq) { - struct irq_desc *desc = irq_to_desc(irq); + struct irq_cfg *cfg = get_irq_chip_data(irq); - mask_IO_APIC_irq_desc(desc); -} -static void unmask_IO_APIC_irq(unsigned int irq) -{ - struct irq_desc *desc = irq_to_desc(irq); - - unmask_IO_APIC_irq_desc(desc); + unmask_ioapic(cfg); } static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) @@ -2239,7 +2231,7 @@ static unsigned int startup_ioapic_irq(unsigned int irq) was_pending = 1; } cfg = irq_cfg(irq); - __unmask_IO_APIC_irq(cfg); + __unmask_ioapic(cfg); raw_spin_unlock_irqrestore(&ioapic_lock, flags); return was_pending; @@ -2498,10 +2490,8 @@ unlock: irq_exit(); } -static void __irq_complete_move(struct irq_desc **descp, unsigned vector) +static void __irq_complete_move(struct irq_cfg *cfg, unsigned vector) { - struct irq_desc *desc = *descp; - struct irq_cfg *cfg = get_irq_desc_chip_data(desc); unsigned me; if (likely(!cfg->move_in_progress)) @@ -2513,30 +2503,29 @@ static void __irq_complete_move(struct irq_desc **descp, unsigned vector) send_cleanup_vector(cfg); } -static void irq_complete_move(struct irq_desc **descp) +static void irq_complete_move(struct irq_cfg *cfg) { - __irq_complete_move(descp, ~get_irq_regs()->orig_ax); + __irq_complete_move(cfg, ~get_irq_regs()->orig_ax); } void irq_force_complete_move(int irq) { - struct irq_desc *desc = irq_to_desc(irq); - struct irq_cfg *cfg = get_irq_desc_chip_data(desc); + struct irq_cfg *cfg = get_irq_chip_data(irq); if (!cfg) return; - __irq_complete_move(&desc, 
cfg->vector); + __irq_complete_move(cfg, cfg->vector); } #else -static inline void irq_complete_move(struct irq_desc **descp) {} +static inline void irq_complete_move(struct irq_cfg *cfg) { } #endif static void ack_apic_edge(unsigned int irq) { - struct irq_desc *desc = irq_to_desc(irq); + struct irq_cfg *cfg = get_irq_chip_data(irq); - irq_complete_move(&desc); + irq_complete_move(cfg); move_native_irq(irq); ack_APIC_irq(); } @@ -2559,10 +2548,12 @@ atomic_t irq_mis_count; * Otherwise, we simulate the EOI message manually by changing the trigger * mode to edge and then back to level, with RTE being masked during this. */ -static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) +static void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) { struct irq_pin_list *entry; + unsigned long flags; + raw_spin_lock_irqsave(&ioapic_lock, flags); for_each_irq_pin(entry, cfg->irq_2_pin) { if (mp_ioapics[entry->apic].apicver >= 0x20) { /* @@ -2580,36 +2571,22 @@ static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) __unmask_and_level_IO_APIC_irq(entry); } } -} - -static void eoi_ioapic_irq(struct irq_desc *desc) -{ - struct irq_cfg *cfg; - unsigned long flags; - unsigned int irq; - - irq = desc->irq; - cfg = get_irq_desc_chip_data(desc); - - raw_spin_lock_irqsave(&ioapic_lock, flags); - __eoi_ioapic_irq(irq, cfg); raw_spin_unlock_irqrestore(&ioapic_lock, flags); } static void ack_apic_level(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); + struct irq_cfg *cfg = get_irq_desc_chip_data(desc); + int i, do_unmask_irq = 0; unsigned long v; - int i; - struct irq_cfg *cfg; - int do_unmask_irq = 0; - irq_complete_move(&desc); + irq_complete_move(cfg); #ifdef CONFIG_GENERIC_PENDING_IRQ /* If we are moving the irq we need to mask it */ if (unlikely(desc->status & IRQ_MOVE_PENDING)) { do_unmask_irq = 1; - mask_IO_APIC_irq_desc(desc); + mask_ioapic(cfg); } #endif @@ -2645,7 +2622,6 @@ static void ack_apic_level(unsigned int irq) * we use the above logic (mask+edge followed by unmask+level) from * Manfred Spraul to clear the remote IRR. */ - cfg = get_irq_desc_chip_data(desc); i = cfg->vector; v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); @@ -2665,7 +2641,7 @@ static void ack_apic_level(unsigned int irq) if (!(v & (1 << (i & 0x1f)))) { atomic_inc(&irq_mis_count); - eoi_ioapic_irq(desc); + eoi_ioapic_irq(irq, cfg); } /* Now we can move and renable the irq */ @@ -2696,10 +2672,9 @@ static void ack_apic_level(unsigned int irq) * accurate and is causing problems then it is a hardware bug * and you can go talk to the chipset vendor about it. 
*/ - cfg = get_irq_desc_chip_data(desc); if (!io_apic_level_ack_pending(cfg)) move_masked_irq(irq); - unmask_IO_APIC_irq_desc(desc); + unmask_ioapic(cfg); } } @@ -2711,18 +2686,18 @@ static void ir_ack_apic_edge(unsigned int irq) static void ir_ack_apic_level(unsigned int irq) { - struct irq_desc *desc = irq_to_desc(irq); + struct irq_cfg *cfg = get_irq_chip_data(irq); ack_APIC_irq(); - eoi_ioapic_irq(desc); + eoi_ioapic_irq(irq, cfg); } #endif /* CONFIG_INTR_REMAP */ static struct irq_chip ioapic_chip __read_mostly = { .name = "IO-APIC", .startup = startup_ioapic_irq, - .mask = mask_IO_APIC_irq, - .unmask = unmask_IO_APIC_irq, + .mask = mask_ioapic_irq, + .unmask = unmask_ioapic_irq, .ack = ack_apic_edge, .eoi = ack_apic_level, #ifdef CONFIG_SMP @@ -2734,8 +2709,8 @@ static struct irq_chip ioapic_chip __read_mostly = { static struct irq_chip ir_ioapic_chip __read_mostly = { .name = "IR-IO-APIC", .startup = startup_ioapic_irq, - .mask = mask_IO_APIC_irq, - .unmask = unmask_IO_APIC_irq, + .mask = mask_ioapic_irq, + .unmask = unmask_ioapic_irq, #ifdef CONFIG_INTR_REMAP .ack = ir_ack_apic_edge, .eoi = ir_ack_apic_level, @@ -2996,7 +2971,7 @@ static inline void __init check_timer(void) int idx; idx = find_irq_entry(apic1, pin1, mp_INT); if (idx != -1 && irq_trigger(idx)) - unmask_IO_APIC_irq_desc(desc); + unmask_ioapic(cfg); } if (timer_irq_works()) { if (nmi_watchdog == NMI_IO_APIC) { -- cgit v1.2.3-18-g5258 From 61a38ce3f59cdb4654e9444329195bd57c3baf74 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Sep 2010 16:00:34 +0200 Subject: x86: io_apic: Convert startup to new irq_chip function Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 3c4dee8a9ef..5ced690b849 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2218,11 +2218,10 @@ static int __init timer_irq_works(void) * an edge even if it isn't on the 8259A... 
*/ -static unsigned int startup_ioapic_irq(unsigned int irq) +static unsigned int startup_ioapic_irq(struct irq_data *data) { - int was_pending = 0; + int was_pending = 0, irq = data->irq; unsigned long flags; - struct irq_cfg *cfg; raw_spin_lock_irqsave(&ioapic_lock, flags); if (irq < legacy_pic->nr_legacy_irqs) { @@ -2230,8 +2229,7 @@ static unsigned int startup_ioapic_irq(unsigned int irq) if (legacy_pic->irq_pending(irq)) was_pending = 1; } - cfg = irq_cfg(irq); - __unmask_ioapic(cfg); + __unmask_ioapic(data->chip_data); raw_spin_unlock_irqrestore(&ioapic_lock, flags); return was_pending; @@ -2695,7 +2693,7 @@ static void ir_ack_apic_level(unsigned int irq) static struct irq_chip ioapic_chip __read_mostly = { .name = "IO-APIC", - .startup = startup_ioapic_irq, + .irq_startup = startup_ioapic_irq, .mask = mask_ioapic_irq, .unmask = unmask_ioapic_irq, .ack = ack_apic_edge, @@ -2708,7 +2706,7 @@ static struct irq_chip ioapic_chip __read_mostly = { static struct irq_chip ir_ioapic_chip __read_mostly = { .name = "IR-IO-APIC", - .startup = startup_ioapic_irq, + .irq_startup = startup_ioapic_irq, .mask = mask_ioapic_irq, .unmask = unmask_ioapic_irq, #ifdef CONFIG_INTR_REMAP -- cgit v1.2.3-18-g5258 From 90297c5fe71d32a2a0ead38bd8f6b1112a2e5ac0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Sep 2010 16:03:54 +0200 Subject: x86: ioapic: Convert mask to new irq_chip function Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 95 +++++++++++++++++++----------------------- 1 file changed, 43 insertions(+), 52 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 5ced690b849..b8b013f0cfd 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -592,11 +592,9 @@ static void mask_ioapic(struct irq_cfg *cfg) raw_spin_unlock_irqrestore(&ioapic_lock, flags); } -static void mask_ioapic_irq(unsigned int irq) +static void mask_ioapic_irq(struct irq_data *data) { - struct irq_cfg *cfg = get_irq_chip_data(irq); - - mask_ioapic(cfg); + mask_ioapic(data->chip_data); } static void __unmask_ioapic(struct irq_cfg *cfg) @@ -613,11 +611,9 @@ static void unmask_ioapic(struct irq_cfg *cfg) raw_spin_unlock_irqrestore(&ioapic_lock, flags); } -static void unmask_ioapic_irq(unsigned int irq) +static void unmask_ioapic_irq(struct irq_data *data) { - struct irq_cfg *cfg = get_irq_chip_data(irq); - - unmask_ioapic(cfg); + unmask_ioapic(data->chip_data); } static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) @@ -2235,10 +2231,9 @@ static unsigned int startup_ioapic_irq(struct irq_data *data) return was_pending; } -static int ioapic_retrigger_irq(unsigned int irq) +static int ioapic_retrigger_irq(struct irq_data *data) { - - struct irq_cfg *cfg = irq_cfg(irq); + struct irq_cfg *cfg = data->chip_data; unsigned long flags; raw_spin_lock_irqsave(&vector_lock, flags); @@ -2519,12 +2514,10 @@ void irq_force_complete_move(int irq) static inline void irq_complete_move(struct irq_cfg *cfg) { } #endif -static void ack_apic_edge(unsigned int irq) +static void ack_apic_edge(struct irq_data *data) { - struct irq_cfg *cfg = get_irq_chip_data(irq); - - irq_complete_move(cfg); - move_native_irq(irq); + irq_complete_move(data->chip_data); + move_native_irq(data->irq); ack_APIC_irq(); } @@ -2572,11 +2565,11 @@ static void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) raw_spin_unlock_irqrestore(&ioapic_lock, flags); } -static void ack_apic_level(unsigned int irq) +static void 
ack_apic_level(struct irq_data *data) { + struct irq_cfg *cfg = data->chip_data; + int i, do_unmask_irq = 0, irq = data->irq; struct irq_desc *desc = irq_to_desc(irq); - struct irq_cfg *cfg = get_irq_desc_chip_data(desc); - int i, do_unmask_irq = 0; unsigned long v; irq_complete_move(cfg); @@ -2677,46 +2670,44 @@ static void ack_apic_level(unsigned int irq) } #ifdef CONFIG_INTR_REMAP -static void ir_ack_apic_edge(unsigned int irq) +static void ir_ack_apic_edge(struct irq_data *data) { ack_APIC_irq(); } -static void ir_ack_apic_level(unsigned int irq) +static void ir_ack_apic_level(struct irq_data *data) { - struct irq_cfg *cfg = get_irq_chip_data(irq); - ack_APIC_irq(); - eoi_ioapic_irq(irq, cfg); + eoi_ioapic_irq(data->irq, data->chip_data); } #endif /* CONFIG_INTR_REMAP */ static struct irq_chip ioapic_chip __read_mostly = { .name = "IO-APIC", .irq_startup = startup_ioapic_irq, - .mask = mask_ioapic_irq, - .unmask = unmask_ioapic_irq, - .ack = ack_apic_edge, - .eoi = ack_apic_level, + .irq_mask = mask_ioapic_irq, + .irq_unmask = unmask_ioapic_irq, + .irq_ack = ack_apic_edge, + .irq_eoi = ack_apic_level, #ifdef CONFIG_SMP .set_affinity = set_ioapic_affinity_irq, #endif - .retrigger = ioapic_retrigger_irq, + .irq_retrigger = ioapic_retrigger_irq, }; static struct irq_chip ir_ioapic_chip __read_mostly = { .name = "IR-IO-APIC", .irq_startup = startup_ioapic_irq, - .mask = mask_ioapic_irq, - .unmask = unmask_ioapic_irq, + .irq_mask = mask_ioapic_irq, + .irq_unmask = unmask_ioapic_irq, #ifdef CONFIG_INTR_REMAP - .ack = ir_ack_apic_edge, - .eoi = ir_ack_apic_level, + .irq_ack = ir_ack_apic_edge, + .irq_eoi = ir_ack_apic_level, #ifdef CONFIG_SMP .set_affinity = set_ir_ioapic_affinity_irq, #endif #endif - .retrigger = ioapic_retrigger_irq, + .irq_retrigger = ioapic_retrigger_irq, }; static inline void init_IO_APIC_traps(void) @@ -2757,7 +2748,7 @@ static inline void init_IO_APIC_traps(void) * The local APIC irq-chip implementation: */ -static void mask_lapic_irq(unsigned int irq) +static void mask_lapic_irq(struct irq_data *data) { unsigned long v; @@ -2765,7 +2756,7 @@ static void mask_lapic_irq(unsigned int irq) apic_write(APIC_LVT0, v | APIC_LVT_MASKED); } -static void unmask_lapic_irq(unsigned int irq) +static void unmask_lapic_irq(struct irq_data *data) { unsigned long v; @@ -2773,16 +2764,16 @@ static void unmask_lapic_irq(unsigned int irq) apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED); } -static void ack_lapic_irq(unsigned int irq) +static void ack_lapic_irq(struct irq_data *data) { ack_APIC_irq(); } static struct irq_chip lapic_chip __read_mostly = { .name = "local-APIC", - .mask = mask_lapic_irq, - .unmask = unmask_lapic_irq, - .ack = ack_lapic_irq, + .irq_mask = mask_lapic_irq, + .irq_unmask = unmask_lapic_irq, + .irq_ack = ack_lapic_irq, }; static void lapic_register_intr(int irq, struct irq_desc *desc) @@ -3417,11 +3408,11 @@ static struct irq_chip msi_chip = { .name = "PCI-MSI", .irq_unmask = unmask_msi_irq, .irq_mask = mask_msi_irq, - .ack = ack_apic_edge, + .irq_ack = ack_apic_edge, #ifdef CONFIG_SMP .set_affinity = set_msi_irq_affinity, #endif - .retrigger = ioapic_retrigger_irq, + .irq_retrigger = ioapic_retrigger_irq, }; static struct irq_chip msi_ir_chip = { @@ -3429,12 +3420,12 @@ static struct irq_chip msi_ir_chip = { .irq_unmask = unmask_msi_irq, .irq_mask = mask_msi_irq, #ifdef CONFIG_INTR_REMAP - .ack = ir_ack_apic_edge, + .irq_ack = ir_ack_apic_edge, #ifdef CONFIG_SMP .set_affinity = ir_set_msi_irq_affinity, #endif #endif - .retrigger = ioapic_retrigger_irq, + 
.irq_retrigger = ioapic_retrigger_irq, }; /* @@ -3589,11 +3580,11 @@ static struct irq_chip dmar_msi_type = { .name = "DMAR_MSI", .unmask = dmar_msi_unmask, .mask = dmar_msi_mask, - .ack = ack_apic_edge, + .irq_ack = ack_apic_edge, #ifdef CONFIG_SMP .set_affinity = dmar_msi_set_affinity, #endif - .retrigger = ioapic_retrigger_irq, + .irq_retrigger = ioapic_retrigger_irq, }; int arch_setup_dmar_msi(unsigned int irq) @@ -3645,23 +3636,23 @@ static struct irq_chip ir_hpet_msi_type = { .unmask = hpet_msi_unmask, .mask = hpet_msi_mask, #ifdef CONFIG_INTR_REMAP - .ack = ir_ack_apic_edge, + .irq_ack = ir_ack_apic_edge, #ifdef CONFIG_SMP .set_affinity = ir_set_msi_irq_affinity, #endif #endif - .retrigger = ioapic_retrigger_irq, + .irq_retrigger = ioapic_retrigger_irq, }; static struct irq_chip hpet_msi_type = { .name = "HPET_MSI", .unmask = hpet_msi_unmask, .mask = hpet_msi_mask, - .ack = ack_apic_edge, + .irq_ack = ack_apic_edge, #ifdef CONFIG_SMP .set_affinity = hpet_msi_set_affinity, #endif - .retrigger = ioapic_retrigger_irq, + .irq_retrigger = ioapic_retrigger_irq, }; int arch_setup_hpet_msi(unsigned int irq, unsigned int id) @@ -3743,11 +3734,11 @@ static struct irq_chip ht_irq_chip = { .name = "PCI-HT", .mask = mask_ht_irq, .unmask = unmask_ht_irq, - .ack = ack_apic_edge, + .irq_ack = ack_apic_edge, #ifdef CONFIG_SMP .set_affinity = set_ht_irq_affinity, #endif - .retrigger = ioapic_retrigger_irq, + .irq_retrigger = ioapic_retrigger_irq, }; int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) -- cgit v1.2.3-18-g5258 From d0fbca8f9304d1760fdc906b35b06e89fbdbb51f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Sep 2010 16:18:39 +0200 Subject: x86: ioapic/hpet: Convert to new chip functions Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- arch/x86/include/asm/hpet.h | 10 ++++++---- arch/x86/kernel/apic/io_apic.c | 30 ++++++++++++++---------------- arch/x86/kernel/hpet.c | 16 ++++++---------- 3 files changed, 26 insertions(+), 30 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h index 1d5c08a1bdf..2c392d663dc 100644 --- a/arch/x86/include/asm/hpet.h +++ b/arch/x86/include/asm/hpet.h @@ -74,10 +74,12 @@ extern void hpet_disable(void); extern unsigned int hpet_readl(unsigned int a); extern void force_hpet_resume(void); -extern void hpet_msi_unmask(unsigned int irq); -extern void hpet_msi_mask(unsigned int irq); -extern void hpet_msi_write(unsigned int irq, struct msi_msg *msg); -extern void hpet_msi_read(unsigned int irq, struct msi_msg *msg); +struct irq_data; +extern void hpet_msi_unmask(struct irq_data *data); +extern void hpet_msi_mask(struct irq_data *data); +struct hpet_dev; +extern void hpet_msi_write(struct hpet_dev *hdev, struct msi_msg *msg); +extern void hpet_msi_read(struct hpet_dev *hdev, struct msi_msg *msg); #ifdef CONFIG_PCI_MSI extern int arch_setup_hpet_msi(unsigned int irq, unsigned int id); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index b8b013f0cfd..49aa857ff00 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3605,26 +3605,25 @@ int arch_setup_dmar_msi(unsigned int irq) #ifdef CONFIG_HPET_TIMER #ifdef CONFIG_SMP -static int hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask) +static int hpet_msi_set_affinity(struct irq_data *data, + const struct cpumask *mask, bool force) { - struct irq_desc *desc = irq_to_desc(irq); - struct irq_cfg *cfg; + struct irq_desc *desc = irq_to_desc(data->irq); + struct 
irq_cfg *cfg = data->chip_data; struct msi_msg msg; unsigned int dest; if (set_desc_affinity(desc, mask, &dest)) return -1; - cfg = get_irq_desc_chip_data(desc); - - hpet_msi_read(irq, &msg); + hpet_msi_read(data->handler_data, &msg); msg.data &= ~MSI_DATA_VECTOR_MASK; msg.data |= MSI_DATA_VECTOR(cfg->vector); msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; msg.address_lo |= MSI_ADDR_DEST_ID(dest); - hpet_msi_write(irq, &msg); + hpet_msi_write(data->handler_data, &msg); return 0; } @@ -3633,8 +3632,8 @@ static int hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask) static struct irq_chip ir_hpet_msi_type = { .name = "IR-HPET_MSI", - .unmask = hpet_msi_unmask, - .mask = hpet_msi_mask, + .irq_unmask = hpet_msi_unmask, + .irq_mask = hpet_msi_mask, #ifdef CONFIG_INTR_REMAP .irq_ack = ir_ack_apic_edge, #ifdef CONFIG_SMP @@ -3646,20 +3645,19 @@ static struct irq_chip ir_hpet_msi_type = { static struct irq_chip hpet_msi_type = { .name = "HPET_MSI", - .unmask = hpet_msi_unmask, - .mask = hpet_msi_mask, + .irq_unmask = hpet_msi_unmask, + .irq_mask = hpet_msi_mask, .irq_ack = ack_apic_edge, #ifdef CONFIG_SMP - .set_affinity = hpet_msi_set_affinity, + .irq_set_affinity = hpet_msi_set_affinity, #endif .irq_retrigger = ioapic_retrigger_irq, }; int arch_setup_hpet_msi(unsigned int irq, unsigned int id) { - int ret; struct msi_msg msg; - struct irq_desc *desc = irq_to_desc(irq); + int ret; if (intr_remapping_enabled) { struct intel_iommu *iommu = map_hpet_to_ir(id); @@ -3677,8 +3675,8 @@ int arch_setup_hpet_msi(unsigned int irq, unsigned int id) if (ret < 0) return ret; - hpet_msi_write(irq, &msg); - desc->status |= IRQ_MOVE_PCNTXT; + hpet_msi_write(get_irq_data(irq), &msg); + irq_set_status_flags(irq,IRQ_MOVE_PCNTXT); if (irq_remapped(irq)) set_irq_chip_and_handler_name(irq, &ir_hpet_msi_type, handle_edge_irq, "edge"); diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 7494999141b..efaf906daf9 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -440,9 +440,9 @@ static int hpet_legacy_next_event(unsigned long delta, static DEFINE_PER_CPU(struct hpet_dev *, cpu_hpet_dev); static struct hpet_dev *hpet_devs; -void hpet_msi_unmask(unsigned int irq) +void hpet_msi_unmask(struct irq_data *data) { - struct hpet_dev *hdev = get_irq_data(irq); + struct hpet_dev *hdev = data->handler_data; unsigned int cfg; /* unmask it */ @@ -451,10 +451,10 @@ void hpet_msi_unmask(unsigned int irq) hpet_writel(cfg, HPET_Tn_CFG(hdev->num)); } -void hpet_msi_mask(unsigned int irq) +void hpet_msi_mask(struct irq_data *data) { + struct hpet_dev *hdev = data->handler_data; unsigned int cfg; - struct hpet_dev *hdev = get_irq_data(irq); /* mask it */ cfg = hpet_readl(HPET_Tn_CFG(hdev->num)); @@ -462,18 +462,14 @@ void hpet_msi_mask(unsigned int irq) hpet_writel(cfg, HPET_Tn_CFG(hdev->num)); } -void hpet_msi_write(unsigned int irq, struct msi_msg *msg) +void hpet_msi_write(struct hpet_dev *hdev, struct msi_msg *msg) { - struct hpet_dev *hdev = get_irq_data(irq); - hpet_writel(msg->data, HPET_Tn_ROUTE(hdev->num)); hpet_writel(msg->address_lo, HPET_Tn_ROUTE(hdev->num) + 4); } -void hpet_msi_read(unsigned int irq, struct msi_msg *msg) +void hpet_msi_read(struct hpet_dev *hdev, struct msi_msg *msg) { - struct hpet_dev *hdev = get_irq_data(irq); - msg->data = hpet_readl(HPET_Tn_ROUTE(hdev->num)); msg->address_lo = hpet_readl(HPET_Tn_ROUTE(hdev->num) + 4); msg->address_hi = 0; -- cgit v1.2.3-18-g5258 From 5c2837fbaa609e615ef9a1c58a4cd26ce90be35b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 
28 Sep 2010 17:15:11 +0200 Subject: dmar: Convert to new irq chip functions Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Acked-by: David Woodhouse --- arch/x86/kernel/apic/io_apic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 49aa857ff00..72b253ef310 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3578,8 +3578,8 @@ static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) static struct irq_chip dmar_msi_type = { .name = "DMAR_MSI", - .unmask = dmar_msi_unmask, - .mask = dmar_msi_mask, + .irq_unmask = dmar_msi_unmask, + .irq_mask = dmar_msi_mask, .irq_ack = ack_apic_edge, #ifdef CONFIG_SMP .set_affinity = dmar_msi_set_affinity, -- cgit v1.2.3-18-g5258 From e9f7ac664bfc36685a8eb3315ec21c067d0cee36 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Sep 2010 17:22:09 +0200 Subject: ht: Convert to new irq_chip functions Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Jesse Barnes --- arch/x86/kernel/apic/io_apic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 72b253ef310..5579f3f5943 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3730,8 +3730,8 @@ static int set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask) static struct irq_chip ht_irq_chip = { .name = "PCI-HT", - .mask = mask_ht_irq, - .unmask = unmask_ht_irq, + .irq_mask = mask_ht_irq, + .irq_unmask = unmask_ht_irq, .irq_ack = ack_apic_edge, #ifdef CONFIG_SMP .set_affinity = set_ht_irq_affinity, -- cgit v1.2.3-18-g5258 From 60c69948e5b6357ac0d5ef2a2d0ce31c173c3c64 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Sep 2010 17:28:38 +0200 Subject: x86: ioapic: Clean up the direct access to irq_desc Most of it is useless pseudo optimization. 
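For illustration only, a minimal sketch of the accessor pattern this cleanup converges on; set_irq_chip_data() and irq_set_status_flags() are the genirq helpers used in the diff below, the example_* function name is a placeholder:

	/*
	 * Illustrative sketch, as if placed in io_apic.c: per-irq state and
	 * chip data go through the genirq accessors instead of poking
	 * irq_desc fields directly.
	 */
	static void example_register_level_irq(unsigned int irq, struct irq_cfg *cfg)
	{
		set_irq_chip_data(irq, cfg);		/* was: desc->chip_data = cfg */
		irq_set_status_flags(irq, IRQ_LEVEL);	/* was: desc->status |= IRQ_LEVEL */
	}
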
Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 79 +++++++++++++++--------------------------- 1 file changed, 28 insertions(+), 51 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 5579f3f5943..82c3c66e333 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -150,10 +150,7 @@ static struct irq_cfg irq_cfgx[NR_IRQS]; int __init arch_early_irq_init(void) { struct irq_cfg *cfg; - struct irq_desc *desc; - int count; - int node; - int i; + int count, node, i; if (!legacy_pic->nr_legacy_irqs) { nr_irqs_gsi = 0; @@ -165,8 +162,7 @@ int __init arch_early_irq_init(void) node = cpu_to_node(0); for (i = 0; i < count; i++) { - desc = irq_to_desc(i); - desc->chip_data = &cfg[i]; + set_irq_chip_data(i, &cfg[i]); zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node); zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node); /* @@ -185,14 +181,7 @@ int __init arch_early_irq_init(void) #ifdef CONFIG_SPARSE_IRQ struct irq_cfg *irq_cfg(unsigned int irq) { - struct irq_cfg *cfg = NULL; - struct irq_desc *desc; - - desc = irq_to_desc(irq); - if (desc) - cfg = get_irq_desc_chip_data(desc); - - return cfg; + return get_irq_chip_data(irq); } static struct irq_cfg *get_one_free_irq_cfg(int node) @@ -1316,17 +1305,17 @@ static inline int IO_APIC_irq_trigger(int irq) } #endif -static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long trigger) +static void ioapic_register_intr(unsigned int irq, unsigned long trigger) { if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || trigger == IOAPIC_LEVEL) - desc->status |= IRQ_LEVEL; + irq_set_status_flags(irq, IRQ_LEVEL); else - desc->status &= ~IRQ_LEVEL; + irq_clear_status_flags(irq, IRQ_LEVEL); if (irq_remapped(irq)) { - desc->status |= IRQ_MOVE_PCNTXT; + irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); if (trigger) set_irq_chip_and_handler_name(irq, &ir_ioapic_chip, handle_fasteoi_irq, @@ -1406,18 +1395,14 @@ int setup_ioapic_entry(int apic_id, int irq, return 0; } -static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq_desc *desc, - int trigger, int polarity) +static void setup_ioapic_irq(int apic_id, int pin, unsigned int irq, + struct irq_cfg *cfg, int trigger, int polarity) { - struct irq_cfg *cfg; struct IO_APIC_route_entry entry; unsigned int dest; if (!IO_APIC_IRQ(irq)) return; - - cfg = get_irq_desc_chip_data(desc); - /* * For legacy irqs, cfg->domain starts with cpu 0 for legacy * controllers like 8259. 
Now that IO-APIC can handle this irq, update @@ -1446,7 +1431,7 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq return; } - ioapic_register_intr(irq, desc, trigger); + ioapic_register_intr(irq, trigger); if (irq < legacy_pic->nr_legacy_irqs) legacy_pic->mask(irq); @@ -1511,8 +1496,8 @@ static void __init setup_IO_APIC_irqs(void) * don't mark it in pin_programmed, so later acpi could * set it correctly when irq < 16 */ - setup_IO_APIC_irq(apic_id, pin, irq, desc, - irq_trigger(idx), irq_polarity(idx)); + setup_ioapic_irq(apic_id, pin, irq, cfg, irq_trigger(idx), + irq_polarity(idx)); } if (notcon) @@ -1566,7 +1551,7 @@ void setup_IO_APIC_irq_extra(u32 gsi) } set_bit(pin, mp_ioapic_routing[apic_id].pin_programmed); - setup_IO_APIC_irq(apic_id, pin, irq, desc, + setup_ioapic_irq(apic_id, pin, irq, cfg, irq_trigger(idx), irq_polarity(idx)); } @@ -2776,9 +2761,9 @@ static struct irq_chip lapic_chip __read_mostly = { .irq_ack = ack_lapic_irq, }; -static void lapic_register_intr(int irq, struct irq_desc *desc) +static void lapic_register_intr(int irq) { - desc->status &= ~IRQ_LEVEL; + irq_clear_status_flags(irq, IRQ_LEVEL); set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, "edge"); } @@ -2881,8 +2866,7 @@ int timer_through_8259 __initdata; */ static inline void __init check_timer(void) { - struct irq_desc *desc = irq_to_desc(0); - struct irq_cfg *cfg = get_irq_desc_chip_data(desc); + struct irq_cfg *cfg = get_irq_chip_data(0); int node = cpu_to_node(0); int apic1, pin1, apic2, pin2; unsigned long flags; @@ -2952,7 +2936,7 @@ static inline void __init check_timer(void) add_pin_to_irq_node(cfg, node, apic1, pin1); setup_timer_IRQ0_pin(apic1, pin1, cfg->vector); } else { - /* for edge trigger, setup_IO_APIC_irq already + /* for edge trigger, setup_ioapic_irq already * leave it unmasked. * so only need to unmask if it is level-trigger * do we really have level trigger timer? 
@@ -3020,7 +3004,7 @@ static inline void __init check_timer(void) apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer as Virtual Wire IRQ...\n"); - lapic_register_intr(0, desc); + lapic_register_intr(0); apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */ legacy_pic->unmask(0); @@ -3457,8 +3441,8 @@ static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec) static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq) { - int ret; struct msi_msg msg; + int ret; ret = msi_compose_msg(dev, irq, &msg, -1); if (ret < 0) @@ -3468,11 +3452,7 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq) write_msi_msg(irq, &msg); if (irq_remapped(irq)) { - struct irq_desc *desc = irq_to_desc(irq); - /* - * irq migration in process context - */ - desc->status |= IRQ_MOVE_PCNTXT; + irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); set_irq_chip_and_handler_name(irq, &msi_ir_chip, handle_edge_irq, "edge"); } else set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge"); @@ -3484,13 +3464,10 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq) int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) { - unsigned int irq; - int ret, sub_handle; + int node, ret, sub_handle, index = 0; + unsigned int irq, irq_want; struct msi_desc *msidesc; - unsigned int irq_want; struct intel_iommu *iommu = NULL; - int index = 0; - int node; /* x86 doesn't support multiple MSI yet */ if (type == PCI_CAP_ID_MSI && nvec > 1) @@ -3676,7 +3653,7 @@ int arch_setup_hpet_msi(unsigned int irq, unsigned int id) return ret; hpet_msi_write(get_irq_data(irq), &msg); - irq_set_status_flags(irq,IRQ_MOVE_PCNTXT); + irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); if (irq_remapped(irq)) set_irq_chip_and_handler_name(irq, &ir_hpet_msi_type, handle_edge_irq, "edge"); @@ -3862,11 +3839,12 @@ static int __io_apic_set_pci_routing(struct device *dev, int irq, trigger = irq_attr->trigger; polarity = irq_attr->polarity; + cfg = get_irq_desc_chip_data(desc); + /* * IRQs < 16 are already in the irq_2_pin[] map */ if (irq >= legacy_pic->nr_legacy_irqs) { - cfg = get_irq_desc_chip_data(desc); if (add_pin_to_irq_node_nopanic(cfg, node, ioapic, pin)) { printk(KERN_INFO "can not add pin %d for irq %d\n", pin, irq); @@ -3874,7 +3852,7 @@ static int __io_apic_set_pci_routing(struct device *dev, int irq, } } - setup_IO_APIC_irq(ioapic, pin, irq, desc, trigger, polarity); + setup_ioapic_irq(ioapic, pin, irq, cfg, trigger, polarity); return 0; } @@ -4258,13 +4236,12 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) void __init pre_init_apic_IRQ0(void) { struct irq_cfg *cfg; - struct irq_desc *desc; printk(KERN_INFO "Early APIC setup for system timer0\n"); #ifndef CONFIG_SMP phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid); #endif - desc = irq_to_desc_alloc_node(0, 0); + irq_to_desc_alloc_node(0, 0); setup_local_APIC(); @@ -4272,5 +4249,5 @@ void __init pre_init_apic_IRQ0(void) add_pin_to_irq_node(cfg, 0, 0, 0); set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge"); - setup_IO_APIC_irq(0, 0, 0, desc, 0, 0); + setup_ioapic_irq(0, 0, 0, cfg, 0, 0); } -- cgit v1.2.3-18-g5258 From f7e909eae444ff733ecc5628af76d89c363ab480 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 8 Oct 2010 21:40:23 +0200 Subject: x86: Prepare the affinity common functions for taking struct irq_data * While at it rename it to sensible function names and fix the return value from unsigned to int for 
__ioapic_set_affinity (set_desc_affinity). Returning -1 in a function returning unsigned int is somewhat strange. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- arch/x86/include/asm/hw_irq.h | 6 ++-- arch/x86/kernel/apic/io_apic.c | 77 ++++++++++++++++-------------------------- arch/x86/kernel/uv_irq.c | 2 +- 3 files changed, 33 insertions(+), 52 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 46c0fe05f23..76848f27b1a 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -95,9 +95,9 @@ extern struct irq_cfg *irq_cfg(unsigned int); extern int assign_irq_vector(int, struct irq_cfg *, const struct cpumask *); extern void send_cleanup_vector(struct irq_cfg *); -struct irq_desc; -extern unsigned int set_desc_affinity(struct irq_desc *, const struct cpumask *, - unsigned int *dest_id); +struct irq_data; +int __ioapic_set_affinity(struct irq_data *, const struct cpumask *, + unsigned int *dest_id); extern int IO_APIC_get_PCI_irq_vector(int bus, int devfn, int pin, struct io_apic_irq_attr *irq_attr); extern void setup_ioapic_dest(void); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 82c3c66e333..60ca9a47087 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2279,65 +2279,46 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq } /* - * Either sets desc->affinity to a valid value, and returns + * Either sets data->affinity to a valid value, and returns * ->cpu_mask_to_apicid of that in dest_id, or returns -1 and - * leaves desc->affinity untouched. + * leaves data->affinity untouched. */ -unsigned int -set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask, - unsigned int *dest_id) +int __ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, + unsigned int *dest_id) { - struct irq_cfg *cfg; - unsigned int irq; + struct irq_cfg *cfg = data->chip_data; if (!cpumask_intersects(mask, cpu_online_mask)) return -1; - irq = desc->irq; - cfg = get_irq_desc_chip_data(desc); - if (assign_irq_vector(irq, cfg, mask)) + if (assign_irq_vector(data->irq, data->chip_data, mask)) return -1; - cpumask_copy(desc->affinity, mask); + cpumask_copy(data->affinity, mask); - *dest_id = apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain); + *dest_id = apic->cpu_mask_to_apicid_and(mask, cfg->domain); return 0; } static int -set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask) +ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, + bool force) { - struct irq_cfg *cfg; + unsigned int dest, irq = data->irq; unsigned long flags; - unsigned int dest; - unsigned int irq; - int ret = -1; - - irq = desc->irq; - cfg = get_irq_desc_chip_data(desc); + int ret; raw_spin_lock_irqsave(&ioapic_lock, flags); - ret = set_desc_affinity(desc, mask, &dest); + ret = __ioapic_set_affinity(data, mask, &dest); if (!ret) { /* Only the high 8 bits are valid. 
*/ dest = SET_APIC_LOGICAL_ID(dest); - __target_IO_APIC_irq(irq, dest, cfg); + __target_IO_APIC_irq(irq, dest, data->chip_data); } raw_spin_unlock_irqrestore(&ioapic_lock, flags); - return ret; } -static int -set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask) -{ - struct irq_desc *desc; - - desc = irq_to_desc(irq); - - return set_ioapic_affinity_irq_desc(desc, mask); -} - #ifdef CONFIG_INTR_REMAP /* @@ -2668,16 +2649,16 @@ static void ir_ack_apic_level(struct irq_data *data) #endif /* CONFIG_INTR_REMAP */ static struct irq_chip ioapic_chip __read_mostly = { - .name = "IO-APIC", - .irq_startup = startup_ioapic_irq, - .irq_mask = mask_ioapic_irq, - .irq_unmask = unmask_ioapic_irq, - .irq_ack = ack_apic_edge, - .irq_eoi = ack_apic_level, + .name = "IO-APIC", + .irq_startup = startup_ioapic_irq, + .irq_mask = mask_ioapic_irq, + .irq_unmask = unmask_ioapic_irq, + .irq_ack = ack_apic_edge, + .irq_eoi = ack_apic_level, #ifdef CONFIG_SMP - .set_affinity = set_ioapic_affinity_irq, + .irq_set_affinity = ioapic_set_affinity, #endif - .irq_retrigger = ioapic_retrigger_irq, + .irq_retrigger = ioapic_retrigger_irq, }; static struct irq_chip ir_ioapic_chip __read_mostly = { @@ -3327,7 +3308,7 @@ static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) struct msi_msg msg; unsigned int dest; - if (set_desc_affinity(desc, mask, &dest)) + if (__ioapic_set_affinity(&desc->irq_data, mask, &dest)) return -1; cfg = get_irq_desc_chip_data(desc); @@ -3359,7 +3340,7 @@ ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) if (get_irte(irq, &irte)) return -1; - if (set_desc_affinity(desc, mask, &dest)) + if (__ioapic_set_affinity(&desc->irq_data, mask, &dest)) return -1; irte.vector = cfg->vector; @@ -3534,7 +3515,7 @@ static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) struct msi_msg msg; unsigned int dest; - if (set_desc_affinity(desc, mask, &dest)) + if (__ioapic_set_affinity(&desc->irq_data, mask, &dest)) return -1; cfg = get_irq_desc_chip_data(desc); @@ -3590,7 +3571,7 @@ static int hpet_msi_set_affinity(struct irq_data *data, struct msi_msg msg; unsigned int dest; - if (set_desc_affinity(desc, mask, &dest)) + if (__ioapic_set_affinity(&desc->irq_data, mask, &dest)) return -1; hpet_msi_read(data->handler_data, &msg); @@ -3693,7 +3674,7 @@ static int set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask) struct irq_cfg *cfg; unsigned int dest; - if (set_desc_affinity(desc, mask, &dest)) + if (__ioapic_set_affinity(&desc->irq_data, mask, &dest)) return -1; cfg = get_irq_desc_chip_data(desc); @@ -4045,14 +4026,14 @@ void __init setup_ioapic_dest(void) */ if (desc->status & (IRQ_NO_BALANCING | IRQ_AFFINITY_SET)) - mask = desc->affinity; + mask = desc->irq_data.affinity; else mask = apic->target_cpus(); if (intr_remapping_enabled) set_ir_ioapic_affinity_irq_desc(desc, mask); else - set_ioapic_affinity_irq_desc(desc, mask); + ioapic_set_affinity(&desc->irq_data, mask, false); } } diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/kernel/uv_irq.c index 2233a42fb90..56a42e11ae0 100644 --- a/arch/x86/kernel/uv_irq.c +++ b/arch/x86/kernel/uv_irq.c @@ -216,7 +216,7 @@ static int uv_set_irq_affinity(unsigned int irq, const struct cpumask *mask) unsigned long mmr_offset; int mmr_pnode; - if (set_desc_affinity(desc, mask, &dest)) + if (__ioapic_set_affinity(&desc->irq_data, mask, &dest)) return -1; mmr_value = 0; -- cgit v1.2.3-18-g5258 From 5346b2a78fa3b900da672928978475acdd4632dc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: 
Fri, 8 Oct 2010 21:49:03 +0200 Subject: x86: Convert msi affinity setting to new chip functions Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Jesse Barnes --- arch/x86/kernel/apic/io_apic.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 60ca9a47087..268c5450392 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3301,26 +3301,24 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, } #ifdef CONFIG_SMP -static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) +static int +msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) { - struct irq_desc *desc = irq_to_desc(irq); - struct irq_cfg *cfg; + struct irq_cfg *cfg = data->chip_data; struct msi_msg msg; unsigned int dest; - if (__ioapic_set_affinity(&desc->irq_data, mask, &dest)) + if (__ioapic_set_affinity(data, mask, &dest)) return -1; - cfg = get_irq_desc_chip_data(desc); - - __get_cached_msi_msg(desc->irq_data.msi_desc, &msg); + __get_cached_msi_msg(data->msi_desc, &msg); msg.data &= ~MSI_DATA_VECTOR_MASK; msg.data |= MSI_DATA_VECTOR(cfg->vector); msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; msg.address_lo |= MSI_ADDR_DEST_ID(dest); - __write_msi_msg(desc->irq_data.msi_desc, &msg); + __write_msi_msg(data->msi_desc, &msg); return 0; } @@ -3370,14 +3368,14 @@ ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) * which implement the MSI or MSI-X Capability Structure. */ static struct irq_chip msi_chip = { - .name = "PCI-MSI", - .irq_unmask = unmask_msi_irq, - .irq_mask = mask_msi_irq, - .irq_ack = ack_apic_edge, + .name = "PCI-MSI", + .irq_unmask = unmask_msi_irq, + .irq_mask = mask_msi_irq, + .irq_ack = ack_apic_edge, #ifdef CONFIG_SMP - .set_affinity = set_msi_irq_affinity, + .irq_set_affinity = msi_set_affinity, #endif - .irq_retrigger = ioapic_retrigger_irq, + .irq_retrigger = ioapic_retrigger_irq, }; static struct irq_chip msi_ir_chip = { -- cgit v1.2.3-18-g5258 From f19f5ecc920215decfea54f26e3eb14064506675 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 8 Oct 2010 21:50:22 +0200 Subject: x86: Convert remapped ioapic affinity setting to new irq chip function Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Suresh Siddha --- arch/x86/kernel/apic/io_apic.c | 58 +++++++++++++++--------------------------- 1 file changed, 20 insertions(+), 38 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 268c5450392..49cc27d5658 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2333,24 +2333,21 @@ ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, * the interrupt-remapping table entry. 
*/ static int -migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask) +ir_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, + bool force) { - struct irq_cfg *cfg; + struct irq_cfg *cfg = data->chip_data; + unsigned int dest, irq = data->irq; struct irte irte; - unsigned int dest; - unsigned int irq; - int ret = -1; if (!cpumask_intersects(mask, cpu_online_mask)) - return ret; + return -EINVAL; - irq = desc->irq; if (get_irte(irq, &irte)) - return ret; + return -EBUSY; - cfg = get_irq_desc_chip_data(desc); if (assign_irq_vector(irq, cfg, mask)) - return ret; + return -EBUSY; dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask); @@ -2365,29 +2362,14 @@ migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask) if (cfg->move_in_progress) send_cleanup_vector(cfg); - cpumask_copy(desc->affinity, mask); - + cpumask_copy(data->affinity, mask); return 0; } -/* - * Migrates the IRQ destination in the process context. - */ -static int set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, - const struct cpumask *mask) -{ - return migrate_ioapic_irq_desc(desc, mask); -} -static int set_ir_ioapic_affinity_irq(unsigned int irq, - const struct cpumask *mask) -{ - struct irq_desc *desc = irq_to_desc(irq); - - return set_ir_ioapic_affinity_irq_desc(desc, mask); -} #else -static inline int set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, - const struct cpumask *mask) +static inline int +ir_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, + bool force) { return 0; } @@ -2662,18 +2644,18 @@ static struct irq_chip ioapic_chip __read_mostly = { }; static struct irq_chip ir_ioapic_chip __read_mostly = { - .name = "IR-IO-APIC", - .irq_startup = startup_ioapic_irq, - .irq_mask = mask_ioapic_irq, - .irq_unmask = unmask_ioapic_irq, + .name = "IR-IO-APIC", + .irq_startup = startup_ioapic_irq, + .irq_mask = mask_ioapic_irq, + .irq_unmask = unmask_ioapic_irq, #ifdef CONFIG_INTR_REMAP - .irq_ack = ir_ack_apic_edge, - .irq_eoi = ir_ack_apic_level, + .irq_ack = ir_ack_apic_edge, + .irq_eoi = ir_ack_apic_level, #ifdef CONFIG_SMP - .set_affinity = set_ir_ioapic_affinity_irq, + .irq_set_affinity = ir_ioapic_set_affinity, #endif #endif - .irq_retrigger = ioapic_retrigger_irq, + .irq_retrigger = ioapic_retrigger_irq, }; static inline void init_IO_APIC_traps(void) @@ -4029,7 +4011,7 @@ void __init setup_ioapic_dest(void) mask = apic->target_cpus(); if (intr_remapping_enabled) - set_ir_ioapic_affinity_irq_desc(desc, mask); + ir_ioapic_set_affinity(&desc->irq_data, mask, false); else ioapic_set_affinity(&desc->irq_data, mask, false); } -- cgit v1.2.3-18-g5258 From b5d1c465794f521c352d9c1a33159750c9c3fa84 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 8 Oct 2010 22:15:49 +0200 Subject: x86: Convert remapped msi to new chip.irq_set_affinity function Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Suresh Siddha Cc: Jesse Barnes --- arch/x86/kernel/apic/io_apic.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 49cc27d5658..13f8e28ba4d 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3310,17 +3310,17 @@ msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) * done in the process context using interrupt-remapping hardware. 
*/ static int -ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) +ir_msi_set_affinity(struct irq_data *data, const struct cpumask *mask, + bool force) { - struct irq_desc *desc = irq_to_desc(irq); - struct irq_cfg *cfg = get_irq_desc_chip_data(desc); - unsigned int dest; + struct irq_cfg *cfg = data->chip_data; + unsigned int dest, irq = data->irq; struct irte irte; if (get_irte(irq, &irte)) return -1; - if (__ioapic_set_affinity(&desc->irq_data, mask, &dest)) + if (__ioapic_set_affinity(data, mask, &dest)) return -1; irte.vector = cfg->vector; @@ -3361,16 +3361,16 @@ static struct irq_chip msi_chip = { }; static struct irq_chip msi_ir_chip = { - .name = "IR-PCI-MSI", - .irq_unmask = unmask_msi_irq, - .irq_mask = mask_msi_irq, + .name = "IR-PCI-MSI", + .irq_unmask = unmask_msi_irq, + .irq_mask = mask_msi_irq, #ifdef CONFIG_INTR_REMAP - .irq_ack = ir_ack_apic_edge, + .irq_ack = ir_ack_apic_edge, #ifdef CONFIG_SMP - .set_affinity = ir_set_msi_irq_affinity, + .irq_set_affinity = ir_msi_set_affinity, #endif #endif - .irq_retrigger = ioapic_retrigger_irq, + .irq_retrigger = ioapic_retrigger_irq, }; /* @@ -3569,16 +3569,16 @@ static int hpet_msi_set_affinity(struct irq_data *data, #endif /* CONFIG_SMP */ static struct irq_chip ir_hpet_msi_type = { - .name = "IR-HPET_MSI", - .irq_unmask = hpet_msi_unmask, - .irq_mask = hpet_msi_mask, + .name = "IR-HPET_MSI", + .irq_unmask = hpet_msi_unmask, + .irq_mask = hpet_msi_mask, #ifdef CONFIG_INTR_REMAP - .irq_ack = ir_ack_apic_edge, + .irq_ack = ir_ack_apic_edge, #ifdef CONFIG_SMP - .set_affinity = ir_set_msi_irq_affinity, + .irq_set_affinity = ir_msi_set_affinity, #endif #endif - .irq_retrigger = ioapic_retrigger_irq, + .irq_retrigger = ioapic_retrigger_irq, }; static struct irq_chip hpet_msi_type = { -- cgit v1.2.3-18-g5258 From fe52b2d25919eaa01c51651a664f4f2ba6bd2a11 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 8 Oct 2010 22:19:29 +0200 Subject: x86: Convert dmar affinity setting to new chip function Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: David Woodhouse --- arch/x86/kernel/apic/io_apic.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 13f8e28ba4d..6f8ac4c542b 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3488,18 +3488,17 @@ void arch_teardown_msi_irq(unsigned int irq) #if defined (CONFIG_DMAR) || defined (CONFIG_INTR_REMAP) #ifdef CONFIG_SMP -static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) +static int +dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask, + bool force) { - struct irq_desc *desc = irq_to_desc(irq); - struct irq_cfg *cfg; + struct irq_cfg *cfg = data->chip_data; + unsigned int dest, irq = data->irq; struct msi_msg msg; - unsigned int dest; - if (__ioapic_set_affinity(&desc->irq_data, mask, &dest)) + if (__ioapic_set_affinity(data, mask, &dest)) return -1; - cfg = get_irq_desc_chip_data(desc); - dmar_msi_read(irq, &msg); msg.data &= ~MSI_DATA_VECTOR_MASK; @@ -3515,14 +3514,14 @@ static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) #endif /* CONFIG_SMP */ static struct irq_chip dmar_msi_type = { - .name = "DMAR_MSI", - .irq_unmask = dmar_msi_unmask, - .irq_mask = dmar_msi_mask, - .irq_ack = ack_apic_edge, + .name = "DMAR_MSI", + .irq_unmask = dmar_msi_unmask, + .irq_mask = dmar_msi_mask, + .irq_ack = ack_apic_edge, #ifdef CONFIG_SMP - 
.set_affinity = dmar_msi_set_affinity, + .irq_set_affinity = dmar_msi_set_affinity, #endif - .irq_retrigger = ioapic_retrigger_irq, + .irq_retrigger = ioapic_retrigger_irq, }; int arch_setup_dmar_msi(unsigned int irq) -- cgit v1.2.3-18-g5258 From 0e09ddf2d71aeff92ff8055ac7600b85c255ee85 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 8 Oct 2010 22:21:26 +0200 Subject: x86: Cleanup hpet affinity setting Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 6f8ac4c542b..0b9ec3cb311 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3545,12 +3545,11 @@ int arch_setup_dmar_msi(unsigned int irq) static int hpet_msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) { - struct irq_desc *desc = irq_to_desc(data->irq); struct irq_cfg *cfg = data->chip_data; struct msi_msg msg; unsigned int dest; - if (__ioapic_set_affinity(&desc->irq_data, mask, &dest)) + if (__ioapic_set_affinity(data, mask, &dest)) return -1; hpet_msi_read(data->handler_data, &msg); -- cgit v1.2.3-18-g5258 From be5b7bf73802a9391158d9fcc0bc6b07670c73a5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 8 Oct 2010 22:31:46 +0200 Subject: x86: Convert ht set_affinity to new chip function Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Jesse Barnes --- arch/x86/kernel/apic/io_apic.c | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 0b9ec3cb311..b144f7a9597 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3646,33 +3646,30 @@ static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector) write_ht_irq_msg(irq, &msg); } -static int set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask) +static int +ht_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) { - struct irq_desc *desc = irq_to_desc(irq); - struct irq_cfg *cfg; + struct irq_cfg *cfg = data->chip_data; unsigned int dest; - if (__ioapic_set_affinity(&desc->irq_data, mask, &dest)) + if (__ioapic_set_affinity(data, mask, &dest)) return -1; - cfg = get_irq_desc_chip_data(desc); - - target_ht_irq(irq, dest, cfg->vector); - + target_ht_irq(data->irq, dest, cfg->vector); return 0; } #endif static struct irq_chip ht_irq_chip = { - .name = "PCI-HT", - .irq_mask = mask_ht_irq, - .irq_unmask = unmask_ht_irq, - .irq_ack = ack_apic_edge, + .name = "PCI-HT", + .irq_mask = mask_ht_irq, + .irq_unmask = unmask_ht_irq, + .irq_ack = ack_apic_edge, #ifdef CONFIG_SMP - .set_affinity = set_ht_irq_affinity, + .irq_set_affinity = ht_set_affinity, #endif - .irq_retrigger = ioapic_retrigger_irq, + .irq_retrigger = ioapic_retrigger_irq, }; int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) -- cgit v1.2.3-18-g5258 From 7e495529b62cf462eb2d8875fe15ca446b8e1f94 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Sep 2010 23:31:50 +0200 Subject: x86: ioapic: Cleanup some more Cleanup after the irq_chip conversion a bit. 
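For illustration only, a minimal sketch of the affinity-callback shape the conversions above all follow; __ioapic_set_affinity() is the helper introduced earlier in this series, example_set_affinity is a placeholder name:

	/*
	 * Illustrative sketch, as if placed in io_apic.c: the new-style
	 * callback receives struct irq_data, derives its state from
	 * data->chip_data, and returns -1 when no valid target exists.
	 */
	static int
	example_set_affinity(struct irq_data *data, const struct cpumask *mask,
			     bool force)
	{
		unsigned int dest;

		if (__ioapic_set_affinity(data, mask, &dest))
			return -1;

		/* reprogram the hardware for data->chip_data and dest here */
		return 0;
	}
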
Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index b144f7a9597..43030995dcc 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -131,13 +131,9 @@ struct irq_pin_list { struct irq_pin_list *next; }; -static struct irq_pin_list *get_one_free_irq_2_pin(int node) +static struct irq_pin_list *alloc_irq_pin_list(int node) { - struct irq_pin_list *pin; - - pin = kzalloc_node(sizeof(*pin), GFP_ATOMIC, node); - - return pin; + return kzalloc_node(sizeof(struct irq_pin_list), GFP_ATOMIC, node); } /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ @@ -232,7 +228,7 @@ init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int node) if (!old_entry) return; - entry = get_one_free_irq_2_pin(node); + entry = alloc_irq_pin_list(node); if (!entry) return; @@ -242,7 +238,7 @@ init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int node) tail = entry; old_entry = old_entry->next; while (old_entry) { - entry = get_one_free_irq_2_pin(node); + entry = alloc_irq_pin_list(node); if (!entry) { entry = head; while (entry) { @@ -471,7 +467,7 @@ static void ioapic_mask_entry(int apic, int pin) * fast in the common case, and fast for shared ISA-space IRQs. */ static int -add_pin_to_irq_node_nopanic(struct irq_cfg *cfg, int node, int apic, int pin) +__add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin) { struct irq_pin_list **last, *entry; @@ -483,7 +479,7 @@ add_pin_to_irq_node_nopanic(struct irq_cfg *cfg, int node, int apic, int pin) last = &entry->next; } - entry = get_one_free_irq_2_pin(node); + entry = alloc_irq_pin_list(node); if (!entry) { printk(KERN_ERR "can not alloc irq_pin_list (%d,%d,%d)\n", node, apic, pin); @@ -498,7 +494,7 @@ add_pin_to_irq_node_nopanic(struct irq_cfg *cfg, int node, int apic, int pin) static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin) { - if (add_pin_to_irq_node_nopanic(cfg, node, apic, pin)) + if (__add_pin_to_irq_node(cfg, node, apic, pin)) panic("IO-APIC: failed to add irq-pin. 
Can not proceed\n"); } @@ -3801,7 +3797,7 @@ static int __io_apic_set_pci_routing(struct device *dev, int irq, * IRQs < 16 are already in the irq_2_pin[] map */ if (irq >= legacy_pic->nr_legacy_irqs) { - if (add_pin_to_irq_node_nopanic(cfg, node, ioapic, pin)) { + if (__add_pin_to_irq_node(cfg, node, ioapic, pin)) { printk(KERN_INFO "can not add pin %d for irq %d\n", pin, irq); return 0; -- cgit v1.2.3-18-g5258 From 6e2fff50a5bd72a3f9e6f3ef6e9137efddb2d580 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 6 Oct 2010 22:07:03 +0200 Subject: x86: ioapic: Cleanup get_one_free_irq_cfg() Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 43030995dcc..452f781a042 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -185,19 +185,18 @@ static struct irq_cfg *get_one_free_irq_cfg(int node) struct irq_cfg *cfg; cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node); - if (cfg) { - if (!zalloc_cpumask_var_node(&cfg->domain, GFP_ATOMIC, node)) { - kfree(cfg); - cfg = NULL; - } else if (!zalloc_cpumask_var_node(&cfg->old_domain, - GFP_ATOMIC, node)) { - free_cpumask_var(cfg->domain); - kfree(cfg); - cfg = NULL; - } - } - + if (!cfg) + return NULL; + if (!zalloc_cpumask_var_node(&cfg->domain, GFP_ATOMIC, node)) + goto out_cfg; + if (!zalloc_cpumask_var_node(&cfg->old_domain, GFP_ATOMIC, node)) + goto out_domain; return cfg; +out_domain: + free_cpumask_var(cfg->domain); +out_cfg: + kfree(cfg); + return NULL; } int arch_init_chip_data(struct irq_desc *desc, int node) -- cgit v1.2.3-18-g5258 From 08c33db6d044d9dc74ddf8d9ee3cb1fa3eca262b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 6 Oct 2010 22:14:21 +0200 Subject: x86: Implement new allocator functions Implement new allocator functions which make use of the core changes. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 54 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 47 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 452f781a042..065c5dc88b8 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -199,6 +199,13 @@ out_cfg: return NULL; } +static void free_irq_cfg(struct irq_cfg *cfg) +{ + free_cpumask_var(cfg->domain); + free_cpumask_var(cfg->old_domain); + kfree(cfg); +} + int arch_init_chip_data(struct irq_desc *desc, int node) { struct irq_cfg *cfg; @@ -299,13 +306,6 @@ void arch_init_copy_chip_data(struct irq_desc *old_desc, init_copy_irq_2_pin(old_cfg, cfg, node); } -static void free_irq_cfg(struct irq_cfg *cfg) -{ - free_cpumask_var(cfg->domain); - free_cpumask_var(cfg->old_domain); - kfree(cfg); -} - void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc) { struct irq_cfg *old_cfg, *cfg; @@ -325,13 +325,53 @@ void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc) /* end for move_irq_desc */ #else + struct irq_cfg *irq_cfg(unsigned int irq) { return irq < nr_irqs ? 
irq_cfgx + irq : NULL; } +static struct irq_cfg *get_one_free_irq_cfg(unsigned int irq, int node) +{ + return irq_cfgx + irq; +} + +static inline void free_irq_cfg(struct irq_cfg *cfg) { } + #endif +static struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node) +{ + int res = irq_alloc_desc_at(at, node); + struct irq_cfg *cfg; + + if (res < 0) { + if (res != -EEXIST) + return NULL; + cfg = get_irq_chip_data(at); + if (cfg) + return cfg; + } + + cfg = get_one_free_irq_cfg(node); + if (cfg) + set_irq_chip_data(at, cfg); + else + irq_free_desc(at); + return cfg; +} + +static int alloc_irq_from(unsigned int from, int node) +{ + return irq_alloc_desc_from(from, node); +} + +static void free_irq_at(unsigned int at, struct irq_cfg *cfg) +{ + free_irq_cfg(cfg); + irq_free_desc(at); +} + struct io_apic { unsigned int index; unsigned int unused[3]; -- cgit v1.2.3-18-g5258 From f981a3dc1941035a108da1276c448de6b10ddac9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 8 Oct 2010 10:44:21 +0200 Subject: x86: io_apic: Prepare alloc/free_irq_cfg() Rename the grossly misnamed get_one_free_irq_cfg() to alloc_irq_cfg(). Add a (not yet used) irq number argument to free_irq_cfg() Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 065c5dc88b8..06da8fe2647 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -180,7 +180,7 @@ struct irq_cfg *irq_cfg(unsigned int irq) return get_irq_chip_data(irq); } -static struct irq_cfg *get_one_free_irq_cfg(int node) +static struct irq_cfg *alloc_irq_cfg(unsigned int irq, int node) { struct irq_cfg *cfg; @@ -199,7 +199,7 @@ out_cfg: return NULL; } -static void free_irq_cfg(struct irq_cfg *cfg) +static void free_irq_cfg(unsigned int at, struct irq_cfg *cfg) { free_cpumask_var(cfg->domain); free_cpumask_var(cfg->old_domain); @@ -212,7 +212,7 @@ int arch_init_chip_data(struct irq_desc *desc, int node) cfg = get_irq_desc_chip_data(desc); if (!cfg) { - cfg = get_one_free_irq_cfg(node); + cfg = alloc_irq_cfg(desc->irq, node); desc->chip_data = cfg; if (!cfg) { printk(KERN_ERR "can not alloc irq_cfg\n"); @@ -289,7 +289,7 @@ void arch_init_copy_chip_data(struct irq_desc *old_desc, struct irq_cfg *cfg; struct irq_cfg *old_cfg; - cfg = get_one_free_irq_cfg(node); + cfg = alloc_irq_cfg(desc->irq, node); if (!cfg) return; @@ -318,7 +318,7 @@ void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc) if (old_cfg) { free_irq_2_pin(old_cfg, cfg); - free_irq_cfg(old_cfg); + free_irq_cfg(old_desc->irq, old_cfg); old_desc->chip_data = NULL; } } @@ -331,12 +331,12 @@ struct irq_cfg *irq_cfg(unsigned int irq) return irq < nr_irqs ? 
irq_cfgx + irq : NULL; } -static struct irq_cfg *get_one_free_irq_cfg(unsigned int irq, int node) +static struct irq_cfg *alloc_irq_cfg(unsigned int irq, int node) { return irq_cfgx + irq; } -static inline void free_irq_cfg(struct irq_cfg *cfg) { } +static inline void free_irq_cfg(unsigned int at, struct irq_cfg *cfg) { } #endif @@ -353,7 +353,7 @@ static struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node) return cfg; } - cfg = get_one_free_irq_cfg(node); + cfg = alloc_irq_cfg(at, node); if (cfg) set_irq_chip_data(at, cfg); else @@ -368,7 +368,7 @@ static int alloc_irq_from(unsigned int from, int node) static void free_irq_at(unsigned int at, struct irq_cfg *cfg) { - free_irq_cfg(cfg); + free_irq_cfg(at, cfg); irq_free_desc(at); } -- cgit v1.2.3-18-g5258 From fe6dab4e79e82ec35879bfe1a8968b7d15ac0d91 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 8 Oct 2010 22:44:02 -0700 Subject: x86: Don't setup ioapic irq for sci twice The sparseirq rework triggered a warning in the iommu code, which was caused by setting up ioapic for ACPI irq 9 twice. This function is solely to handle interrupts which are on a secondary ioapic and outside the legacy irq range. Replace the sparse irq_to_desc check with a non ifdeffed version. [ tglx: Moved it before the ioapic sparse conversion and simplified the inverse logic ] Signed-off-by: Yinghai Lu LKML-Reference: <4CB00122.3030301@kernel.org> Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 06da8fe2647..5aae718a713 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1565,11 +1565,11 @@ void setup_IO_APIC_irq_extra(u32 gsi) return; irq = pin_2_irq(idx, apic_id, pin); -#ifdef CONFIG_SPARSE_IRQ - desc = irq_to_desc(irq); - if (desc) + + /* Only handle the non legacy irqs on secondary ioapics */ + if (apic_id == 0 || irq < NR_IRQS_LEGACY) return; -#endif + desc = irq_to_desc_alloc_node(irq, node); if (!desc) { printk(KERN_INFO "can not get irq_desc for %d\n", irq); -- cgit v1.2.3-18-g5258 From fbc6bff04a095e049be290ff6f6ac68839166bd6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Sep 2010 20:34:53 +0200 Subject: x86: ioapic: Cleanup sparse irq code Switch over to the new allocator and remove all the magic which was caused by the unability to destroy irq descriptors. Get rid of the create_irq_nr() loop for sparse and non sparse irq. 
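For illustration only, a minimal sketch of the allocate/assign/roll-back flow the reworked allocator follows; alloc_irq_from(), alloc_irq_cfg(), free_irq_at() and set_irq_chip_data() are the helpers from the diffs in this series, example_create_irq is a placeholder name:

	/*
	 * Illustrative sketch, as if placed in io_apic.c: reserve an irq
	 * descriptor, allocate the per-irq cfg, and undo the reservation
	 * if the second step fails.
	 */
	static int example_create_irq(unsigned int from, int node)
	{
		struct irq_cfg *cfg;
		int irq;

		irq = alloc_irq_from(from, node);	/* irq_alloc_desc_from() */
		if (irq < 0)
			return irq;

		cfg = alloc_irq_cfg(irq, node);
		if (!cfg) {
			free_irq_at(irq, NULL);		/* releases the descriptor */
			return -ENOMEM;
		}

		set_irq_chip_data(irq, cfg);
		return irq;
	}
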
Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- arch/x86/Kconfig | 1 - arch/x86/kernel/apic/io_apic.c | 107 ++++++++++++++++++----------------------- 2 files changed, 48 insertions(+), 60 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 3ec657f7ee7..8cc510874e1 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -61,7 +61,6 @@ config X86 select HAVE_USER_RETURN_NOTIFIER select HAVE_GENERIC_HARDIRQS select HAVE_SPARSE_IRQ - select NUMA_IRQ_DESC if (SPARSE_IRQ && NUMA) select GENERIC_IRQ_PROBE select GENERIC_PENDING_IRQ if SMP diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 5aae718a713..ed340297571 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -157,6 +157,9 @@ int __init arch_early_irq_init(void) count = ARRAY_SIZE(irq_cfgx); node = cpu_to_node(0); + /* Make sure the legacy interrupts are marked in the bitmap */ + irq_reserve_irqs(0, legacy_pic->nr_legacy_irqs); + for (i = 0; i < count; i++) { set_irq_chip_data(i, &cfg[i]); zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node); @@ -201,11 +204,15 @@ out_cfg: static void free_irq_cfg(unsigned int at, struct irq_cfg *cfg) { + if (!cfg) + return; + set_irq_chip_data(at, NULL); free_cpumask_var(cfg->domain); free_cpumask_var(cfg->old_domain); kfree(cfg); } +#if 0 int arch_init_chip_data(struct irq_desc *desc, int node) { struct irq_cfg *cfg; @@ -323,6 +330,7 @@ void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc) } } /* end for move_irq_desc */ +#endif #else @@ -1479,11 +1487,9 @@ static struct { static void __init setup_IO_APIC_irqs(void) { - int apic_id, pin, idx, irq; - int notcon = 0; - struct irq_desc *desc; - struct irq_cfg *cfg; + int apic_id, pin, idx, irq, notcon = 0; int node = cpu_to_node(0); + struct irq_cfg *cfg; apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); @@ -1520,12 +1526,10 @@ static void __init setup_IO_APIC_irqs(void) apic->multi_timer_check(apic_id, irq)) continue; - desc = irq_to_desc_alloc_node(irq, node); - if (!desc) { - printk(KERN_INFO "can not get irq_desc for %d\n", irq); + cfg = alloc_irq_and_cfg_at(irq, node); + if (!cfg) continue; - } - cfg = get_irq_desc_chip_data(desc); + add_pin_to_irq_node(cfg, node, apic_id, pin); /* * don't mark it in pin_programmed, so later acpi could @@ -1547,9 +1551,7 @@ static void __init setup_IO_APIC_irqs(void) */ void setup_IO_APIC_irq_extra(u32 gsi) { - int apic_id = 0, pin, idx, irq; - int node = cpu_to_node(0); - struct irq_desc *desc; + int apic_id = 0, pin, idx, irq, node = cpu_to_node(0); struct irq_cfg *cfg; /* @@ -1570,13 +1572,10 @@ void setup_IO_APIC_irq_extra(u32 gsi) if (apic_id == 0 || irq < NR_IRQS_LEGACY) return; - desc = irq_to_desc_alloc_node(irq, node); - if (!desc) { - printk(KERN_INFO "can not get irq_desc for %d\n", irq); + cfg = alloc_irq_and_cfg_at(irq, node); + if (!cfg) return; - } - cfg = get_irq_desc_chip_data(desc); add_pin_to_irq_node(cfg, node, apic_id, pin); if (test_bit(pin, mp_ioapic_routing[apic_id].pin_programmed)) { @@ -3177,44 +3176,37 @@ device_initcall(ioapic_init_sysfs); /* * Dynamic irq allocate and deallocation */ -unsigned int create_irq_nr(unsigned int irq_want, int node) +unsigned int create_irq_nr(unsigned int from, int node) { - /* Allocate an unused irq */ - unsigned int irq; - unsigned int new; + struct irq_cfg *cfg; unsigned long flags; - struct irq_cfg *cfg_new = NULL; - struct irq_desc *desc_new = NULL; - - irq = 0; - if (irq_want < nr_irqs_gsi) - irq_want = 
nr_irqs_gsi; - - raw_spin_lock_irqsave(&vector_lock, flags); - for (new = irq_want; new < nr_irqs; new++) { - desc_new = irq_to_desc_alloc_node(new, node); - if (!desc_new) { - printk(KERN_INFO "can not get irq_desc for %d\n", new); - continue; - } - cfg_new = get_irq_desc_chip_data(desc_new); - - if (cfg_new->vector != 0) - continue; + unsigned int ret = 0; + int irq; - desc_new = move_irq_desc(desc_new, node); - cfg_new = get_irq_desc_chip_data(desc_new); + if (from < nr_irqs_gsi) + from = nr_irqs_gsi; - if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0) - irq = new; - break; + irq = alloc_irq_from(from, node); + if (irq < 0) + return 0; + cfg = alloc_irq_cfg(irq, node); + if (!cfg) { + free_irq_at(irq, NULL); + return 0; } - raw_spin_unlock_irqrestore(&vector_lock, flags); - if (irq > 0) - dynamic_irq_init_keep_chip_data(irq); + raw_spin_lock_irqsave(&vector_lock, flags); + if (!__assign_irq_vector(irq, cfg, apic->target_cpus())) + ret = irq; + raw_spin_unlock_irqrestore(&vector_lock, flags); - return irq; + if (ret) { + set_irq_chip_data(irq, cfg); + irq_clear_status_flags(irq, IRQ_NOREQUEST); + } else { + free_irq_at(irq, cfg); + } + return ret; } int create_irq(void) @@ -3234,14 +3226,16 @@ int create_irq(void) void destroy_irq(unsigned int irq) { + struct irq_cfg *cfg = get_irq_chip_data(irq); unsigned long flags; - dynamic_irq_cleanup_keep_chip_data(irq); + irq_set_status_flags(irq, IRQ_NOREQUEST|IRQ_NOPROBE); free_irte(irq); raw_spin_lock_irqsave(&vector_lock, flags); - __clear_irq_vector(irq, get_irq_chip_data(irq)); + __clear_irq_vector(irq, cfg); raw_spin_unlock_irqrestore(&vector_lock, flags); + free_irq_at(irq, cfg); } /* @@ -3802,7 +3796,6 @@ int __init arch_probe_nr_irqs(void) static int __io_apic_set_pci_routing(struct device *dev, int irq, struct io_apic_irq_attr *irq_attr) { - struct irq_desc *desc; struct irq_cfg *cfg; int node; int ioapic, pin; @@ -3820,18 +3813,14 @@ static int __io_apic_set_pci_routing(struct device *dev, int irq, else node = cpu_to_node(0); - desc = irq_to_desc_alloc_node(irq, node); - if (!desc) { - printk(KERN_INFO "can not get irq_desc %d\n", irq); + cfg = alloc_irq_and_cfg_at(irq, node); + if (!cfg) return 0; - } pin = irq_attr->ioapic_pin; trigger = irq_attr->trigger; polarity = irq_attr->polarity; - cfg = get_irq_desc_chip_data(desc); - /* * IRQs < 16 are already in the irq_2_pin[] map */ @@ -4232,11 +4221,11 @@ void __init pre_init_apic_IRQ0(void) #ifndef CONFIG_SMP phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid); #endif - irq_to_desc_alloc_node(0, 0); + /* Make sure the irq descriptor is set up */ + cfg = alloc_irq_and_cfg_at(0, 0); setup_local_APIC(); - cfg = irq_cfg(0); add_pin_to_irq_node(cfg, 0, 0, 0); set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge"); -- cgit v1.2.3-18-g5258 From bc5fdf9f3aad37406b3c8d635a7940cd65de0c12 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 8 Oct 2010 10:40:53 +0200 Subject: x86: io_apic: Remove the now unused sparse_irq arch_* functions Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 120 ----------------------------------------- 1 file changed, 120 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index ed340297571..6ff6bb883c5 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -212,126 +212,6 @@ static void free_irq_cfg(unsigned int at, struct irq_cfg *cfg) kfree(cfg); } -#if 0 -int 
arch_init_chip_data(struct irq_desc *desc, int node) -{ - struct irq_cfg *cfg; - - cfg = get_irq_desc_chip_data(desc); - if (!cfg) { - cfg = alloc_irq_cfg(desc->irq, node); - desc->chip_data = cfg; - if (!cfg) { - printk(KERN_ERR "can not alloc irq_cfg\n"); - BUG_ON(1); - } - } - - return 0; -} - -/* for move_irq_desc */ -static void -init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int node) -{ - struct irq_pin_list *old_entry, *head, *tail, *entry; - - cfg->irq_2_pin = NULL; - old_entry = old_cfg->irq_2_pin; - if (!old_entry) - return; - - entry = alloc_irq_pin_list(node); - if (!entry) - return; - - entry->apic = old_entry->apic; - entry->pin = old_entry->pin; - head = entry; - tail = entry; - old_entry = old_entry->next; - while (old_entry) { - entry = alloc_irq_pin_list(node); - if (!entry) { - entry = head; - while (entry) { - head = entry->next; - kfree(entry); - entry = head; - } - /* still use the old one */ - return; - } - entry->apic = old_entry->apic; - entry->pin = old_entry->pin; - tail->next = entry; - tail = entry; - old_entry = old_entry->next; - } - - tail->next = NULL; - cfg->irq_2_pin = head; -} - -static void free_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg) -{ - struct irq_pin_list *entry, *next; - - if (old_cfg->irq_2_pin == cfg->irq_2_pin) - return; - - entry = old_cfg->irq_2_pin; - - while (entry) { - next = entry->next; - kfree(entry); - entry = next; - } - old_cfg->irq_2_pin = NULL; -} - -void arch_init_copy_chip_data(struct irq_desc *old_desc, - struct irq_desc *desc, int node) -{ - struct irq_cfg *cfg; - struct irq_cfg *old_cfg; - - cfg = alloc_irq_cfg(desc->irq, node); - - if (!cfg) - return; - - desc->chip_data = cfg; - - old_cfg = old_desc->chip_data; - - cfg->vector = old_cfg->vector; - cfg->move_in_progress = old_cfg->move_in_progress; - cpumask_copy(cfg->domain, old_cfg->domain); - cpumask_copy(cfg->old_domain, old_cfg->old_domain); - - init_copy_irq_2_pin(old_cfg, cfg, node); -} - -void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc) -{ - struct irq_cfg *old_cfg, *cfg; - - old_cfg = get_irq_desc_chip_data(old_desc); - cfg = get_irq_desc_chip_data(desc); - - if (old_cfg == cfg) - return; - - if (old_cfg) { - free_irq_2_pin(old_cfg, cfg); - free_irq_cfg(old_desc->irq, old_cfg); - old_desc->chip_data = NULL; - } -} -/* end for move_irq_desc */ -#endif - #else struct irq_cfg *irq_cfg(unsigned int irq) -- cgit v1.2.3-18-g5258 From 423f085952fd7253407cb92984cc2d495a564481 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 10 Oct 2010 11:39:09 +0200 Subject: x86: Embedd irq_2_iommu into irq_cfg That interrupt remapping code is x86 specific and tied to the io_apic code. No need for separate allocator functions in the interrupt remapping code. This allows to simplify the code and irq_2_iommu is small (13 bytes on 64bit) so it's not a real problem even if interrupt remapping is runtime disabled. If it's compile time disabled the impact is zero. 
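The payoff, sketched below purely for illustration (assuming CONFIG_INTR_REMAP is enabled; the real conversions happen in the follow-up patches), is that remapping state becomes reachable straight from the chip data, with no separate allocation or lookup table:

	/* Illustrative only: the embedded irq_2_iommu is reached through the irq_cfg
	 * that is already stored as chip data for the interrupt. */
	struct irq_cfg *cfg = get_irq_chip_data(irq);
	bool remapped = cfg->irq_2_iommu.iommu != NULL;	/* what irq_remapped(cfg) tests in the next patch */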
Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Acked-by: Suresh Siddha Cc: David Woodhouse Cc: Jesse Barnes --- arch/x86/include/asm/hw_irq.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 76848f27b1a..e756c4bfed9 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -78,6 +78,13 @@ static inline void set_io_apic_irq_attr(struct io_apic_irq_attr *irq_attr, irq_attr->polarity = polarity; } +struct irq_2_iommu { + struct intel_iommu *iommu; + u16 irte_index; + u16 sub_handle; + u8 irte_mask; +}; + /* * This is performance-critical, we want to do it O(1) * @@ -89,6 +96,9 @@ struct irq_cfg { cpumask_var_t old_domain; u8 vector; u8 move_in_progress : 1; +#ifdef CONFIG_INTR_REMAP + struct irq_2_iommu irq_2_iommu; +#endif }; extern struct irq_cfg *irq_cfg(unsigned int); -- cgit v1.2.3-18-g5258 From 1a0730d6649113c820217387a011a17dd4aff3ad Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 11 Oct 2010 11:55:37 +0200 Subject: x86: Speed up the irq_remapped check in hot pathes irq_2_iommu is in struct irq_cfg, so we can do the irq_remapped check based on irq_cfg instead of going through a lookup function. That's especially interesting in the eoi_ioapic_irq() hotpath. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Acked-by: Suresh Siddha Cc: David Woodhouse Cc: Jesse Barnes --- arch/x86/include/asm/irq_remapping.h | 8 ++++++++ arch/x86/kernel/apic/io_apic.c | 12 ++++++------ 2 files changed, 14 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h index 8d841505344..1c23360fb2d 100644 --- a/arch/x86/include/asm/irq_remapping.h +++ b/arch/x86/include/asm/irq_remapping.h @@ -24,10 +24,18 @@ static inline void prepare_irte(struct irte *irte, int vector, irte->dest_id = IRTE_DEST(dest); irte->redir_hint = 1; } +static inline bool irq_remapped(struct irq_cfg *cfg) +{ + return cfg->irq_2_iommu.iommu != NULL; +} #else static void prepare_irte(struct irte *irte, int vector, unsigned int dest) { } +static inline bool irq_remapped(struct irq_cfg *cfg) +{ + return false; +} #endif #endif /* _ASM_X86_IRQ_REMAPPING_H */ diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 6ff6bb883c5..1b8e8a10612 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1237,7 +1237,7 @@ static void ioapic_register_intr(unsigned int irq, unsigned long trigger) else irq_clear_status_flags(irq, IRQ_LEVEL); - if (irq_remapped(irq)) { + if (irq_remapped(get_irq_chip_data(irq))) { irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); if (trigger) set_irq_chip_and_handler_name(irq, &ir_ioapic_chip, @@ -2183,7 +2183,7 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq * With interrupt-remapping, destination information comes * from interrupt-remapping table entry. */ - if (!irq_remapped(irq)) + if (!irq_remapped(cfg)) io_apic_write(apic, 0x11 + pin*2, dest); reg = io_apic_read(apic, 0x10 + pin*2); reg &= ~IO_APIC_REDIR_VECTOR_MASK; @@ -2415,7 +2415,7 @@ static void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) * intr-remapping table entry. Hence for the io-apic * EOI we use the pin number. 
*/ - if (irq_remapped(irq)) + if (irq_remapped(cfg)) io_apic_eoi(entry->apic, entry->pin); else io_apic_eoi(entry->apic, cfg->vector); @@ -3139,7 +3139,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus()); - if (irq_remapped(irq)) { + if (irq_remapped(get_irq_chip_data(irq))) { struct irte irte; int ir_index; u16 sub_handle; @@ -3321,7 +3321,7 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq) set_irq_msi(irq, msidesc); write_msi_msg(irq, &msg); - if (irq_remapped(irq)) { + if (irq_remapped(get_irq_chip_data(irq))) { irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); set_irq_chip_and_handler_name(irq, &msi_ir_chip, handle_edge_irq, "edge"); } else @@ -3522,7 +3522,7 @@ int arch_setup_hpet_msi(unsigned int irq, unsigned int id) hpet_msi_write(get_irq_data(irq), &msg); irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); - if (irq_remapped(irq)) + if (irq_remapped(get_irq_chip_data(irq))) set_irq_chip_and_handler_name(irq, &ir_hpet_msi_type, handle_edge_irq, "edge"); else -- cgit v1.2.3-18-g5258 From 1a8ce7ff68d777195da2d340561bda610e533a64 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Oct 2010 21:08:56 +0200 Subject: x86: Make io_apic.c local functions static No users outside of io_apic.c Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- arch/x86/include/asm/io_apic.h | 6 ------ arch/x86/kernel/apic/io_apic.c | 10 +++++----- 2 files changed, 5 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index 9cb2edb87c2..c8be4566c3d 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -170,12 +170,6 @@ extern int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries); extern void probe_nr_irqs_gsi(void); -extern int setup_ioapic_entry(int apic, int irq, - struct IO_APIC_route_entry *entry, - unsigned int destination, int trigger, - int polarity, int vector, int pin); -extern void ioapic_write_entry(int apic, int pin, - struct IO_APIC_route_entry e); extern void setup_ioapic_ids_from_mpc(void); struct mp_ioapic_gsi{ diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 1b8e8a10612..0102543b564 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -364,7 +364,7 @@ __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) io_apic_write(apic, 0x10 + 2*pin, eu.w1); } -void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) +static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) { unsigned long flags; raw_spin_lock_irqsave(&ioapic_lock, flags); @@ -1259,10 +1259,10 @@ static void ioapic_register_intr(unsigned int irq, unsigned long trigger) handle_edge_irq, "edge"); } -int setup_ioapic_entry(int apic_id, int irq, - struct IO_APIC_route_entry *entry, - unsigned int destination, int trigger, - int polarity, int vector, int pin) +static int setup_ioapic_entry(int apic_id, int irq, + struct IO_APIC_route_entry *entry, + unsigned int destination, int trigger, + int polarity, int vector, int pin) { /* * add it to the IO-APIC irq-routing table: -- cgit v1.2.3-18-g5258 From 48b2650196364e4ef124efb841b63c2326e4ccb2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 30 Sep 2010 11:43:08 +0200 Subject: x86: uv: Clean up the direct access to irq_desc Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- arch/x86/include/asm/hw_irq.h | 1 - 
arch/x86/kernel/apic/io_apic.c | 2 +- arch/x86/kernel/uv_irq.c | 55 +++++++++++++++--------------------------- 3 files changed, 20 insertions(+), 38 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index e756c4bfed9..d5905fd8ba4 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -101,7 +101,6 @@ struct irq_cfg { #endif }; -extern struct irq_cfg *irq_cfg(unsigned int); extern int assign_irq_vector(int, struct irq_cfg *, const struct cpumask *); extern void send_cleanup_vector(struct irq_cfg *); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 0102543b564..5193f201b91 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -178,7 +178,7 @@ int __init arch_early_irq_init(void) } #ifdef CONFIG_SPARSE_IRQ -struct irq_cfg *irq_cfg(unsigned int irq) +static struct irq_cfg *irq_cfg(unsigned int irq) { return get_irq_chip_data(irq); } diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/kernel/uv_irq.c index 56a42e11ae0..7b24460917d 100644 --- a/arch/x86/kernel/uv_irq.c +++ b/arch/x86/kernel/uv_irq.c @@ -28,34 +28,21 @@ struct uv_irq_2_mmr_pnode{ static spinlock_t uv_irq_lock; static struct rb_root uv_irq_root; -static int uv_set_irq_affinity(unsigned int, const struct cpumask *); +static int uv_set_irq_affinity(struct irq_data *, const struct cpumask *, bool); -static void uv_noop(unsigned int irq) -{ -} - -static unsigned int uv_noop_ret(unsigned int irq) -{ - return 0; -} +static void uv_noop(struct irq_data *data) { } -static void uv_ack_apic(unsigned int irq) +static void uv_ack_apic(struct irq_data *data) { ack_APIC_irq(); } static struct irq_chip uv_irq_chip = { - .name = "UV-CORE", - .startup = uv_noop_ret, - .shutdown = uv_noop, - .enable = uv_noop, - .disable = uv_noop, - .ack = uv_noop, - .mask = uv_noop, - .unmask = uv_noop, - .eoi = uv_ack_apic, - .end = uv_noop, - .set_affinity = uv_set_irq_affinity, + .name = "UV-CORE", + .irq_mask = uv_noop, + .irq_unmask = uv_noop, + .irq_eoi = uv_ack_apic, + .irq_set_affinity = uv_set_irq_affinity, }; /* @@ -144,26 +131,22 @@ arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, unsigned long mmr_offset, int limit) { const struct cpumask *eligible_cpu = cpumask_of(cpu); - struct irq_desc *desc = irq_to_desc(irq); - struct irq_cfg *cfg; - int mmr_pnode; + struct irq_cfg *cfg = get_irq_chip_data(irq); unsigned long mmr_value; struct uv_IO_APIC_route_entry *entry; - int err; + int mmr_pnode, err; BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); - cfg = irq_cfg(irq); - err = assign_irq_vector(irq, cfg, eligible_cpu); if (err != 0) return err; if (limit == UV_AFFINITY_CPU) - desc->status |= IRQ_NO_BALANCING; + irq_set_status_flags(irq, IRQ_NO_BALANCING); else - desc->status |= IRQ_MOVE_PCNTXT; + irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); set_irq_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq, irq_name); @@ -206,17 +189,17 @@ static void arch_disable_uv_irq(int mmr_pnode, unsigned long mmr_offset) uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); } -static int uv_set_irq_affinity(unsigned int irq, const struct cpumask *mask) +static int +uv_set_irq_affinity(struct irq_data *data, const struct cpumask *mask, + bool force) { - struct irq_desc *desc = irq_to_desc(irq); - struct irq_cfg *cfg = get_irq_desc_chip_data(desc); + struct irq_cfg *cfg = data->chip_data; unsigned int dest; - unsigned long mmr_value; + unsigned long mmr_value, 
mmr_offset; struct uv_IO_APIC_route_entry *entry; - unsigned long mmr_offset; int mmr_pnode; - if (__ioapic_set_affinity(&desc->irq_data, mask, &dest)) + if (__ioapic_set_affinity(data, mask, &dest)) return -1; mmr_value = 0; @@ -231,7 +214,7 @@ static int uv_set_irq_affinity(unsigned int irq, const struct cpumask *mask) entry->dest = dest; /* Get previously stored MMR and pnode of hub sourcing interrupts */ - if (uv_irq_2_mmr_info(irq, &mmr_offset, &mmr_pnode)) + if (uv_irq_2_mmr_info(data->irq, &mmr_offset, &mmr_pnode)) return -1; uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); -- cgit v1.2.3-18-g5258 From ad9f43340f48c5f7a0a5ef7656986e23d06bf996 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 30 Sep 2010 11:26:43 +0200 Subject: x86: Use sane enumeration Instead of looping through all interrupts, use the bitmap lookup to find the next. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 5193f201b91..057b0e13d1c 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1160,7 +1160,6 @@ void __setup_vector_irq(int cpu) /* Initialize vector_irq on a new cpu */ int irq, vector; struct irq_cfg *cfg; - struct irq_desc *desc; /* * vector_lock will make sure that we don't run into irq vector @@ -1169,9 +1168,10 @@ void __setup_vector_irq(int cpu) */ raw_spin_lock(&vector_lock); /* Mark the inuse vectors */ - for_each_irq_desc(irq, desc) { - cfg = get_irq_desc_chip_data(desc); - + for_each_active_irq(irq) { + cfg = get_irq_chip_data(irq); + if (!cfg) + continue; /* * If it is a legacy IRQ handled by the legacy PIC, this cpu * will be part of the irq_cfg's domain. @@ -1516,7 +1516,6 @@ __apicdebuginit(void) print_IO_APIC(void) union IO_APIC_reg_03 reg_03; unsigned long flags; struct irq_cfg *cfg; - struct irq_desc *desc; unsigned int irq; printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); @@ -1603,10 +1602,10 @@ __apicdebuginit(void) print_IO_APIC(void) } } printk(KERN_DEBUG "IRQ to pin mappings:\n"); - for_each_irq_desc(irq, desc) { + for_each_active_irq(irq) { struct irq_pin_list *entry; - cfg = get_irq_desc_chip_data(desc); + cfg = get_irq_chip_data(irq); if (!cfg) continue; entry = cfg->irq_2_pin; @@ -2574,9 +2573,8 @@ static struct irq_chip ir_ioapic_chip __read_mostly = { static inline void init_IO_APIC_traps(void) { - int irq; - struct irq_desc *desc; struct irq_cfg *cfg; + unsigned int irq; /* * NOTE! The local APIC isn't very good at handling @@ -2589,8 +2587,8 @@ static inline void init_IO_APIC_traps(void) * Also, we've got to be careful not to trash gate * 0x80, because int 0x80 is hm, kind of importantish. ;) */ - for_each_irq_desc(irq, desc) { - cfg = get_irq_desc_chip_data(desc); + for_each_active_irq(irq) { + cfg = get_irq_chip_data(irq); if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) { /* * Hmm.. We don't have an entry for this, @@ -2601,7 +2599,7 @@ static inline void init_IO_APIC_traps(void) legacy_pic->make_irq(irq); else /* Strange. Oh, well.. 
*/ - desc->chip = &no_irq_chip; + set_irq_chip(irq, &no_irq_chip); } } } -- cgit v1.2.3-18-g5258 From c2f31c37b7303150ffcce53f6c6ed4ac62952b34 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 30 Sep 2010 12:19:03 +0200 Subject: x86: lguest: Use new irq allocator Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Rusty Russell --- arch/x86/lguest/boot.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 2d4e6fcac83..73b1e1a1f48 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -838,12 +838,12 @@ static void __init lguest_init_IRQ(void) * rather than set them in lguest_init_IRQ we are called here every time an * lguest device needs an interrupt. * - * FIXME: irq_to_desc_alloc_node() can fail due to lack of memory, we should + * FIXME: irq_alloc_desc_at() can fail due to lack of memory, we should * pass that up! */ void lguest_setup_irq(unsigned int irq) { - irq_to_desc_alloc_node(irq, 0); + irq_alloc_desc_at(irq, 0); set_irq_chip_and_handler_name(irq, &lguest_irq_controller, handle_level_irq, "level"); } -- cgit v1.2.3-18-g5258 From 2ee39065988d632b403f8470942b0b5edee3632b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 6 Oct 2010 16:28:51 +0200 Subject: x86: Switch sparse_irq allocations to GFP_KERNEL No callers from atomic context (except boot) anymore. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 057b0e13d1c..20e47e04539 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -133,7 +133,7 @@ struct irq_pin_list { static struct irq_pin_list *alloc_irq_pin_list(int node) { - return kzalloc_node(sizeof(struct irq_pin_list), GFP_ATOMIC, node); + return kzalloc_node(sizeof(struct irq_pin_list), GFP_KERNEL, node); } /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ @@ -162,8 +162,8 @@ int __init arch_early_irq_init(void) for (i = 0; i < count; i++) { set_irq_chip_data(i, &cfg[i]); - zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node); - zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node); + zalloc_cpumask_var_node(&cfg[i].domain, GFP_KERNEL, node); + zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_KERNEL, node); /* * For legacy IRQ's, start with assigning irq0 to irq15 to * IRQ0_VECTOR to IRQ15_VECTOR on cpu 0. 
@@ -187,12 +187,12 @@ static struct irq_cfg *alloc_irq_cfg(unsigned int irq, int node) { struct irq_cfg *cfg; - cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node); + cfg = kzalloc_node(sizeof(*cfg), GFP_KERNEL, node); if (!cfg) return NULL; - if (!zalloc_cpumask_var_node(&cfg->domain, GFP_ATOMIC, node)) + if (!zalloc_cpumask_var_node(&cfg->domain, GFP_KERNEL, node)) goto out_cfg; - if (!zalloc_cpumask_var_node(&cfg->old_domain, GFP_ATOMIC, node)) + if (!zalloc_cpumask_var_node(&cfg->old_domain, GFP_KERNEL, node)) goto out_domain; return cfg; out_domain: @@ -595,14 +595,14 @@ struct IO_APIC_route_entry **alloc_ioapic_entries(void) struct IO_APIC_route_entry **ioapic_entries; ioapic_entries = kzalloc(sizeof(*ioapic_entries) * nr_ioapics, - GFP_ATOMIC); + GFP_KERNEL); if (!ioapic_entries) return 0; for (apic = 0; apic < nr_ioapics; apic++) { ioapic_entries[apic] = kzalloc(sizeof(struct IO_APIC_route_entry) * - nr_ioapic_registers[apic], GFP_ATOMIC); + nr_ioapic_registers[apic], GFP_KERNEL); if (!ioapic_entries[apic]) goto nomem; } -- cgit v1.2.3-18-g5258 From bf1ebf007911d70a89de9a4a1b36d403e8eb064b Mon Sep 17 00:00:00 2001 From: Daniel Drake Date: Sun, 10 Oct 2010 10:40:32 +0100 Subject: x86, olpc: Add XO-1 poweroff support Add a pm_power_off handler for the OLPC XO-1 laptop. The driver can be built modular and follows the behaviour of the APM driver, setting pm_power_off to NULL on unload. However, the ability to unload the module will probably be removed (with a simple __module_get(THIS_MODULE)) if/when XO-1 suspend/resume support is added to this file at a later date. Signed-off-by: Daniel Drake LKML-Reference: <20101010094032.9AE669D401B@zog.reactivated.net> Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 6 ++ arch/x86/kernel/Makefile | 1 + arch/x86/kernel/olpc-xo1.c | 140 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 147 insertions(+) create mode 100644 arch/x86/kernel/olpc-xo1.c (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index c2552557134..0bcb8765b1c 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2066,6 +2066,12 @@ config OLPC Add support for detecting the unique features of the OLPC XO hardware. +config OLPC_XO1 + tristate "OLPC XO-1 support" + depends on OLPC + ---help--- + Add support for non-essential features of the OLPC XO-1 laptop. + config OLPC_OPENFIRMWARE bool "Support for OLPC's Open Firmware" depends on !X86_64 && !X86_PAE diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 0925676266b..e9777791e39 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -104,6 +104,7 @@ obj-$(CONFIG_SCx200) += scx200.o scx200-y += scx200_32.o obj-$(CONFIG_OLPC) += olpc.o +obj-$(CONFIG_OLPC_XO1) += olpc-xo1.o obj-$(CONFIG_OLPC_OPENFIRMWARE) += olpc_ofw.o obj-$(CONFIG_X86_MRST) += mrst.o diff --git a/arch/x86/kernel/olpc-xo1.c b/arch/x86/kernel/olpc-xo1.c new file mode 100644 index 00000000000..f5442c03abc --- /dev/null +++ b/arch/x86/kernel/olpc-xo1.c @@ -0,0 +1,140 @@ +/* + * Support for features of the OLPC XO-1 laptop + * + * Copyright (C) 2010 One Laptop per Child + * Copyright (C) 2006 Red Hat, Inc. + * Copyright (C) 2006 Advanced Micro Devices, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include + +#include +#include + +#define DRV_NAME "olpc-xo1" + +#define PMS_BAR 4 +#define ACPI_BAR 5 + +/* PMC registers (PMS block) */ +#define PM_SCLK 0x10 +#define PM_IN_SLPCTL 0x20 +#define PM_WKXD 0x34 +#define PM_WKD 0x30 +#define PM_SSC 0x54 + +/* PM registers (ACPI block) */ +#define PM1_CNT 0x08 +#define PM_GPE0_STS 0x18 + +static unsigned long acpi_base; +static unsigned long pms_base; + +static void xo1_power_off(void) +{ + printk(KERN_INFO "OLPC XO-1 power off sequence...\n"); + + /* Enable all of these controls with 0 delay */ + outl(0x40000000, pms_base + PM_SCLK); + outl(0x40000000, pms_base + PM_IN_SLPCTL); + outl(0x40000000, pms_base + PM_WKXD); + outl(0x40000000, pms_base + PM_WKD); + + /* Clear status bits (possibly unnecessary) */ + outl(0x0002ffff, pms_base + PM_SSC); + outl(0xffffffff, acpi_base + PM_GPE0_STS); + + /* Write SLP_EN bit to start the machinery */ + outl(0x00002000, acpi_base + PM1_CNT); +} + +/* Read the base addresses from the PCI BAR info */ +static int __devinit setup_bases(struct pci_dev *pdev) +{ + int r; + + r = pci_enable_device_io(pdev); + if (r) { + dev_err(&pdev->dev, "can't enable device IO\n"); + return r; + } + + r = pci_request_region(pdev, ACPI_BAR, DRV_NAME); + if (r) { + dev_err(&pdev->dev, "can't alloc PCI BAR #%d\n", ACPI_BAR); + return r; + } + + r = pci_request_region(pdev, PMS_BAR, DRV_NAME); + if (r) { + dev_err(&pdev->dev, "can't alloc PCI BAR #%d\n", PMS_BAR); + pci_release_region(pdev, ACPI_BAR); + return r; + } + + acpi_base = pci_resource_start(pdev, ACPI_BAR); + pms_base = pci_resource_start(pdev, PMS_BAR); + + return 0; +} + +static int __devinit olpc_xo1_probe(struct platform_device *pdev) +{ + struct pci_dev *pcidev; + int r; + + pcidev = pci_get_device(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA, + NULL); + if (!pdev) + return -ENODEV; + + r = setup_bases(pcidev); + if (r) + return r; + + pm_power_off = xo1_power_off; + + printk(KERN_INFO "OLPC XO-1 support registered\n"); + return 0; +} + +static int __devexit olpc_xo1_remove(struct platform_device *pdev) +{ + pm_power_off = NULL; + return 0; +} + +static struct platform_driver olpc_xo1_driver = { + .driver = { + .name = DRV_NAME, + .owner = THIS_MODULE, + }, + .probe = olpc_xo1_probe, + .remove = __devexit_p(olpc_xo1_remove), +}; + +static int __init olpc_xo1_init(void) +{ + return platform_driver_register(&olpc_xo1_driver); +} + +static void __exit olpc_xo1_exit(void) +{ + platform_driver_unregister(&olpc_xo1_driver); +} + +MODULE_AUTHOR("Daniel Drake "); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:olpc-xo1"); + +module_init(olpc_xo1_init); +module_exit(olpc_xo1_exit); -- cgit v1.2.3-18-g5258 From 5bcd757f93cc713cf61bbeefceda7539d9afca55 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Mon, 4 Oct 2010 14:59:31 -0400 Subject: x86/amd-iommu: Reenable AMD IOMMU if it's mysteriously vanished over suspend AMD's reference BIOS code had a bug that could result in the firmware failing to reenable the iommu on resume. It transpires that this causes certain less than desirable behaviour when it comes to PCI accesses, to whit them ending up somewhere near Bristol when the more desirable outcome was Edinburgh. Sadness ensues, perhaps along with filesystem corruption. Let's make sure that it gets turned back on, and that we restore its configuration so decisions it makes bear some resemblance to those made by reasonable people rather than crack-addled lemurs who spent all your DMA on Thunderbird. 
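Condensed for orientation, the re-enable step works roughly as sketched below; the offsets (index register 0x60, data register 0x64, indirect register 0x75, enable bit 0) come from the patch itself, and nb_dev stands in for the devfn 0 northbridge device on the IOMMU's bus that the patch looks up:

	/* Condensed sketch of the resume quirk; nb_dev is a placeholder name. */
	u32 ioc_feature_control;

	pci_write_config_dword(nb_dev, 0x60, 0x75 | (1 << 7));	/* select NB indirect reg 0x75, allow writes */
	pci_read_config_dword(nb_dev, 0x64, &ioc_feature_control);
	if (!(ioc_feature_control & 0x1))			/* bit 0 enables the IOMMU */
		pci_write_config_dword(nb_dev, 0x64, ioc_feature_control | 1);
	/* ...then restore the IOMMU BAR and the saved l1/l2 indirect registers. */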
Signed-off-by: Matthew Garrett Signed-off-by: Joerg Roedel --- arch/x86/include/asm/amd_iommu_types.h | 21 ++++-- arch/x86/kernel/amd_iommu_init.c | 122 +++++++++++++++++++++++++++++---- 2 files changed, 123 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h index 08616180dea..bdd20c8891e 100644 --- a/arch/x86/include/asm/amd_iommu_types.h +++ b/arch/x86/include/asm/amd_iommu_types.h @@ -416,13 +416,22 @@ struct amd_iommu { struct dma_ops_domain *default_dom; /* - * This array is required to work around a potential BIOS bug. - * The BIOS may miss to restore parts of the PCI configuration - * space when the system resumes from S3. The result is that the - * IOMMU does not execute commands anymore which leads to system - * failure. + * We can't rely on the BIOS to restore all values on reinit, so we + * need to stash them */ - u32 cache_cfg[4]; + + /* The iommu BAR */ + u32 stored_addr_lo; + u32 stored_addr_hi; + + /* + * Each iommu has 6 l1s, each of which is documented as having 0x12 + * registers + */ + u32 stored_l1[6][0x12]; + + /* The l2 indirect registers */ + u32 stored_l2[0x83]; }; /* diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 5a170cbbbed..44710d86a43 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -194,6 +194,39 @@ static inline unsigned long tbl_size(int entry_size) return 1UL << shift; } +/* Access to l1 and l2 indexed register spaces */ + +static u32 iommu_read_l1(struct amd_iommu *iommu, u16 l1, u8 address) +{ + u32 val; + + pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16)); + pci_read_config_dword(iommu->dev, 0xfc, &val); + return val; +} + +static void iommu_write_l1(struct amd_iommu *iommu, u16 l1, u8 address, u32 val) +{ + pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16 | 1 << 31)); + pci_write_config_dword(iommu->dev, 0xfc, val); + pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16)); +} + +static u32 iommu_read_l2(struct amd_iommu *iommu, u8 address) +{ + u32 val; + + pci_write_config_dword(iommu->dev, 0xf0, address); + pci_read_config_dword(iommu->dev, 0xf4, &val); + return val; +} + +static void iommu_write_l2(struct amd_iommu *iommu, u8 address, u32 val) +{ + pci_write_config_dword(iommu->dev, 0xf0, (address | 1 << 8)); + pci_write_config_dword(iommu->dev, 0xf4, val); +} + /**************************************************************************** * * AMD IOMMU MMIO register space handling functions @@ -619,6 +652,7 @@ static void __init init_iommu_from_pci(struct amd_iommu *iommu) { int cap_ptr = iommu->cap_ptr; u32 range, misc; + int i, j; pci_read_config_dword(iommu->dev, cap_ptr + MMIO_CAP_HDR_OFFSET, &iommu->cap); @@ -633,12 +667,29 @@ static void __init init_iommu_from_pci(struct amd_iommu *iommu) MMIO_GET_LD(range)); iommu->evt_msi_num = MMIO_MSI_NUM(misc); - if (is_rd890_iommu(iommu->dev)) { - pci_read_config_dword(iommu->dev, 0xf0, &iommu->cache_cfg[0]); - pci_read_config_dword(iommu->dev, 0xf4, &iommu->cache_cfg[1]); - pci_read_config_dword(iommu->dev, 0xf8, &iommu->cache_cfg[2]); - pci_read_config_dword(iommu->dev, 0xfc, &iommu->cache_cfg[3]); - } + if (!is_rd890_iommu(iommu->dev)) + return; + + /* + * Some rd890 systems may not be fully reconfigured by the BIOS, so + * it's necessary for us to store this information so it can be + * reprogrammed on resume + */ + + pci_read_config_dword(iommu->dev, iommu->cap_ptr + 4, + 
&iommu->stored_addr_lo); + pci_read_config_dword(iommu->dev, iommu->cap_ptr + 8, + &iommu->stored_addr_hi); + + /* Low bit locks writes to configuration space */ + iommu->stored_addr_lo &= ~1; + + for (i = 0; i < 6; i++) + for (j = 0; j < 0x12; j++) + iommu->stored_l1[i][j] = iommu_read_l1(iommu, i, j); + + for (i = 0; i < 0x83; i++) + iommu->stored_l2[i] = iommu_read_l2(iommu, i); } /* @@ -1127,14 +1178,53 @@ static void iommu_init_flags(struct amd_iommu *iommu) iommu_feature_enable(iommu, CONTROL_COHERENT_EN); } -static void iommu_apply_quirks(struct amd_iommu *iommu) +static void iommu_apply_resume_quirks(struct amd_iommu *iommu) { - if (is_rd890_iommu(iommu->dev)) { - pci_write_config_dword(iommu->dev, 0xf0, iommu->cache_cfg[0]); - pci_write_config_dword(iommu->dev, 0xf4, iommu->cache_cfg[1]); - pci_write_config_dword(iommu->dev, 0xf8, iommu->cache_cfg[2]); - pci_write_config_dword(iommu->dev, 0xfc, iommu->cache_cfg[3]); - } + int i, j; + u32 ioc_feature_control; + struct pci_dev *pdev = NULL; + + /* RD890 BIOSes may not have completely reconfigured the iommu */ + if (!is_rd890_iommu(iommu->dev)) + return; + + /* + * First, we need to ensure that the iommu is enabled. This is + * controlled by a register in the northbridge + */ + pdev = pci_get_bus_and_slot(iommu->dev->bus->number, PCI_DEVFN(0, 0)); + + if (!pdev) + return; + + /* Select Northbridge indirect register 0x75 and enable writing */ + pci_write_config_dword(pdev, 0x60, 0x75 | (1 << 7)); + pci_read_config_dword(pdev, 0x64, &ioc_feature_control); + + /* Enable the iommu */ + if (!(ioc_feature_control & 0x1)) + pci_write_config_dword(pdev, 0x64, ioc_feature_control | 1); + + pci_dev_put(pdev); + + /* Restore the iommu BAR */ + pci_write_config_dword(iommu->dev, iommu->cap_ptr + 4, + iommu->stored_addr_lo); + pci_write_config_dword(iommu->dev, iommu->cap_ptr + 8, + iommu->stored_addr_hi); + + /* Restore the l1 indirect regs for each of the 6 l1s */ + for (i = 0; i < 6; i++) + for (j = 0; j < 0x12; j++) + iommu_write_l1(iommu, i, j, iommu->stored_l1[i][j]); + + /* Restore the l2 indirect regs */ + for (i = 0; i < 0x83; i++) + iommu_write_l2(iommu, i, iommu->stored_l2[i]); + + /* Lock PCI setup registers */ + pci_write_config_dword(iommu->dev, iommu->cap_ptr + 4, + iommu->stored_addr_lo | 1); } /* @@ -1147,7 +1237,6 @@ static void enable_iommus(void) for_each_iommu(iommu) { iommu_disable(iommu); - iommu_apply_quirks(iommu); iommu_init_flags(iommu); iommu_set_device_table(iommu); iommu_enable_command_buffer(iommu); @@ -1173,6 +1262,11 @@ static void disable_iommus(void) static int amd_iommu_resume(struct sys_device *dev) { + struct amd_iommu *iommu; + + for_each_iommu(iommu) + iommu_apply_resume_quirks(iommu); + /* re-load the hardware */ enable_iommus(); -- cgit v1.2.3-18-g5258 From 5d0d71569e671239ae0d905ced9b65cd843f99ee Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 13 Oct 2010 11:13:21 +0200 Subject: x86/amd-iommu: Update copyright headers This patch updates the copyright headers in all source files of the AMD IOMMU driver. 
Signed-off-by: Joerg Roedel --- arch/x86/include/asm/amd_iommu.h | 2 +- arch/x86/include/asm/amd_iommu_proto.h | 2 +- arch/x86/include/asm/amd_iommu_types.h | 2 +- arch/x86/kernel/amd_iommu.c | 2 +- arch/x86/kernel/amd_iommu_init.c | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h index 5af2982133b..f16a2caca1e 100644 --- a/arch/x86/include/asm/amd_iommu.h +++ b/arch/x86/include/asm/amd_iommu.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2007-2009 Advanced Micro Devices, Inc. + * Copyright (C) 2007-2010 Advanced Micro Devices, Inc. * Author: Joerg Roedel * Leo Duran * diff --git a/arch/x86/include/asm/amd_iommu_proto.h b/arch/x86/include/asm/amd_iommu_proto.h index cb030374b90..916bc8111a0 100644 --- a/arch/x86/include/asm/amd_iommu_proto.h +++ b/arch/x86/include/asm/amd_iommu_proto.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2009 Advanced Micro Devices, Inc. + * Copyright (C) 2009-2010 Advanced Micro Devices, Inc. * Author: Joerg Roedel * * This program is free software; you can redistribute it and/or modify it diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h index bdd20c8891e..e3509fc303b 100644 --- a/arch/x86/include/asm/amd_iommu_types.h +++ b/arch/x86/include/asm/amd_iommu_types.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2007-2009 Advanced Micro Devices, Inc. + * Copyright (C) 2007-2010 Advanced Micro Devices, Inc. * Author: Joerg Roedel * Leo Duran * diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 679b6450382..d2fdb0826df 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2007-2009 Advanced Micro Devices, Inc. + * Copyright (C) 2007-2010 Advanced Micro Devices, Inc. * Author: Joerg Roedel * Leo Duran * diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 44710d86a43..3cb482e123d 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2007-2009 Advanced Micro Devices, Inc. + * Copyright (C) 2007-2010 Advanced Micro Devices, Inc. * Author: Joerg Roedel * Leo Duran * -- cgit v1.2.3-18-g5258 From 447b1d43ded08c11f4f3d344b4e07e6dcddbef95 Mon Sep 17 00:00:00 2001 From: Daniel Drake Date: Wed, 13 Oct 2010 19:10:42 +0100 Subject: x86, olpc: Register XO-1 platform devices The upcoming XO-1 rfkill driver (for drivers/platform/x86) will register itself with the name "xo1-rfkill", and the already-merged XO-1 poweroff code uses name "olpc-xo1" Add the necessary mechanics so that these devices are properly initialized on XO-1 laptops. Signed-off-by: Daniel Drake LKML-Reference: <20101013181042.90C8F9D401B@zog.reactivated.net> Cc: Matthew Garrett Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/olpc.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c index 74726130259..edaf3fe8dc5 100644 --- a/arch/x86/kernel/olpc.c +++ b/arch/x86/kernel/olpc.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -223,8 +224,25 @@ static bool __init platform_detect(void) return true; } +static int __init add_xo1_platform_devices(void) +{ + struct platform_device *pdev; + + pdev = platform_device_register_simple("xo1-rfkill", -1, NULL, 0); + if (IS_ERR(pdev)) + return PTR_ERR(pdev); + + pdev = platform_device_register_simple("olpc-xo1", -1, NULL, 0); + if (IS_ERR(pdev)) + return PTR_ERR(pdev); + + return 0; +} + static int __init olpc_init(void) { + int r = 0; + if (!olpc_ofw_present() || !platform_detect()) return 0; @@ -251,6 +269,12 @@ static int __init olpc_init(void) olpc_platform_info.boardrev >> 4, olpc_platform_info.ecver); + if (olpc_platform_info.boardrev < olpc_board_pre(0xd0)) { /* XO-1 */ + r = add_xo1_platform_devices(); + if (r) + return r; + } + return 0; } -- cgit v1.2.3-18-g5258 From d7acb92fea932ad2e7846480aeacddc2c03c8485 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 13 Oct 2010 16:00:29 -0700 Subject: x86-64, asm: If the assembler supports fxsave64, use it Kbuild allows for us to probe for the existence of specific constructs in the assembler, use them to find out if we can use fxsave64 and permit the compiler to generate better code. Signed-off-by: H. Peter Anvin --- arch/x86/Makefile | 8 ++++++-- arch/x86/include/asm/i387.h | 2 +- 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 8aa1b59b907..40668a96ca0 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -96,8 +96,12 @@ cfi := $(call as-instr,.cfi_startproc\n.cfi_rel_offset $(sp-y)$(comma)0\n.cfi_en # is .cfi_signal_frame supported too? cfi-sigframe := $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1) cfi-sections := $(call as-instr,.cfi_sections .debug_frame,-DCONFIG_AS_CFI_SECTIONS=1) -KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) -KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) + +# does binutils support specific instructions? +asinstr := $(call as-instr,fxsaveq (%rax),-DCONFIG_AS_FXSAVEQ=1) + +KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) +KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) LDFLAGS := -m elf_$(UTS_MACHINE) diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index a73a8d5a5e6..70f105b352e 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -175,7 +175,7 @@ static inline void fpu_fxsave(struct fpu *fpu) uses any extended registers for addressing, a second REX prefix will be generated (to the assembler, rex64 followed by semicolon is a separate instruction), and hence the 64-bitness is lost. */ -#if 0 +#ifdef CONFIG_AS_FXSAVEQ /* Using "fxsaveq %0" would be the ideal choice, but is only supported starting with gas 2.16. */ __asm__ __volatile__("fxsaveq %0" -- cgit v1.2.3-18-g5258 From fef5ba797991f9335bcfc295942b684f9bf613a1 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 13 Oct 2010 16:02:24 -0700 Subject: xen: Cope with unmapped pages when initializing kernel pagetable Xen requires that all pages containing pagetable entries to be mapped read-only. 
If pages used for the initial pagetable are already mapped then we can change the mapping to RO. However, if they are initially unmapped, we need to make sure that when they are later mapped, they are also mapped RO. We do this by knowing that the kernel pagetable memory is pre-allocated in the range e820_table_start - e820_table_end, so any pfn within this range should be mapped read-only. However, the pagetable setup code early_ioremaps the pages to write their entries, so we must make sure that mappings created in the early_ioremap fixmap area are mapped RW. (Those mappings are removed before the pages are presented to Xen as pagetable pages.) Signed-off-by: Jeremy Fitzhardinge LKML-Reference: <4CB63A80.8060702@goop.org> Cc: Yinghai Lu Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/io.h | 1 + arch/x86/mm/ioremap.c | 5 +++++ arch/x86/xen/mmu.c | 26 ++++++++++++++++++-------- 3 files changed, 24 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 30a3e977612..66aee6c4123 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -348,6 +348,7 @@ extern void __iomem *early_memremap(resource_size_t phys_addr, unsigned long size); extern void early_iounmap(void __iomem *addr, unsigned long size); extern void fixup_early_ioremap(void); +extern bool is_early_ioremap_ptep(pte_t *ptep); #define IO_SPACE_LIMIT 0xffff diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 3ba6e0608c5..0369843511d 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -362,6 +362,11 @@ static inline pte_t * __init early_ioremap_pte(unsigned long addr) return &bm_pte[pte_index(addr)]; } +bool __init is_early_ioremap_ptep(pte_t *ptep) +{ + return ptep >= &bm_pte[0] && ptep < &bm_pte[PAGE_SIZE/sizeof(pte_t)]; +} + static unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata; void __init early_ioremap_init(void) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 4fe04ac0bae..7d55e9ee3a7 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -56,6 +56,7 @@ #include #include #include +#include #include #include @@ -360,7 +361,8 @@ void make_lowmem_page_readonly(void *vaddr) unsigned int level; pte = lookup_address(address, &level); - BUG_ON(pte == NULL); + if (pte == NULL) + return; /* vaddr missing */ ptev = pte_wrprotect(*pte); @@ -375,7 +377,8 @@ void make_lowmem_page_readwrite(void *vaddr) unsigned int level; pte = lookup_address(address, &level); - BUG_ON(pte == NULL); + if (pte == NULL) + return; /* vaddr missing */ ptev = pte_mkwrite(*pte); @@ -1509,13 +1512,25 @@ static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd) #endif } -#ifdef CONFIG_X86_32 static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) { + unsigned long pfn = pte_pfn(pte); + +#ifdef CONFIG_X86_32 /* If there's an existing pte, then don't allow _PAGE_RW to be set */ if (pte_val_ma(*ptep) & _PAGE_PRESENT) pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) & pte_val_ma(pte)); +#endif + + /* + * If the new pfn is within the range of the newly allocated + * kernel pagetable, and it isn't being mapped into an + * early_ioremap fixmap slot, make sure it is RO. 
+ */ + if (!is_early_ioremap_ptep(ptep) && + pfn >= e820_table_start && pfn < e820_table_end) + pte = pte_wrprotect(pte); return pte; } @@ -1528,7 +1543,6 @@ static __init void xen_set_pte_init(pte_t *ptep, pte_t pte) xen_set_pte(ptep, pte); } -#endif static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn) { @@ -1973,11 +1987,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { .alloc_pmd_clone = paravirt_nop, .release_pmd = xen_release_pmd_init, -#ifdef CONFIG_X86_64 - .set_pte = xen_set_pte, -#else .set_pte = xen_set_pte_init, -#endif .set_pte_at = xen_set_pte_at, .set_pmd = xen_set_pmd_hyper, -- cgit v1.2.3-18-g5258 From 03f1a17cd5c69deccd3cfe1b954b9426d7a686e3 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 13 Oct 2010 21:00:23 -0700 Subject: x86/vsmp: Eliminate kconfig dependency warning Fix kconfig dependency warning to satisfy dependencies: warning: (X86_VSMP && X86_64 && PCI && X86_EXTENDED_PLATFORM || XEN && PARAVIRT_GUEST && (X86_64 || X86_32 && X86_PAE && !X86_VISWS) && X86_CMPXCHG && X86_TSC || KVM_CLOCK && PARAVIRT_GUEST || KVM_GUEST && PARAVIRT_GUEST || LGUEST_GUEST && PARAVIRT_GUEST && X86_32) selects PARAVIRT which has unmet direct dependencies (PARAVIRT_GUEST) Signed-off-by: Randy Dunlap Cc: Ravikiran Thirumalai LKML-Reference: <20101013210023.9a033222.randy.dunlap@oracle.com> Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cea0cd9a316..64e817ec7ff 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -372,6 +372,7 @@ endif config X86_VSMP bool "ScaleMP vSMP" + select PARAVIRT_GUEST select PARAVIRT depends on X86_64 && PCI depends on X86_EXTENDED_PLATFORM -- cgit v1.2.3-18-g5258 From 3caa37519ccbb200c7fbbf6ff4fb306a30f29425 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 14 Oct 2010 12:10:36 +0900 Subject: x86: Use __stop_machine() in text_poke_smp() Use __stop_machine() in text_poke_smp() because the caller must get online_cpus before calling text_poke_smp(), but stop_machine() do it again. We don't need it. Signed-off-by: Masami Hiramatsu Cc: Rusty Russell Cc: Ananth N Mavinakayanahalli Cc: 2nddept-manager@sdl.hitachi.co.jp Cc: Mathieu Desnoyers LKML-Reference: <20101014031036.4100.83989.stgit@ltc236.sdl.hitachi.co.jp> Signed-off-by: Ingo Molnar --- arch/x86/kernel/alternative.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index cb0e6d385f6..a36bb90aef5 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -637,7 +637,8 @@ void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len) tpp.len = len; atomic_set(&stop_machine_first, 1); wrote_text = 0; - stop_machine(stop_machine_text_poke, (void *)&tpp, NULL); + /* Use __stop_machine() because the caller already got online_cpus. */ + __stop_machine(stop_machine_text_poke, (void *)&tpp, NULL); return addr; } -- cgit v1.2.3-18-g5258 From 3cba11d32bb4b24c3ba257043595772df4b9c7b5 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 14 Oct 2010 12:10:42 +0900 Subject: kconfig/x86: Add HAVE_TEXT_POKE_SMP config for stop_machine dependency Since the text_poke_smp() definately depends on actual stop_machine() on smp, add that dependency to Kconfig. 
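For context, the calling convention these text_poke_smp() changes rely on looks roughly like the sketch below (illustrative only; addr, new_opcode and len are placeholder names):

	/* Illustrative calling convention: the caller already holds the cpu hotplug
	 * lock, which is why text_poke_smp() can use __stop_machine() directly. */
	get_online_cpus();
	text_poke_smp(addr, new_opcode, len);
	put_online_cpus();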
Signed-off-by: Masami Hiramatsu Cc: Rusty Russell Cc: Ananth N Mavinakayanahalli Cc: 2nddept-manager@sdl.hitachi.co.jp Cc: Mathieu Desnoyers LKML-Reference: <20101014031042.4100.90877.stgit@ltc236.sdl.hitachi.co.jp> Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index b431a0824a9..c14d8b4d2f7 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -60,6 +60,7 @@ config X86 select HAVE_ARCH_KMEMCHECK select HAVE_USER_RETURN_NOTIFIER select HAVE_ARCH_JUMP_LABEL + select HAVE_TEXT_POKE_SMP config INSTRUCTION_DECODER def_bool (KPROBES || PERF_EVENTS) @@ -2126,6 +2127,10 @@ config HAVE_ATOMIC_IOMAP def_bool y depends on X86_32 +config HAVE_TEXT_POKE_SMP + bool + select STOP_MACHINE if SMP + source "net/Kconfig" source "drivers/Kconfig" -- cgit v1.2.3-18-g5258 From 67e87f0a1c5cbc750f81ebf6a128e8ff6f4376cc Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 13 Oct 2010 16:34:15 -0700 Subject: x86-64: Only set max_pfn_mapped to 512 MiB if we enter via head_64.S head_64.S maps up to 512 MiB, but that is not necessarity true for other entry paths, such as Xen. Thus, co-locate the setting of max_pfn_mapped with the code to actually set up the page tables in head_64.S. The 32-bit code is already so co-located. (The Xen code already sets max_pfn_mapped correctly for its own use case.) -v2: Yinghai fixed the following bug in this patch: | | max_pfn_mapped is in .bss section, so we need to set that | after bss get cleared. Without that we crash on bootup. | | That is safe because Xen does not call x86_64_start_kernel(). | Signed-off-by: Jeremy Fitzhardinge Fixed-by: Yinghai Lu Signed-off-by: H. Peter Anvin LKML-Reference: <4CB6AB24.9020504@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/head64.c | 2 ++ arch/x86/kernel/setup.c | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 97adf9828b9..2d2673c28af 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -80,6 +80,8 @@ void __init x86_64_start_kernel(char * real_mode_data) /* Cleanup the over mapped high alias */ cleanup_highmap(); + max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT; + for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) { #ifdef CONFIG_EARLY_PRINTK set_intr_gate(i, &early_idt_handlers[i]); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index b11a238b2e3..c3cebfe7bfc 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -932,7 +932,6 @@ void __init setup_arch(char **cmdline_p) max_low_pfn = max_pfn; high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1; - max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT; #endif /* -- cgit v1.2.3-18-g5258 From ebc8827f75954fe315492883eee5cb3f355d547d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 27 Sep 2010 18:50:51 +0200 Subject: x86: Barf when vmalloc and kmemcheck faults happen in NMI In x86, faults exit by executing the iret instruction, which then reenables NMIs if we faulted in NMI context. Then if a fault happens in NMI, another NMI can nest after the fault exits. But we don't yet support nested NMIs because we have only one NMI stack. To prevent from that, check that vmalloc and kmemcheck faults don't happen in this context. Most of the other kernel faults in NMIs can be more easily spotted by finding explicit copy_from,to_user() calls on review. 
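As an illustration of the review pattern that last sentence describes, the fragment below is entirely hypothetical (buf and ubuf are made-up names):

	char buf[64];

	/* Hypothetical example of what to spot on review: an explicit user copy
	 * reachable from NMI context.  If it faults, the fault exits via iret and
	 * re-enables NMIs while the first NMI still owns the single NMI stack. */
	if (copy_from_user(buf, ubuf, sizeof(buf)))
		return -EFAULT;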
Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Thomas Gleixner Cc: H. Peter Anvin Cc: Mathieu Desnoyers Cc: Peter Zijlstra --- arch/x86/mm/fault.c | 4 ++++ arch/x86/mm/kmemcheck/kmemcheck.c | 2 ++ 2 files changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 4c4508e8a20..a24c6cfdccc 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -251,6 +251,8 @@ static noinline __kprobes int vmalloc_fault(unsigned long address) if (!(address >= VMALLOC_START && address < VMALLOC_END)) return -1; + WARN_ON_ONCE(in_nmi()); + /* * Synchronize this task's top level page-table * with the 'reference' page table. @@ -369,6 +371,8 @@ static noinline __kprobes int vmalloc_fault(unsigned long address) if (!(address >= VMALLOC_START && address < VMALLOC_END)) return -1; + WARN_ON_ONCE(in_nmi()); + /* * Copy kernel mappings over when needed. This can also * happen within a race in page table update. In the later diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c index b3b531a4f8e..d87dd6d042d 100644 --- a/arch/x86/mm/kmemcheck/kmemcheck.c +++ b/arch/x86/mm/kmemcheck/kmemcheck.c @@ -631,6 +631,8 @@ bool kmemcheck_fault(struct pt_regs *regs, unsigned long address, if (!pte) return false; + WARN_ON_ONCE(in_nmi()); + if (error_code & 2) kmemcheck_access(regs, address, KMEMCHECK_WRITE); else -- cgit v1.2.3-18-g5258 From 72441cb1fd77d092f09ddfac748955703884c9a7 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 13 Oct 2010 17:12:30 -0400 Subject: ftrace/x86: Add support for C version of recordmcount This patch adds the support for the C version of recordmcount and compile times show ~ 12% improvement. After verifying this works, other archs can add: HAVE_C_MCOUNT_RECORD in its Kconfig and it will use the C version of recordmcount instead of the perl version. Cc: Cc: Michal Marek Cc: linux-kbuild@vger.kernel.org Cc: John Reiser Signed-off-by: Steven Rostedt --- arch/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index c14d8b4d2f7..788b50ef5fc 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -33,6 +33,7 @@ config X86 select HAVE_KRETPROBES select HAVE_OPTPROBES select HAVE_FTRACE_MCOUNT_RECORD + select HAVE_C_MCOUNT_RECORD select HAVE_DYNAMIC_FTRACE select HAVE_FUNCTION_TRACER select HAVE_FUNCTION_GRAPH_TRACER -- cgit v1.2.3-18-g5258 From cf4db2597ae93b60efc0a7a4ec08690b75d629b1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 14 Oct 2010 23:32:44 -0400 Subject: ftrace: Rename config option HAVE_C_MCOUNT_RECORD to HAVE_C_RECORDMCOUNT The config option used by archs to let the build system know that the C version of the recordmcount works for said arch is currently called HAVE_C_MCOUNT_RECORD which enables BUILD_C_RECORDMCOUNT. To be more consistent with the name that all archs may use, it has been renamed to HAVE_C_RECORDMCOUNT. This will be less confusing since we are building a C recordmcount and not a mcount_record. 
Suggested-by: Ingo Molnar Cc: Cc: Michal Marek Cc: linux-kbuild@vger.kernel.org Cc: John Reiser Signed-off-by: Steven Rostedt --- arch/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 788b50ef5fc..9815221976a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -33,7 +33,7 @@ config X86 select HAVE_KRETPROBES select HAVE_OPTPROBES select HAVE_FTRACE_MCOUNT_RECORD - select HAVE_C_MCOUNT_RECORD + select HAVE_C_RECORDMCOUNT select HAVE_DYNAMIC_FTRACE select HAVE_FUNCTION_TRACER select HAVE_FUNCTION_GRAPH_TRACER -- cgit v1.2.3-18-g5258 From 9e9006e9090dc1f88d5127cb69f416013e7ecd60 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 14 Oct 2010 10:13:13 -0700 Subject: x86, olpc: XO-1 uses/depends on PCI olpc-xo1 uses pci_*() interfaces so it should depend on PCI. Otherwise we get build failure like: arch/x86/kernel/olpc-xo1.c:65: error: implicit declaration of function 'pci_enable_device_io' arch/x86/kernel/olpc-xo1.c:71: error: implicit declaration of function 'pci_request_region' arch/x86/kernel/olpc-xo1.c:80: error: implicit declaration of function 'pci_release_region' Signed-off-by: Randy Dunlap Acked-by: Daniel Drake Cc: Stephen Rothwell LKML-Reference: <20101014101313.adf7eb2a.randy.dunlap@oracle.com> Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 0bcb8765b1c..c010b8d8271 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2068,7 +2068,7 @@ config OLPC config OLPC_XO1 tristate "OLPC XO-1 support" - depends on OLPC + depends on OLPC && PCI ---help--- Add support for non-essential features of the OLPC XO-1 laptop. -- cgit v1.2.3-18-g5258 From 3acbf0849bcbb639fde53dc627e3b55a4c6429d2 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 31 Aug 2010 10:44:17 +0200 Subject: oprofile, x86: Add support for AMD family 12h This patch adds support for AMD family 12h (Llano) cpus. Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index bd1489c3ce0..0b0d1d62820 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -726,6 +726,9 @@ int __init op_nmi_init(struct oprofile_operations *ops) case 0x11: cpu_type = "x86-64/family11h"; break; + case 0x12: + cpu_type = "x86-64/family12h"; + break; default: return -ENODEV; } -- cgit v1.2.3-18-g5258 From e63414740e15b4e2dc54c63fb9ea501b257fb0b5 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 26 Aug 2010 12:30:17 +0200 Subject: oprofile, x86: Add support for AMD family 14h This patch adds support for AMD family 14h (Ontario/Zacate) cpus. 
Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 0b0d1d62820..4e8baad36d3 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -729,6 +729,9 @@ int __init op_nmi_init(struct oprofile_operations *ops) case 0x12: cpu_type = "x86-64/family12h"; break; + case 0x14: + cpu_type = "x86-64/family14h"; + break; default: return -ENODEV; } -- cgit v1.2.3-18-g5258 From 4ac945f002c0bebdeb530cbc3729e22895e64a7e Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 21 Sep 2010 15:58:32 +0200 Subject: oprofile, x86: Check IBS capability bits 1 and 2 There are IBS CPUID feature flags in CPUID Fn8000_001B to detect if the cpu supports IBS fetch sampling (FetchSam) and/or IBS execution sampling (OpSam). This patch adds checks if the both features are available. Spec: http://support.amd.com/us/Processor_TechDocs/31116.pdf Signed-off-by: Robert Richter --- arch/x86/oprofile/op_model_amd.c | 59 ++++++++++++++++++++++++++-------------- 1 file changed, 38 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index b67a6b5aa8d..96852d5480e 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -70,9 +70,22 @@ static u64 ibs_op_ctl; * Same bit mask as for IBS cpuid feature flags (Fn8000_001B_EAX), but * bit 0 is used to indicate the existence of IBS. */ -#define IBS_CAPS_AVAIL (1LL<<0) -#define IBS_CAPS_RDWROPCNT (1LL<<3) -#define IBS_CAPS_OPCNT (1LL<<4) +#define IBS_CAPS_AVAIL (1U<<0) +#define IBS_CAPS_FETCHSAM (1U<<1) +#define IBS_CAPS_OPSAM (1U<<2) +#define IBS_CAPS_RDWROPCNT (1U<<3) +#define IBS_CAPS_OPCNT (1U<<4) + +#define IBS_CAPS_DEFAULT (IBS_CAPS_AVAIL \ + | IBS_CAPS_FETCHSAM \ + | IBS_CAPS_OPSAM) + +/* + * IBS APIC setup + */ +#define IBSCTL 0x1cc +#define IBSCTL_LVT_OFFSET_VALID (1ULL<<8) +#define IBSCTL_LVT_OFFSET_MASK 0x0F /* * IBS randomization macros @@ -92,12 +105,12 @@ static u32 get_ibs_caps(void) /* check IBS cpuid feature flags */ max_level = cpuid_eax(0x80000000); if (max_level < IBS_CPUID_FEATURES) - return IBS_CAPS_AVAIL; + return IBS_CAPS_DEFAULT; ibs_caps = cpuid_eax(IBS_CPUID_FEATURES); if (!(ibs_caps & IBS_CAPS_AVAIL)) /* cpuid flags not valid */ - return IBS_CAPS_AVAIL; + return IBS_CAPS_DEFAULT; return ibs_caps; } @@ -527,22 +540,26 @@ static int setup_ibs_files(struct super_block *sb, struct dentry *root) ibs_config.op_enabled = 0; ibs_config.dispatched_ops = 0; - dir = oprofilefs_mkdir(sb, root, "ibs_fetch"); - oprofilefs_create_ulong(sb, dir, "enable", - &ibs_config.fetch_enabled); - oprofilefs_create_ulong(sb, dir, "max_count", - &ibs_config.max_cnt_fetch); - oprofilefs_create_ulong(sb, dir, "rand_enable", - &ibs_config.rand_en); - - dir = oprofilefs_mkdir(sb, root, "ibs_op"); - oprofilefs_create_ulong(sb, dir, "enable", - &ibs_config.op_enabled); - oprofilefs_create_ulong(sb, dir, "max_count", - &ibs_config.max_cnt_op); - if (ibs_caps & IBS_CAPS_OPCNT) - oprofilefs_create_ulong(sb, dir, "dispatched_ops", - &ibs_config.dispatched_ops); + if (ibs_caps & IBS_CAPS_FETCHSAM) { + dir = oprofilefs_mkdir(sb, root, "ibs_fetch"); + oprofilefs_create_ulong(sb, dir, "enable", + &ibs_config.fetch_enabled); + oprofilefs_create_ulong(sb, dir, "max_count", + &ibs_config.max_cnt_fetch); + oprofilefs_create_ulong(sb, dir, "rand_enable", + &ibs_config.rand_en); + } + + if (ibs_caps & IBS_CAPS_OPSAM) { + dir = 
oprofilefs_mkdir(sb, root, "ibs_op"); + oprofilefs_create_ulong(sb, dir, "enable", + &ibs_config.op_enabled); + oprofilefs_create_ulong(sb, dir, "max_count", + &ibs_config.max_cnt_op); + if (ibs_caps & IBS_CAPS_OPCNT) + oprofilefs_create_ulong(sb, dir, "dispatched_ops", + &ibs_config.dispatched_ops); + } return 0; } -- cgit v1.2.3-18-g5258 From fc889aa23f4767c1c3f77fce11e17bb0a638971f Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 21 Sep 2010 18:09:00 +0200 Subject: oprofile, x86: Remove duplicate check for IBS_CAPS_OPCNT Since oprofile is setting up ibs_op/dispatched_ops in the fs only if the feature is available, its corresponding variable ibs_config.dispatched_ops is only set, if the feature is available. Thus the check is duplicate and can be removed. Signed-off-by: Robert Richter --- arch/x86/oprofile/op_model_amd.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 96852d5480e..d5e9dab71be 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -257,8 +257,7 @@ static inline void op_amd_start_ibs(void) ibs_op_ctl = min(ibs_op_ctl + IBS_RANDOM_MAXCNT_OFFSET, IBS_OP_MAX_CNT); } - if (ibs_caps & IBS_CAPS_OPCNT && ibs_config.dispatched_ops) - ibs_op_ctl |= IBS_OP_CNT_CTL; + ibs_op_ctl |= ibs_config.dispatched_ops ? IBS_OP_CNT_CTL : 0; ibs_op_ctl |= IBS_OP_ENABLE; val = op_amd_randomize_ibs_op(ibs_op_ctl); wrmsrl(MSR_AMD64_IBSOPCTL, val); -- cgit v1.2.3-18-g5258 From 53b39e9480ef8a286cef9899c455a979acd0eed9 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 21 Sep 2010 17:58:15 +0200 Subject: oprofile, x86: Introduce struct ibs_state This patch introduces struct ibs_state that will extended by additinal members in follow-on patches. Signed-off-by: Robert Richter --- arch/x86/oprofile/op_model_amd.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index d5e9dab71be..9d450973677 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -48,7 +48,7 @@ static unsigned long reset_value[NUM_VIRT_COUNTERS]; static u32 ibs_caps; -struct op_ibs_config { +struct ibs_config { unsigned long op_enabled; unsigned long fetch_enabled; unsigned long max_cnt_fetch; @@ -57,8 +57,12 @@ struct op_ibs_config { unsigned long dispatched_ops; }; -static struct op_ibs_config ibs_config; -static u64 ibs_op_ctl; +struct ibs_state { + u64 ibs_op_ctl; +}; + +static struct ibs_config ibs_config; +static struct ibs_state ibs_state; /* * IBS cpuid feature detection @@ -219,7 +223,7 @@ op_amd_handle_ibs(struct pt_regs * const regs, oprofile_write_commit(&entry); /* reenable the IRQ */ - ctl = op_amd_randomize_ibs_op(ibs_op_ctl); + ctl = op_amd_randomize_ibs_op(ibs_state.ibs_op_ctl); wrmsrl(MSR_AMD64_IBSOPCTL, ctl); } } @@ -232,6 +236,8 @@ static inline void op_amd_start_ibs(void) if (!ibs_caps) return; + memset(&ibs_state, 0, sizeof(ibs_state)); + if (ibs_config.fetch_enabled) { val = (ibs_config.max_cnt_fetch >> 4) & IBS_FETCH_MAX_CNT; val |= ibs_config.rand_en ? IBS_FETCH_RAND_EN : 0; @@ -240,13 +246,13 @@ static inline void op_amd_start_ibs(void) } if (ibs_config.op_enabled) { - ibs_op_ctl = ibs_config.max_cnt_op >> 4; + val = ibs_config.max_cnt_op >> 4; if (!(ibs_caps & IBS_CAPS_RDWROPCNT)) { /* * IbsOpCurCnt not supported. See * op_amd_randomize_ibs_op() for details. 
*/ - ibs_op_ctl = clamp(ibs_op_ctl, 0x0081ULL, 0xFF80ULL); + val = clamp(val, 0x0081ULL, 0xFF80ULL); } else { /* * The start value is randomized with a @@ -254,12 +260,13 @@ static inline void op_amd_start_ibs(void) * with the half of the randomized range. Also * avoid underflows. */ - ibs_op_ctl = min(ibs_op_ctl + IBS_RANDOM_MAXCNT_OFFSET, - IBS_OP_MAX_CNT); + val = min(val + IBS_RANDOM_MAXCNT_OFFSET, + IBS_OP_MAX_CNT); } - ibs_op_ctl |= ibs_config.dispatched_ops ? IBS_OP_CNT_CTL : 0; - ibs_op_ctl |= IBS_OP_ENABLE; - val = op_amd_randomize_ibs_op(ibs_op_ctl); + val |= ibs_config.dispatched_ops ? IBS_OP_CNT_CTL : 0; + val |= IBS_OP_ENABLE; + ibs_state.ibs_op_ctl = val; + val = op_amd_randomize_ibs_op(ibs_state.ibs_op_ctl); wrmsrl(MSR_AMD64_IBSOPCTL, val); } } -- cgit v1.2.3-18-g5258 From 25da6950475becb35d7a3bb3b5fbdc715a76887e Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 21 Sep 2010 15:49:31 +0200 Subject: oprofile, x86: Add support for IBS branch target address reporting This patch adds support for IBS branch target address reporting. A new MSR (MSRC001_103B IBS Branch Target Address) has been added that provides the logical address in canonical form for the branch target. The size of the IBS sample that is transferred to the userland has been increased. For backward compatibility, the userland daemon must explicit enable the feature by writing to the oprofilefs file ibs_op/branch_target After enabling branch target address reporting, the userland daemon must handle the extended size of the IBS sample. Signed-off-by: Robert Richter --- arch/x86/include/asm/msr-index.h | 1 + arch/x86/oprofile/op_model_amd.c | 26 ++++++++++++++++++++------ 2 files changed, 21 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 986f7790fdb..91ba8e6b630 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -121,6 +121,7 @@ #define MSR_AMD64_IBSDCLINAD 0xc0011038 #define MSR_AMD64_IBSDCPHYSAD 0xc0011039 #define MSR_AMD64_IBSCTL 0xc001103a +#define MSR_AMD64_IBSBRTARGET 0xc001103b /* Fam 10h MSRs */ #define MSR_FAM10H_MMIO_CONF_BASE 0xc0010058 diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 9d450973677..9de33fa9531 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -55,10 +55,13 @@ struct ibs_config { unsigned long max_cnt_op; unsigned long rand_en; unsigned long dispatched_ops; + unsigned long branch_target; }; struct ibs_state { - u64 ibs_op_ctl; + u64 ibs_op_ctl; + int branch_target; + unsigned long sample_size; }; static struct ibs_config ibs_config; @@ -79,6 +82,7 @@ static struct ibs_state ibs_state; #define IBS_CAPS_OPSAM (1U<<2) #define IBS_CAPS_RDWROPCNT (1U<<3) #define IBS_CAPS_OPCNT (1U<<4) +#define IBS_CAPS_BRNTRGT (1U<<5) #define IBS_CAPS_DEFAULT (IBS_CAPS_AVAIL \ | IBS_CAPS_FETCHSAM \ @@ -207,8 +211,8 @@ op_amd_handle_ibs(struct pt_regs * const regs, rdmsrl(MSR_AMD64_IBSOPCTL, ctl); if (ctl & IBS_OP_VAL) { rdmsrl(MSR_AMD64_IBSOPRIP, val); - oprofile_write_reserve(&entry, regs, val, - IBS_OP_CODE, IBS_OP_SIZE); + oprofile_write_reserve(&entry, regs, val, IBS_OP_CODE, + ibs_state.sample_size); oprofile_add_data64(&entry, val); rdmsrl(MSR_AMD64_IBSOPDATA, val); oprofile_add_data64(&entry, val); @@ -220,6 +224,10 @@ op_amd_handle_ibs(struct pt_regs * const regs, oprofile_add_data64(&entry, val); rdmsrl(MSR_AMD64_IBSDCPHYSAD, val); oprofile_add_data64(&entry, val); + if 
(ibs_state.branch_target) { + rdmsrl(MSR_AMD64_IBSBRTARGET, val); + oprofile_add_data(&entry, (unsigned long)val); + } oprofile_write_commit(&entry); /* reenable the IRQ */ @@ -266,6 +274,11 @@ static inline void op_amd_start_ibs(void) val |= ibs_config.dispatched_ops ? IBS_OP_CNT_CTL : 0; val |= IBS_OP_ENABLE; ibs_state.ibs_op_ctl = val; + ibs_state.sample_size = IBS_OP_SIZE; + if (ibs_config.branch_target) { + ibs_state.branch_target = 1; + ibs_state.sample_size++; + } val = op_amd_randomize_ibs_op(ibs_state.ibs_op_ctl); wrmsrl(MSR_AMD64_IBSOPCTL, val); } @@ -540,11 +553,9 @@ static int setup_ibs_files(struct super_block *sb, struct dentry *root) /* model specific files */ /* setup some reasonable defaults */ + memset(&ibs_config, 0, sizeof(ibs_config)); ibs_config.max_cnt_fetch = 250000; - ibs_config.fetch_enabled = 0; ibs_config.max_cnt_op = 250000; - ibs_config.op_enabled = 0; - ibs_config.dispatched_ops = 0; if (ibs_caps & IBS_CAPS_FETCHSAM) { dir = oprofilefs_mkdir(sb, root, "ibs_fetch"); @@ -565,6 +576,9 @@ static int setup_ibs_files(struct super_block *sb, struct dentry *root) if (ibs_caps & IBS_CAPS_OPCNT) oprofilefs_create_ulong(sb, dir, "dispatched_ops", &ibs_config.dispatched_ops); + if (ibs_caps & IBS_CAPS_BRNTRGT) + oprofilefs_create_ulong(sb, dir, "branch_target", + &ibs_config.branch_target); } return 0; -- cgit v1.2.3-18-g5258 From b47fad3bfb5940cc3e28a1c69716f6dc44e4b7e6 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 22 Sep 2010 17:45:39 +0200 Subject: oprofile, x86: Add support for IBS periodic op counter extension The count value for IBS op sampling has been extended by 7 bits. The feature is reflected in bit 6 (OpCntExt) of the IBS capability register (CPUID Fn8000_001B_EAX). Signed-off-by: Robert Richter --- arch/x86/include/asm/perf_event.h | 19 ++++++++++--------- arch/x86/oprofile/op_model_amd.c | 22 +++++++++++++++++++--- 2 files changed, 29 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 6e742cc4251..550e26b1dbb 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -111,17 +111,18 @@ union cpuid10_edx { #define X86_PMC_IDX_FIXED_BTS (X86_PMC_IDX_FIXED + 16) /* IbsFetchCtl bits/masks */ -#define IBS_FETCH_RAND_EN (1ULL<<57) -#define IBS_FETCH_VAL (1ULL<<49) -#define IBS_FETCH_ENABLE (1ULL<<48) -#define IBS_FETCH_CNT 0xFFFF0000ULL -#define IBS_FETCH_MAX_CNT 0x0000FFFFULL +#define IBS_FETCH_RAND_EN (1ULL<<57) +#define IBS_FETCH_VAL (1ULL<<49) +#define IBS_FETCH_ENABLE (1ULL<<48) +#define IBS_FETCH_CNT 0xFFFF0000ULL +#define IBS_FETCH_MAX_CNT 0x0000FFFFULL /* IbsOpCtl bits */ -#define IBS_OP_CNT_CTL (1ULL<<19) -#define IBS_OP_VAL (1ULL<<18) -#define IBS_OP_ENABLE (1ULL<<17) -#define IBS_OP_MAX_CNT 0x0000FFFFULL +#define IBS_OP_CNT_CTL (1ULL<<19) +#define IBS_OP_VAL (1ULL<<18) +#define IBS_OP_ENABLE (1ULL<<17) +#define IBS_OP_MAX_CNT 0x0000FFFFULL +#define IBS_OP_MAX_CNT_EXT 0x007FFFFFULL /* not a register bit mask */ #ifdef CONFIG_PERF_EVENTS extern void init_hw_perf_events(void); diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 9de33fa9531..65f0a1eb6b8 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -83,6 +83,7 @@ static struct ibs_state ibs_state; #define IBS_CAPS_RDWROPCNT (1U<<3) #define IBS_CAPS_OPCNT (1U<<4) #define IBS_CAPS_BRNTRGT (1U<<5) +#define IBS_CAPS_OPCNTEXT (1U<<6) #define IBS_CAPS_DEFAULT (IBS_CAPS_AVAIL \ | IBS_CAPS_FETCHSAM \ @@ 
-246,8 +247,16 @@ static inline void op_amd_start_ibs(void) memset(&ibs_state, 0, sizeof(ibs_state)); + /* + * Note: Since the max count settings may out of range we + * write back the actual used values so that userland can read + * it. + */ + if (ibs_config.fetch_enabled) { - val = (ibs_config.max_cnt_fetch >> 4) & IBS_FETCH_MAX_CNT; + val = ibs_config.max_cnt_fetch >> 4; + val = min(val, IBS_FETCH_MAX_CNT); + ibs_config.max_cnt_fetch = val << 4; val |= ibs_config.rand_en ? IBS_FETCH_RAND_EN : 0; val |= IBS_FETCH_ENABLE; wrmsrl(MSR_AMD64_IBSFETCHCTL, val); @@ -261,6 +270,7 @@ static inline void op_amd_start_ibs(void) * op_amd_randomize_ibs_op() for details. */ val = clamp(val, 0x0081ULL, 0xFF80ULL); + ibs_config.max_cnt_op = val << 4; } else { /* * The start value is randomized with a @@ -268,9 +278,15 @@ static inline void op_amd_start_ibs(void) * with the half of the randomized range. Also * avoid underflows. */ - val = min(val + IBS_RANDOM_MAXCNT_OFFSET, - IBS_OP_MAX_CNT); + val += IBS_RANDOM_MAXCNT_OFFSET; + if (ibs_caps & IBS_CAPS_OPCNTEXT) + val = min(val, IBS_OP_MAX_CNT_EXT); + else + val = min(val, IBS_OP_MAX_CNT); + ibs_config.max_cnt_op = + (val - IBS_RANDOM_MAXCNT_OFFSET) << 4; } + val = ((val & ~IBS_OP_MAX_CNT) << 4) | (val & IBS_OP_MAX_CNT); val |= ibs_config.dispatched_ops ? IBS_OP_CNT_CTL : 0; val |= IBS_OP_ENABLE; ibs_state.ibs_op_ctl = val; -- cgit v1.2.3-18-g5258 From 6038f373a3dc1f1c26496e60b6c40b164716f07e Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Sun, 15 Aug 2010 18:52:59 +0200 Subject: llseek: automatically add .llseek fop All file_operations should get a .llseek operation so we can make nonseekable_open the default for future file operations without a .llseek pointer. The three cases that we can automatically detect are no_llseek, seq_lseek and default_llseek. For cases where we can we can automatically prove that the file offset is always ignored, we use noop_llseek, which maintains the current behavior of not returning an error from a seek. New drivers should normally not use noop_llseek but instead use no_llseek and call nonseekable_open at open time. Existing drivers can be converted to do the same when the maintainer knows for certain that no user code relies on calling seek on the device file. The generated code is often incorrectly indented and right now contains comments that clarify for each added line why a specific variant was chosen. In the version that gets submitted upstream, the comments will be gone and I will manually fix the indentation, because there does not seem to be a way to do that using coccinelle. Some amount of new code is currently sitting in linux-next that should get the same modifications, which I will do at the end of the merge window. Many thanks to Julia Lawall for helping me learn to write a semantic patch that does all this. ===== begin semantic patch ===== // This adds an llseek= method to all file operations, // as a preparation for making no_llseek the default. // // The rules are // - use no_llseek explicitly if we do nonseekable_open // - use seq_lseek for sequential files // - use default_llseek if we know we access f_pos // - use noop_llseek if we know we don't access f_pos, // but we still want to allow users to call lseek // @ open1 exists @ identifier nested_open; @@ nested_open(...) { <+... nonseekable_open(...) ...+> } @ open exists@ identifier open_f; identifier i, f; identifier open1.nested_open; @@ int open_f(struct inode *i, struct file *f) { <+... ( nonseekable_open(...) | nested_open(...) 
) ...+> } @ read disable optional_qualifier exists @ identifier read_f; identifier f, p, s, off; type ssize_t, size_t, loff_t; expression E; identifier func; @@ ssize_t read_f(struct file *f, char *p, size_t s, loff_t *off) { <+... ( *off = E | *off += E | func(..., off, ...) | E = *off ) ...+> } @ read_no_fpos disable optional_qualifier exists @ identifier read_f; identifier f, p, s, off; type ssize_t, size_t, loff_t; @@ ssize_t read_f(struct file *f, char *p, size_t s, loff_t *off) { ... when != off } @ write @ identifier write_f; identifier f, p, s, off; type ssize_t, size_t, loff_t; expression E; identifier func; @@ ssize_t write_f(struct file *f, const char *p, size_t s, loff_t *off) { <+... ( *off = E | *off += E | func(..., off, ...) | E = *off ) ...+> } @ write_no_fpos @ identifier write_f; identifier f, p, s, off; type ssize_t, size_t, loff_t; @@ ssize_t write_f(struct file *f, const char *p, size_t s, loff_t *off) { ... when != off } @ fops0 @ identifier fops; @@ struct file_operations fops = { ... }; @ has_llseek depends on fops0 @ identifier fops0.fops; identifier llseek_f; @@ struct file_operations fops = { ... .llseek = llseek_f, ... }; @ has_read depends on fops0 @ identifier fops0.fops; identifier read_f; @@ struct file_operations fops = { ... .read = read_f, ... }; @ has_write depends on fops0 @ identifier fops0.fops; identifier write_f; @@ struct file_operations fops = { ... .write = write_f, ... }; @ has_open depends on fops0 @ identifier fops0.fops; identifier open_f; @@ struct file_operations fops = { ... .open = open_f, ... }; // use no_llseek if we call nonseekable_open //////////////////////////////////////////// @ nonseekable1 depends on !has_llseek && has_open @ identifier fops0.fops; identifier nso ~= "nonseekable_open"; @@ struct file_operations fops = { ... .open = nso, ... +.llseek = no_llseek, /* nonseekable */ }; @ nonseekable2 depends on !has_llseek @ identifier fops0.fops; identifier open.open_f; @@ struct file_operations fops = { ... .open = open_f, ... +.llseek = no_llseek, /* open uses nonseekable */ }; // use seq_lseek for sequential files ///////////////////////////////////// @ seq depends on !has_llseek @ identifier fops0.fops; identifier sr ~= "seq_read"; @@ struct file_operations fops = { ... .read = sr, ... +.llseek = seq_lseek, /* we have seq_read */ }; // use default_llseek if there is a readdir /////////////////////////////////////////// @ fops1 depends on !has_llseek && !nonseekable1 && !nonseekable2 && !seq @ identifier fops0.fops; identifier readdir_e; @@ // any other fop is used that changes pos struct file_operations fops = { ... .readdir = readdir_e, ... +.llseek = default_llseek, /* readdir is present */ }; // use default_llseek if at least one of read/write touches f_pos ///////////////////////////////////////////////////////////////// @ fops2 depends on !fops1 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @ identifier fops0.fops; identifier read.read_f; @@ // read fops use offset struct file_operations fops = { ... .read = read_f, ... +.llseek = default_llseek, /* read accesses f_pos */ }; @ fops3 depends on !fops1 && !fops2 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @ identifier fops0.fops; identifier write.write_f; @@ // write fops use offset struct file_operations fops = { ... .write = write_f, ... 
+ .llseek = default_llseek, /* write accesses f_pos */ }; // Use noop_llseek if neither read nor write accesses f_pos /////////////////////////////////////////////////////////// @ fops4 depends on !fops1 && !fops2 && !fops3 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @ identifier fops0.fops; identifier read_no_fpos.read_f; identifier write_no_fpos.write_f; @@ // write fops use offset struct file_operations fops = { ... .write = write_f, .read = read_f, ... +.llseek = noop_llseek, /* read and write both use no f_pos */ }; @ depends on has_write && !has_read && !fops1 && !fops2 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @ identifier fops0.fops; identifier write_no_fpos.write_f; @@ struct file_operations fops = { ... .write = write_f, ... +.llseek = noop_llseek, /* write uses no f_pos */ }; @ depends on has_read && !has_write && !fops1 && !fops2 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @ identifier fops0.fops; identifier read_no_fpos.read_f; @@ struct file_operations fops = { ... .read = read_f, ... +.llseek = noop_llseek, /* read uses no f_pos */ }; @ depends on !has_read && !has_write && !fops1 && !fops2 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @ identifier fops0.fops; @@ struct file_operations fops = { ... +.llseek = noop_llseek, /* no read or write fn */ }; ===== End semantic patch ===== Signed-off-by: Arnd Bergmann Cc: Julia Lawall Cc: Christoph Hellwig --- arch/x86/kernel/apm_32.c | 1 + arch/x86/kernel/cpu/mcheck/mce-severity.c | 1 + arch/x86/kernel/cpu/mcheck/mce.c | 1 + arch/x86/kernel/kdebugfs.c | 1 + arch/x86/kernel/microcode_core.c | 1 + arch/x86/kernel/tlb_uv.c | 1 + arch/x86/xen/debugfs.c | 1 + 7 files changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 4c9c67bf09b..fbbc4dadecc 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -1926,6 +1926,7 @@ static const struct file_operations apm_bios_fops = { .unlocked_ioctl = do_ioctl, .open = do_open, .release = do_release, + .llseek = noop_llseek, }; static struct miscdevice apm_device = { diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index 8a85dd1b1aa..1e8d66c1336 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -192,6 +192,7 @@ static const struct file_operations severities_coverage_fops = { .release = seq_release, .read = seq_read, .write = severities_coverage_write, + .llseek = seq_lseek, }; static int __init severities_debugfs_init(void) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index ed41562909f..7a35b72d7c0 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1665,6 +1665,7 @@ struct file_operations mce_chrdev_ops = { .read = mce_read, .poll = mce_poll, .unlocked_ioctl = mce_ioctl, + .llseek = no_llseek, }; EXPORT_SYMBOL_GPL(mce_chrdev_ops); diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c index 8afd9f321f1..90fcf62854b 100644 --- a/arch/x86/kernel/kdebugfs.c +++ b/arch/x86/kernel/kdebugfs.c @@ -78,6 +78,7 @@ static int setup_data_open(struct inode *inode, struct file *file) static const struct file_operations fops_setup_data = { .read = setup_data_read, .open = setup_data_open, + .llseek = default_llseek, }; static int __init diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index fa6551d36c1..0b3d37e8360 100644 --- a/arch/x86/kernel/microcode_core.c 
+++ b/arch/x86/kernel/microcode_core.c @@ -232,6 +232,7 @@ static const struct file_operations microcode_fops = { .owner = THIS_MODULE, .write = microcode_write, .open = microcode_open, + .llseek = no_llseek, }; static struct miscdevice microcode_dev = { diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index 312ef029281..50ac949c7f1 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -1285,6 +1285,7 @@ static const struct file_operations tunables_fops = { .open = tunables_open, .read = tunables_read, .write = tunables_write, + .llseek = default_llseek, }; static int __init uv_ptc_init(void) diff --git a/arch/x86/xen/debugfs.c b/arch/x86/xen/debugfs.c index 1304bcec8ee..7c0fedd98ea 100644 --- a/arch/x86/xen/debugfs.c +++ b/arch/x86/xen/debugfs.c @@ -106,6 +106,7 @@ static const struct file_operations u32_array_fops = { .open = u32_array_open, .release= xen_array_release, .read = u32_array_read, + .llseek = no_llseek, }; struct dentry *xen_debugfs_create_u32_array(const char *name, mode_t mode, -- cgit v1.2.3-18-g5258 From 940b3c7b193ec54141976a8642c00582f423690c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2010 19:36:26 +0200 Subject: x86: sfi: Make local functions static Signed-off-by: Thomas Gleixner Cc: Len Brown --- arch/x86/kernel/sfi.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/sfi.c b/arch/x86/kernel/sfi.c index cb22acf3ed0..dd4c281ffe5 100644 --- a/arch/x86/kernel/sfi.c +++ b/arch/x86/kernel/sfi.c @@ -34,7 +34,7 @@ #ifdef CONFIG_X86_LOCAL_APIC static unsigned long sfi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; -void __init mp_sfi_register_lapic_address(unsigned long address) +static void __init mp_sfi_register_lapic_address(unsigned long address) { mp_lapic_addr = address; @@ -46,7 +46,7 @@ void __init mp_sfi_register_lapic_address(unsigned long address) } /* All CPUs enumerated by SFI must be present and enabled */ -void __cpuinit mp_sfi_register_lapic(u8 id) +static void __cpuinit mp_sfi_register_lapic(u8 id) { if (MAX_APICS - id <= 0) { pr_warning("Processor #%d invalid (max %d)\n", -- cgit v1.2.3-18-g5258 From 40ffa93791985ab300fd488072e9f37ccf72e88c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Oct 2010 21:08:14 +0200 Subject: x86: Remove stale pmtimer_64.c This file is unused since the apic unification in 2.6.29, but nobody noticed. 
Signed-off-by: Thomas Gleixner --- arch/x86/kernel/Makefile | 1 - arch/x86/kernel/pmtimer_64.c | 69 -------------------------------------------- 2 files changed, 70 deletions(-) delete mode 100644 arch/x86/kernel/pmtimer_64.c (limited to 'arch/x86') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 0925676266b..dd9a2e459c7 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -120,7 +120,6 @@ obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o # 64 bit specific files ifeq ($(CONFIG_X86_64),y) obj-$(CONFIG_X86_UV) += tlb_uv.o bios_uv.o uv_irq.o uv_sysfs.o uv_time.o - obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o obj-$(CONFIG_AUDIT) += audit_64.o obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o diff --git a/arch/x86/kernel/pmtimer_64.c b/arch/x86/kernel/pmtimer_64.c deleted file mode 100644 index b112406f199..00000000000 --- a/arch/x86/kernel/pmtimer_64.c +++ /dev/null @@ -1,69 +0,0 @@ -/* Ported over from i386 by AK, original copyright was: - * - * (C) Dominik Brodowski 2003 - * - * Driver to use the Power Management Timer (PMTMR) available in some - * southbridges as primary timing source for the Linux kernel. - * - * Based on parts of linux/drivers/acpi/hardware/hwtimer.c, timer_pit.c, - * timer_hpet.c, and on Arjan van de Ven's implementation for 2.4. - * - * This file is licensed under the GPL v2. - * - * Dropped all the hardware bug workarounds for now. Hopefully they - * are not needed on 64bit chipsets. - */ - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -static inline u32 cyc2us(u32 cycles) -{ - /* The Power Management Timer ticks at 3.579545 ticks per microsecond. - * 1 / PM_TIMER_FREQUENCY == 0.27936511 =~ 286/1024 [error: 0.024%] - * - * Even with HZ = 100, delta is at maximum 35796 ticks, so it can - * easily be multiplied with 286 (=0x11E) without having to fear - * u32 overflows. - */ - cycles *= 286; - return (cycles >> 10); -} - -static unsigned pmtimer_wait_tick(void) -{ - u32 a, b; - for (a = b = inl(pmtmr_ioport) & ACPI_PM_MASK; - a == b; - b = inl(pmtmr_ioport) & ACPI_PM_MASK) - cpu_relax(); - return b; -} - -/* note: wait time is rounded up to one tick */ -void pmtimer_wait(unsigned us) -{ - u32 a, b; - a = pmtimer_wait_tick(); - do { - b = inl(pmtmr_ioport); - cpu_relax(); - } while (cyc2us(b - a) < us); -} - -static int __init nopmtimer_setup(char *s) -{ - pmtmr_ioport = 0; - return 1; -} - -__setup("nopmtimer", nopmtimer_setup); -- cgit v1.2.3-18-g5258 From 80e7b19ae167197e84f378809b8ccddd0f99c1fd Mon Sep 17 00:00:00 2001 From: Daniel Drake Date: Thu, 23 Sep 2010 17:28:04 +0100 Subject: PCI: OLPC: Only enable PCI configuration type override on XO-1 This configuration type override is for XO-1 only and must not happen on XO-1.5. 
Acked-by: Andres Salomon Signed-off-by: Daniel Drake Signed-off-by: Jesse Barnes --- arch/x86/Kconfig | 2 +- arch/x86/kernel/olpc.c | 6 ++++-- arch/x86/pci/olpc.c | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cea0cd9a316..0ed4c9bfcd1 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1900,7 +1900,7 @@ config PCI_GODIRECT bool "Direct" config PCI_GOOLPC - bool "OLPC" + bool "OLPC XO-1" depends on OLPC config PCI_GOANY diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c index 0e0cdde519b..635888cf050 100644 --- a/arch/x86/kernel/olpc.c +++ b/arch/x86/kernel/olpc.c @@ -242,8 +242,10 @@ static int __init olpc_init(void) (unsigned char *) &olpc_platform_info.ecver, 1); #ifdef CONFIG_PCI_OLPC - /* If the VSA exists let it emulate PCI, if not emulate in kernel */ - if (!cs5535_has_vsa2()) + /* If the VSA exists let it emulate PCI, if not emulate in kernel. + * XO-1 only. */ + if (olpc_platform_info.boardrev < olpc_board_pre(0xd0) && + !cs5535_has_vsa2()) x86_init.pci.arch_init = pci_olpc_init; #endif diff --git a/arch/x86/pci/olpc.c b/arch/x86/pci/olpc.c index b34815408f5..13700ec8e2e 100644 --- a/arch/x86/pci/olpc.c +++ b/arch/x86/pci/olpc.c @@ -304,7 +304,7 @@ static struct pci_raw_ops pci_olpc_conf = { int __init pci_olpc_init(void) { - printk(KERN_INFO "PCI: Using configuration type OLPC\n"); + printk(KERN_INFO "PCI: Using configuration type OLPC XO-1\n"); raw_pci_ops = &pci_olpc_conf; is_lx = is_geode_lx(); return 0; -- cgit v1.2.3-18-g5258 From 25143fd1270d28782ae0620aa86ef5f8c14030fd Mon Sep 17 00:00:00 2001 From: Seth Heasley Date: Fri, 10 Sep 2010 16:36:39 -0700 Subject: x86/PCI: irq and pci_ids patch for Intel Patsburg DeviceIDs This patch adds the LPC Controller DeviceIDs for the Intel Patsburg PCH. Signed-off-by: Seth Heasley Signed-off-by: Jesse Barnes --- arch/x86/pci/irq.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index f547ee05f71..ee7fc8fc8a8 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -589,6 +589,7 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route case PCI_DEVICE_ID_INTEL_ICH10_1: case PCI_DEVICE_ID_INTEL_ICH10_2: case PCI_DEVICE_ID_INTEL_ICH10_3: + case PCI_DEVICE_ID_INTEL_PBG_LPC: r->name = "PIIX/ICH"; r->get = pirq_piix_get; r->set = pirq_piix_set; -- cgit v1.2.3-18-g5258 From cb04e95bdd0bfd618ab731c84a3ab56b56974df8 Mon Sep 17 00:00:00 2001 From: Seth Heasley Date: Mon, 4 Oct 2010 13:27:14 -0700 Subject: PCI: update Intel chipset names and defines This patch updates the defines for Intel devices in include/linux/pci_ids.h, referenced in arch/x86/pci/irq.c and drivers/i2c/busses/i2c-i801.c, reflecting approved legal branding, and using fuller code-names for products under development. 
Acked-by: Jean Delvare Signed-off-by: Seth Heasley Signed-off-by: Jesse Barnes --- arch/x86/pci/irq.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index ee7fc8fc8a8..9f9bfb705cf 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -584,28 +584,28 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route case PCI_DEVICE_ID_INTEL_ICH9_3: case PCI_DEVICE_ID_INTEL_ICH9_4: case PCI_DEVICE_ID_INTEL_ICH9_5: - case PCI_DEVICE_ID_INTEL_TOLAPAI_0: + case PCI_DEVICE_ID_INTEL_EP80579_0: case PCI_DEVICE_ID_INTEL_ICH10_0: case PCI_DEVICE_ID_INTEL_ICH10_1: case PCI_DEVICE_ID_INTEL_ICH10_2: case PCI_DEVICE_ID_INTEL_ICH10_3: - case PCI_DEVICE_ID_INTEL_PBG_LPC: + case PCI_DEVICE_ID_INTEL_PATSBURG_LPC: r->name = "PIIX/ICH"; r->get = pirq_piix_get; r->set = pirq_piix_set; return 1; } - if ((device >= PCI_DEVICE_ID_INTEL_PCH_LPC_MIN) && - (device <= PCI_DEVICE_ID_INTEL_PCH_LPC_MAX)) { + if ((device >= PCI_DEVICE_ID_INTEL_5_3400_SERIES_LPC_MIN) && + (device <= PCI_DEVICE_ID_INTEL_5_3400_SERIES_LPC_MAX)) { r->name = "PIIX/ICH"; r->get = pirq_piix_get; r->set = pirq_piix_set; return 1; } - if ((device >= PCI_DEVICE_ID_INTEL_CPT_LPC_MIN) && - (device <= PCI_DEVICE_ID_INTEL_CPT_LPC_MAX)) { + if ((device >= PCI_DEVICE_ID_INTEL_COUGARPOINT_LPC_MIN) && + (device <= PCI_DEVICE_ID_INTEL_COUGARPOINT_LPC_MAX)) { r->name = "PIIX/ICH"; r->get = pirq_piix_get; r->set = pirq_piix_set; -- cgit v1.2.3-18-g5258 From 1ca98fa652bb5dc3c8793335db9ccc5d0f2e1f65 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Mon, 4 Oct 2010 12:49:24 -0600 Subject: x86/PCI: MMCONFIG: fix region end calculation The end of an MMCONFIG region depends on the ending bus number, not on the number of buses the region covers. We previously computed the wrong ending address whenever the starting bus number was non-zero, e.g.,: MMCONFIG for [bus 00-1f] at [mem 0xe0000000-0xe1ffffff] (base 0xe0000000) MMCONFIG for [bus 20-3f] at [mem 0xe2000000-0xe1ffffff] (base 0xe0000000) The correct regions are: MMCONFIG for [bus 00-1f] at [mem 0xe0000000-0xe1ffffff] (base 0xe0000000) MMCONFIG for [bus 20-3f] at [mem 0xe2000000-0xe3ffffff] (base 0xe0000000) Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- arch/x86/pci/mmconfig-shared.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index a918553ebc7..e282886616a 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -65,7 +65,6 @@ static __init struct pci_mmcfg_region *pci_mmconfig_add(int segment, int start, int end, u64 addr) { struct pci_mmcfg_region *new; - int num_buses; struct resource *res; if (addr == 0) @@ -82,10 +81,9 @@ static __init struct pci_mmcfg_region *pci_mmconfig_add(int segment, int start, list_add_sorted(new); - num_buses = end - start + 1; res = &new->res; res->start = addr + PCI_MMCFG_BUS_OFFSET(start); - res->end = addr + PCI_MMCFG_BUS_OFFSET(num_buses) - 1; + res->end = addr + PCI_MMCFG_BUS_OFFSET(end + 1) - 1; res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; snprintf(new->name, PCI_MMCFG_RESOURCE_NAME_LEN, "PCI MMCONFIG %04x [bus %02x-%02x]", segment, start, end); -- cgit v1.2.3-18-g5258 From 50a23e6eec6f20d55a3a920e47adb455bff6046e Mon Sep 17 00:00:00 2001 From: "Justin P. Mattock" Date: Sat, 16 Oct 2010 10:36:23 -0700 Subject: Update broken web addresses in arch directory. 
The patch below updates broken web addresses in the arch directory. Signed-off-by: Justin P. Mattock Signed-off-by: Maciej W. Rozycki Cc: Finn Thain Cc: Randy Dunlap Reviewed-by: Finn Thain Signed-off-by: Jiri Kosina --- arch/x86/kernel/apm_32.c | 4 ++-- arch/x86/kernel/microcode_core.c | 2 +- arch/x86/kernel/microcode_intel.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 4c9c67bf09b..9fed1cc7ba9 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -189,8 +189,8 @@ * Intel Order Number 241704-001. Microsoft Part Number 781-110-X01. * * [This document is available free from Intel by calling 800.628.8686 (fax - * 916.356.6100) or 800.548.4725; or via anonymous ftp from - * ftp://ftp.intel.com/pub/IAL/software_specs/apmv11.doc. It is also + * 916.356.6100) or 800.548.4725; or from + * http://www.microsoft.com/whdc/archive/amp_12.mspx It is also * available from Microsoft by calling 206.882.8080.] * * APM 1.2 Reference: diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index fa6551d36c1..b9c5c548224 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -12,7 +12,7 @@ * Software Developer's Manual * Order Number 253668 or free download from: * - * http://developer.intel.com/design/pentium4/manuals/253668.htm + * http://developer.intel.com/Assets/PDF/manual/253668.pdf * * For more information, go to http://www.urbanmyth.org/microcode * diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c index 356170262a9..dcb65cc0a05 100644 --- a/arch/x86/kernel/microcode_intel.c +++ b/arch/x86/kernel/microcode_intel.c @@ -12,7 +12,7 @@ * Software Developer's Manual * Order Number 253668 or free download from: * - * http://developer.intel.com/design/pentium4/manuals/253668.htm + * http://developer.intel.com/Assets/PDF/manual/253668.pdf * * For more information, go to http://www.urbanmyth.org/microcode * -- cgit v1.2.3-18-g5258 From 23ace955c22cb9bdf703e4bdc9bf7379166113cd Mon Sep 17 00:00:00 2001 From: Alex Nixon Date: Mon, 9 Feb 2009 12:05:46 -0800 Subject: xen: Don't disable the I/O space If a guest domain wants to access PCI devices through the frontend driver (coming later in the patch series), it will need access to the I/O space. [ Impact: Allow for domU IO access, preparing for pci passthrough ] Signed-off-by: Alex Nixon Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/setup.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 328b0030542..c413132702f 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -260,7 +260,5 @@ void __init xen_arch_setup(void) pm_idle = xen_idle; - paravirt_disable_iospace(); - fiddle_vdso(); } -- cgit v1.2.3-18-g5258 From d8e0420603cf1ce9cb459c00ea0b7337de41b968 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 9 Feb 2009 12:05:46 -0800 Subject: xen: define BIOVEC_PHYS_MERGEABLE() Impact: allow Xen control of bio merging When running in Xen domain with device access, we need to make sure the block subsystem doesn't merge requests across pages which aren't machine physically contiguous. To do this, we define our own BIOVEC_PHYS_MERGEABLE. When CONFIG_XEN isn't enabled, or we're not running in a Xen domain, this has identical behaviour to the normal implementation. 
When running under Xen, we also make sure the underlying machine pages are the same or adjacent. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/include/asm/io.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 30a3e977612..0ad29d40156 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -41,6 +41,8 @@ #include #include +#include + #define build_mmio_read(name, size, type, reg, barrier) \ static inline type name(const volatile void __iomem *addr) \ { type ret; asm volatile("mov" size " %1,%0":reg (ret) \ @@ -349,6 +351,17 @@ extern void __iomem *early_memremap(resource_size_t phys_addr, extern void early_iounmap(void __iomem *addr, unsigned long size); extern void fixup_early_ioremap(void); +#ifdef CONFIG_XEN +struct bio_vec; + +extern bool xen_biovec_phys_mergeable(const struct bio_vec *vec1, + const struct bio_vec *vec2); + +#define BIOVEC_PHYS_MERGEABLE(vec1, vec2) \ + (__BIOVEC_PHYS_MERGEABLE(vec1, vec2) && \ + (!xen_domain() || xen_biovec_phys_mergeable(vec1, vec2))) +#endif /* CONFIG_XEN */ + #define IO_SPACE_LIMIT 0xffff #endif /* _ASM_X86_IO_H */ -- cgit v1.2.3-18-g5258 From 7b586d71858091f0958e5808b7e3d5390c2ae47d Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 12 Feb 2009 17:22:49 -0800 Subject: x86/io_apic: add get_nr_irqs_gsi() Impact: new interface to get max GSI Add get_nr_irqs_gsi() to return nr_irqs_gsi. Xen will use this to determine how many irqs it needs to reserve for hardware irqs. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Konrad Rzeszutek Wilk Reviewed-by: "H. Peter Anvin" Acked-by: Thomas Gleixner Cc: x86@kernel.org Cc: Jesse Barnes --- arch/x86/include/asm/io_apic.h | 1 + arch/x86/kernel/apic/io_apic.c | 5 +++++ 2 files changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index c8be4566c3d..a6b28d017c2 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -169,6 +169,7 @@ extern void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries); extern int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries); extern void probe_nr_irqs_gsi(void); +extern int get_nr_irqs_gsi(void); extern void setup_ioapic_ids_from_mpc(void); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 20e47e04539..44bb914a42b 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3649,6 +3649,11 @@ void __init probe_nr_irqs_gsi(void) printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi); } +int get_nr_irqs_gsi(void) +{ + return nr_irqs_gsi; +} + #ifdef CONFIG_SPARSE_IRQ int __init arch_probe_nr_irqs(void) { -- cgit v1.2.3-18-g5258 From 44de3395a4bb61341dfb7b3b7c94edfddeabae4b Mon Sep 17 00:00:00 2001 From: Alex Nixon Date: Thu, 18 Mar 2010 14:28:12 -0400 Subject: x86/PCI: Clean up pci_cache_line_size Separate out x86 cache_line_size initialisation code into its own function (so it can be shared by Xen later in this patch series) [ Impact: cleanup ] Signed-off-by: Alex Nixon Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Konrad Rzeszutek Wilk Reviewed-by: "H. 
Peter Anvin" Reviewed-by: Matthew Wilcox Reviewed-by: Jesse Barnes Cc: x86@kernel.org --- arch/x86/include/asm/pci_x86.h | 1 + arch/x86/pci/common.c | 17 ++++++++++------- 2 files changed, 11 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index 49c7219826f..704526734be 100644 --- a/arch/x86/include/asm/pci_x86.h +++ b/arch/x86/include/asm/pci_x86.h @@ -47,6 +47,7 @@ enum pci_bf_sort_state { extern unsigned int pcibios_max_latency; void pcibios_resource_survey(void); +void pcibios_set_cache_line_size(void); /* pci-pc.c */ diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index a0772af64ef..f7c8a399978 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -421,16 +421,10 @@ struct pci_bus * __devinit pcibios_scan_root(int busnum) return bus; } - -int __init pcibios_init(void) +void __init pcibios_set_cache_line_size(void) { struct cpuinfo_x86 *c = &boot_cpu_data; - if (!raw_pci_ops) { - printk(KERN_WARNING "PCI: System does not support PCI\n"); - return 0; - } - /* * Set PCI cacheline size to that of the CPU if the CPU has reported it. * (For older CPUs that don't support cpuid, we se it to 32 bytes @@ -445,7 +439,16 @@ int __init pcibios_init(void) pci_dfl_cache_line_size = 32 >> 2; printk(KERN_DEBUG "PCI: Unknown cacheline size. Setting to 32 bytes\n"); } +} + +int __init pcibios_init(void) +{ + if (!raw_pci_ops) { + printk(KERN_WARNING "PCI: System does not support PCI\n"); + return 0; + } + pcibios_set_cache_line_size(); pcibios_resource_survey(); if (pci_bf_sort >= pci_force_bf) -- cgit v1.2.3-18-g5258 From 5ee01f49c963d5e0b530344f86535ecb7f672064 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 18 Mar 2010 14:31:30 -0400 Subject: x86/PCI: make sure _PAGE_IOMAP it set on pci mappings When mapping pci space via /sys or /proc, make sure we're really doing a hardware mapping by setting _PAGE_IOMAP. [ Impact: bugfix; make PCI mappings map the right pages ] Signed-off-by: Jeremy Fitzhardinge Reviewed-by: Konrad Rzeszutek Wilk Reviewed-by: "H. Peter Anvin" Reviewed-by: Matthew Wilcox Acked-by: Jesse Barnes Cc: x86@kernel.org --- arch/x86/pci/i386.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index 55253095be8..8379c2c3d07 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -311,6 +311,8 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, */ prot |= _PAGE_CACHE_UC_MINUS; + prot |= _PAGE_IOMAP; /* creating a mapping for IO */ + vma->vm_page_prot = __pgprot(prot); if (io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, -- cgit v1.2.3-18-g5258 From 294ee6f89cfd629e276f632a6003a0fad7785dce Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Wed, 6 Oct 2010 16:12:28 -0400 Subject: x86: Introduce x86_msi_ops Introduce an x86 specific indirect mechanism to setup MSIs. The MSI setup functions become function pointers in an x86_msi_ops struct, that defaults to the implementation in io_apic.c and msi.c. [v2: Use HAVE_DEFAULT_* knobs] Signed-off-by: Stefano Stabellini Reviewed-by: Konrad Rzeszutek Wilk Cc: Thomas Gleixner Cc: "H. 
Peter Anvin" Cc: x86@kernel.org Cc: Jesse Barnes --- arch/x86/include/asm/pci.h | 33 +++++++++++++++++++++++++++++++-- arch/x86/include/asm/x86_init.h | 9 +++++++++ arch/x86/kernel/apic/io_apic.c | 4 ++-- arch/x86/kernel/x86_init.c | 7 +++++++ 4 files changed, 49 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index d395540ff89..ca0437c714b 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -7,6 +7,7 @@ #include #include #include +#include #ifdef __KERNEL__ @@ -94,8 +95,36 @@ static inline void early_quirks(void) { } extern void pci_iommu_alloc(void); -/* MSI arch hook */ -#define arch_setup_msi_irqs arch_setup_msi_irqs +#ifdef CONFIG_PCI_MSI +/* MSI arch specific hooks */ +static inline int x86_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) +{ + return x86_msi.setup_msi_irqs(dev, nvec, type); +} + +static inline void x86_teardown_msi_irqs(struct pci_dev *dev) +{ + x86_msi.teardown_msi_irqs(dev); +} + +static inline void x86_teardown_msi_irq(unsigned int irq) +{ + x86_msi.teardown_msi_irq(irq); +} +#define arch_setup_msi_irqs x86_setup_msi_irqs +#define arch_teardown_msi_irqs x86_teardown_msi_irqs +#define arch_teardown_msi_irq x86_teardown_msi_irq +/* implemented in arch/x86/kernel/apic/io_apic. */ +int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type); +void native_teardown_msi_irq(unsigned int irq); +/* default to the implementation in drivers/lib/msi.c */ +#define HAVE_DEFAULT_MSI_TEARDOWN_IRQS +void default_teardown_msi_irqs(struct pci_dev *dev); +#else +#define native_setup_msi_irqs NULL +#define native_teardown_msi_irq NULL +#define default_teardown_msi_irqs NULL +#endif #define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys) diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index baa579c8e03..64642ad019f 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -154,9 +154,18 @@ struct x86_platform_ops { int (*i8042_detect)(void); }; +struct pci_dev; + +struct x86_msi_ops { + int (*setup_msi_irqs)(struct pci_dev *dev, int nvec, int type); + void (*teardown_msi_irq)(unsigned int irq); + void (*teardown_msi_irqs)(struct pci_dev *dev); +}; + extern struct x86_init_ops x86_init; extern struct x86_cpuinit_ops x86_cpuinit; extern struct x86_platform_ops x86_platform; +extern struct x86_msi_ops x86_msi; extern void x86_init_noop(void); extern void x86_init_uint_noop(unsigned int unused); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 44bb914a42b..0885a412073 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3330,7 +3330,7 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq) return 0; } -int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) +int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) { int node, ret, sub_handle, index = 0; unsigned int irq, irq_want; @@ -3388,7 +3388,7 @@ error: return ret; } -void arch_teardown_msi_irq(unsigned int irq) +void native_teardown_msi_irq(unsigned int irq) { destroy_irq(irq); } diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index cd6da6bf3ec..ceb2911aa43 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -6,10 +6,12 @@ #include #include #include +#include #include #include #include +#include #include #include #include @@ -99,3 +101,8 @@ struct x86_platform_ops x86_platform = { }; EXPORT_SYMBOL_GPL(x86_platform); 
+struct x86_msi_ops x86_msi = { + .setup_msi_irqs = native_setup_msi_irqs, + .teardown_msi_irq = native_teardown_msi_irq, + .teardown_msi_irqs = default_teardown_msi_irqs, +}; -- cgit v1.2.3-18-g5258 From b5401a96b59475c1c878439caecb8c521bdfd4ad Mon Sep 17 00:00:00 2001 From: Alex Nixon Date: Thu, 18 Mar 2010 16:31:34 -0400 Subject: xen/x86/PCI: Add support for the Xen PCI subsystem The frontend stub lives in arch/x86/pci/xen.c, alongside other sub-arch PCI init code (e.g. olpc.c). It provides a mechanism for Xen PCI frontend to setup/destroy legacy interrupts, MSI/MSI-X, and PCI configuration operations. [ Impact: add core of Xen PCI support ] [ v2: Removed the IOMMU code and only focusing on PCI.] [ v3: removed usage of pci_scan_all_fns as that does not exist] [ v4: introduced pci_xen value to fix compile warnings] [ v5: squished fixes+features in one patch, changed Reviewed-by to Ccs] [ v7: added Acked-by] Signed-off-by: Alex Nixon Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ian Campbell Signed-off-by: Konrad Rzeszutek Wilk Signed-off-by: Stefano Stabellini Acked-by: Jesse Barnes Cc: "H. Peter Anvin" Cc: Matthew Wilcox Cc: Qing He Cc: Thomas Gleixner Cc: x86@kernel.org --- arch/x86/Kconfig | 5 ++ arch/x86/include/asm/xen/pci.h | 53 +++++++++++++++ arch/x86/pci/Makefile | 1 + arch/x86/pci/xen.c | 147 +++++++++++++++++++++++++++++++++++++++++ arch/x86/xen/enlighten.c | 3 + 5 files changed, 209 insertions(+) create mode 100644 arch/x86/include/asm/xen/pci.h create mode 100644 arch/x86/pci/xen.c (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 8cc510874e1..74ea59d3407 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1898,6 +1898,11 @@ config PCI_OLPC def_bool y depends on PCI && OLPC && (PCI_GOOLPC || PCI_GOANY) +config PCI_XEN + def_bool y + depends on PCI && XEN + select SWIOTLB_XEN + config PCI_DOMAINS def_bool y depends on PCI diff --git a/arch/x86/include/asm/xen/pci.h b/arch/x86/include/asm/xen/pci.h new file mode 100644 index 00000000000..449c82f7167 --- /dev/null +++ b/arch/x86/include/asm/xen/pci.h @@ -0,0 +1,53 @@ +#ifndef _ASM_X86_XEN_PCI_H +#define _ASM_X86_XEN_PCI_H + +#if defined(CONFIG_PCI_XEN) +extern int __init pci_xen_init(void); +#define pci_xen 1 +#else +#define pci_xen 0 +#define pci_xen_init (0) +#endif + +#if defined(CONFIG_PCI_MSI) +#if defined(CONFIG_PCI_XEN) +/* The drivers/pci/xen-pcifront.c sets this structure to + * its own functions. 
+ */ +struct xen_pci_frontend_ops { + int (*enable_msi)(struct pci_dev *dev, int **vectors); + void (*disable_msi)(struct pci_dev *dev); + int (*enable_msix)(struct pci_dev *dev, int **vectors, int nvec); + void (*disable_msix)(struct pci_dev *dev); +}; + +extern struct xen_pci_frontend_ops *xen_pci_frontend; + +static inline int xen_pci_frontend_enable_msi(struct pci_dev *dev, + int **vectors) +{ + if (xen_pci_frontend && xen_pci_frontend->enable_msi) + return xen_pci_frontend->enable_msi(dev, vectors); + return -ENODEV; +} +static inline void xen_pci_frontend_disable_msi(struct pci_dev *dev) +{ + if (xen_pci_frontend && xen_pci_frontend->disable_msi) + xen_pci_frontend->disable_msi(dev); +} +static inline int xen_pci_frontend_enable_msix(struct pci_dev *dev, + int **vectors, int nvec) +{ + if (xen_pci_frontend && xen_pci_frontend->enable_msix) + return xen_pci_frontend->enable_msix(dev, vectors, nvec); + return -ENODEV; +} +static inline void xen_pci_frontend_disable_msix(struct pci_dev *dev) +{ + if (xen_pci_frontend && xen_pci_frontend->disable_msix) + xen_pci_frontend->disable_msix(dev); +} +#endif /* CONFIG_PCI_XEN */ +#endif /* CONFIG_PCI_MSI */ + +#endif /* _ASM_X86_XEN_PCI_H */ diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile index a0207a7fdf3..effd96e33f1 100644 --- a/arch/x86/pci/Makefile +++ b/arch/x86/pci/Makefile @@ -4,6 +4,7 @@ obj-$(CONFIG_PCI_BIOS) += pcbios.o obj-$(CONFIG_PCI_MMCONFIG) += mmconfig_$(BITS).o direct.o mmconfig-shared.o obj-$(CONFIG_PCI_DIRECT) += direct.o obj-$(CONFIG_PCI_OLPC) += olpc.o +obj-$(CONFIG_PCI_XEN) += xen.o obj-y += fixup.o obj-$(CONFIG_ACPI) += acpi.o diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c new file mode 100644 index 00000000000..b19c873d8d0 --- /dev/null +++ b/arch/x86/pci/xen.c @@ -0,0 +1,147 @@ +/* + * Xen PCI Frontend Stub - puts some "dummy" functions in to the Linux + * x86 PCI core to support the Xen PCI Frontend + * + * Author: Ryan Wilson + */ +#include +#include +#include +#include + +#include +#include + +#include + +#include +#include + +#if defined(CONFIG_PCI_MSI) +#include + +struct xen_pci_frontend_ops *xen_pci_frontend; +EXPORT_SYMBOL_GPL(xen_pci_frontend); + +/* + * For MSI interrupts we have to use drivers/xen/event.s functions to + * allocate an irq_desc and setup the right */ + + +static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) +{ + int irq, ret, i; + struct msi_desc *msidesc; + int *v; + + v = kzalloc(sizeof(int) * max(1, nvec), GFP_KERNEL); + if (!v) + return -ENOMEM; + + if (!xen_initial_domain()) { + if (type == PCI_CAP_ID_MSIX) + ret = xen_pci_frontend_enable_msix(dev, &v, nvec); + else + ret = xen_pci_frontend_enable_msi(dev, &v); + if (ret) + goto error; + } + i = 0; + list_for_each_entry(msidesc, &dev->msi_list, list) { + irq = xen_allocate_pirq(v[i], 0, /* not sharable */ + (type == PCI_CAP_ID_MSIX) ? 
+ "pcifront-msi-x" : "pcifront-msi"); + if (irq < 0) + return -1; + + ret = set_irq_msi(irq, msidesc); + if (ret) + goto error_while; + i++; + } + kfree(v); + return 0; + +error_while: + unbind_from_irqhandler(irq, NULL); +error: + if (ret == -ENODEV) + dev_err(&dev->dev, "Xen PCI frontend has not registered" \ + " MSI/MSI-X support!\n"); + + kfree(v); + return ret; +} + +static void xen_teardown_msi_irqs(struct pci_dev *dev) +{ + /* Only do this when were are in non-privileged mode.*/ + if (!xen_initial_domain()) { + struct msi_desc *msidesc; + + msidesc = list_entry(dev->msi_list.next, struct msi_desc, list); + if (msidesc->msi_attrib.is_msix) + xen_pci_frontend_disable_msix(dev); + else + xen_pci_frontend_disable_msi(dev); + } + +} + +static void xen_teardown_msi_irq(unsigned int irq) +{ + xen_destroy_irq(irq); +} +#endif + +static int xen_pcifront_enable_irq(struct pci_dev *dev) +{ + int rc; + int share = 1; + + dev_info(&dev->dev, "Xen PCI enabling IRQ: %d\n", dev->irq); + + if (dev->irq < 0) + return -EINVAL; + + if (dev->irq < NR_IRQS_LEGACY) + share = 0; + + rc = xen_allocate_pirq(dev->irq, share, "pcifront"); + if (rc < 0) { + dev_warn(&dev->dev, "Xen PCI IRQ: %d, failed to register:%d\n", + dev->irq, rc); + return rc; + } + return 0; +} + +int __init pci_xen_init(void) +{ + if (!xen_pv_domain() || xen_initial_domain()) + return -ENODEV; + + printk(KERN_INFO "PCI: setting up Xen PCI frontend stub\n"); + + pcibios_set_cache_line_size(); + + pcibios_enable_irq = xen_pcifront_enable_irq; + pcibios_disable_irq = NULL; + +#ifdef CONFIG_ACPI + /* Keep ACPI out of the picture */ + acpi_noirq = 1; +#endif + +#ifdef CONFIG_ISAPNP + /* Stop isapnp from probing */ + isapnp_disable = 1; +#endif + +#ifdef CONFIG_PCI_MSI + x86_msi.setup_msi_irqs = xen_setup_msi_irqs; + x86_msi.teardown_msi_irq = xen_teardown_msi_irq; + x86_msi.teardown_msi_irqs = xen_teardown_msi_irqs; +#endif + return 0; +} diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 7d46c844141..1ccfa1bf0f8 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include #include @@ -1220,6 +1221,8 @@ asmlinkage void __init xen_start_kernel(void) add_preferred_console("xenboot", 0, NULL); add_preferred_console("tty", 0, NULL); add_preferred_console("hvc", 0, NULL); + if (pci_xen) + x86_init.pci.arch_init = pci_xen_init; } else { /* Make sure ACS will be enabled */ pci_request_acs(); -- cgit v1.2.3-18-g5258 From 74226b8c8a0b10841129916191205095af928da5 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 19 Aug 2010 13:34:58 -0400 Subject: xen/pci: Request ACS when Xen-SWIOTLB is activated. It used to done in the Xen startup code but that is not really appropiate. 
[v2: Update Kconfig with PCI requirement] Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/pci-swiotlb-xen.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c index a013ec9d0c5..be4d80a6fae 100644 --- a/arch/x86/xen/pci-swiotlb-xen.c +++ b/arch/x86/xen/pci-swiotlb-xen.c @@ -1,6 +1,7 @@ /* Glue code to lib/swiotlb-xen.c */ #include +#include #include #include @@ -54,5 +55,8 @@ void __init pci_xen_swiotlb_init(void) if (xen_swiotlb) { xen_swiotlb_init(1); dma_ops = &xen_swiotlb_dma_ops; + + /* Make sure ACS will be enabled */ + pci_request_acs(); } } -- cgit v1.2.3-18-g5258 From ba0cef3d149ce4db293c572bf36ed352b11ce7b9 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Fri, 15 Oct 2010 15:15:01 +0200 Subject: perf_events: Fix bogus AMD64 generic TLB events PERF_COUNT_HW_CACHE_DTLB:READ:MISS had a bogus umask value of 0 which counts nothing. Needed to be 0x7 (to count all possibilities). PERF_COUNT_HW_CACHE_ITLB:READ:MISS had a bogus umask value of 0 which counts nothing. Needed to be 0x3 (to count all possibilities). Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Cc: Robert Richter Cc: # as far back as it applies LKML-Reference: <4cb85478.41e9d80a.44e2.3f00@mx.google.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_amd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index c2897b7b4a3..46d58448c3a 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -52,7 +52,7 @@ static __initconst const u64 amd_hw_cache_event_ids [ C(DTLB) ] = { [ C(OP_READ) ] = { [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ - [ C(RESULT_MISS) ] = 0x0046, /* L1 DTLB and L2 DLTB Miss */ + [ C(RESULT_MISS) ] = 0x0746, /* L1_DTLB_AND_L2_DLTB_MISS.ALL */ }, [ C(OP_WRITE) ] = { [ C(RESULT_ACCESS) ] = 0, @@ -66,7 +66,7 @@ static __initconst const u64 amd_hw_cache_event_ids [ C(ITLB) ] = { [ C(OP_READ) ] = { [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes */ - [ C(RESULT_MISS) ] = 0x0085, /* Instr. fetch ITLB misses */ + [ C(RESULT_MISS) ] = 0x0385, /* L1_ITLB_AND_L2_ITLB_MISS.ALL */ }, [ C(OP_WRITE) ] = { [ C(RESULT_ACCESS) ] = -1, -- cgit v1.2.3-18-g5258 From e360adbe29241a0194e10e20595360dd7b98a2b3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 14 Oct 2010 14:01:34 +0800 Subject: irq_work: Add generic hardirq context callbacks Provide a mechanism that allows running code in IRQ context. It is most useful for NMI code that needs to interact with the rest of the system -- like wakeup a task to drain buffers. Perf currently has such a mechanism, so extract that and provide it as a generic feature, independent of perf so that others may also benefit. The IRQ context callback is generated through self-IPIs where possible, or on architectures like powerpc the decrementer (the built-in timer facility) is set to generate an interrupt immediately. Architectures that don't have anything like this get to do with a callback from the timer tick. These architectures can call irq_work_run() at the tail of any IRQ handlers that might enqueue such work (like the perf IRQ handler) to avoid undue latencies in processing the work. 
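For illustration, a minimal user of the new facility might look like the sketch below, assuming the generic header added together with this patch exposes init_irq_work() and irq_work_queue(); the wakeup_work, handle_wakeup and drain_task names are hypothetical and not part of the patch:

	#include <linux/init.h>
	#include <linux/irq_work.h>
	#include <linux/sched.h>

	static struct task_struct *drain_task;	/* consumer thread, created elsewhere */
	static struct irq_work wakeup_work;

	/* Runs later in hardirq context (self-IPI, or the timer-tick fallback). */
	static void handle_wakeup(struct irq_work *work)
	{
		wake_up_process(drain_task);	/* allowed here, not from NMI */
	}

	/* Called from an NMI handler: defer the wakeup to IRQ context. */
	static void request_wakeup_from_nmi(void)
	{
		irq_work_queue(&wakeup_work);
	}

	static int __init wakeup_example_init(void)
	{
		init_irq_work(&wakeup_work, handle_wakeup);
		return 0;
	}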
Signed-off-by: Peter Zijlstra Acked-by: Kyle McMartin Acked-by: Martin Schwidefsky [ various fixes ] Signed-off-by: Huang Ying LKML-Reference: <1287036094.7768.291.camel@yhuang-dev> Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 1 + arch/x86/include/asm/entry_arch.h | 4 ++-- arch/x86/include/asm/hardirq.h | 2 +- arch/x86/include/asm/hw_irq.h | 2 +- arch/x86/include/asm/irq_vectors.h | 4 ++-- arch/x86/kernel/Makefile | 1 + arch/x86/kernel/cpu/perf_event.c | 19 ------------------- arch/x86/kernel/entry_64.S | 6 +++--- arch/x86/kernel/irq.c | 8 ++++---- arch/x86/kernel/irq_work.c | 30 ++++++++++++++++++++++++++++++ arch/x86/kernel/irqinit.c | 6 +++--- 11 files changed, 48 insertions(+), 35 deletions(-) create mode 100644 arch/x86/kernel/irq_work.c (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 9815221976a..fd227d6b8d9 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -25,6 +25,7 @@ config X86 select HAVE_IDE select HAVE_OPROFILE select HAVE_PERF_EVENTS if (!M386 && !M486) + select HAVE_IRQ_WORK select HAVE_IOREMAP_PROT select HAVE_KPROBES select ARCH_WANT_OPTIONAL_GPIOLIB diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h index 8e8ec663a98..b8e96a18676 100644 --- a/arch/x86/include/asm/entry_arch.h +++ b/arch/x86/include/asm/entry_arch.h @@ -49,8 +49,8 @@ BUILD_INTERRUPT(apic_timer_interrupt,LOCAL_TIMER_VECTOR) BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR) BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR) -#ifdef CONFIG_PERF_EVENTS -BUILD_INTERRUPT(perf_pending_interrupt, LOCAL_PENDING_VECTOR) +#ifdef CONFIG_IRQ_WORK +BUILD_INTERRUPT(irq_work_interrupt, IRQ_WORK_VECTOR) #endif #ifdef CONFIG_X86_THERMAL_VECTOR diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index aeab29aee61..55e4de613f0 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -14,7 +14,7 @@ typedef struct { #endif unsigned int x86_platform_ipis; /* arch dependent */ unsigned int apic_perf_irqs; - unsigned int apic_pending_irqs; + unsigned int apic_irq_work_irqs; #ifdef CONFIG_SMP unsigned int irq_resched_count; unsigned int irq_call_count; diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 46c0fe05f23..3a54a1ca1a0 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -29,7 +29,7 @@ extern void apic_timer_interrupt(void); extern void x86_platform_ipi(void); extern void error_interrupt(void); -extern void perf_pending_interrupt(void); +extern void irq_work_interrupt(void); extern void spurious_interrupt(void); extern void thermal_interrupt(void); diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index e2ca3009255..6af0894dafb 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -114,9 +114,9 @@ #define X86_PLATFORM_IPI_VECTOR 0xed /* - * Performance monitoring pending work vector: + * IRQ work vector: */ -#define LOCAL_PENDING_VECTOR 0xec +#define IRQ_WORK_VECTOR 0xec #define UV_BAU_MESSAGE 0xea diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 9d3f485e5dd..7490bf8d145 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -35,6 +35,7 @@ obj-y := process_$(BITS).o signal.o entry_$(BITS).o obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o obj-y += time.o ioport.o ldt.o dumpstack.o obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o +obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-$(CONFIG_X86_VISWS) += visws_quirks.o 
obj-$(CONFIG_X86_32) += probe_roms_32.o obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index e2513f26ba8..fe73c1844a9 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1196,25 +1196,6 @@ static int x86_pmu_handle_irq(struct pt_regs *regs) return handled; } -void smp_perf_pending_interrupt(struct pt_regs *regs) -{ - irq_enter(); - ack_APIC_irq(); - inc_irq_stat(apic_pending_irqs); - perf_event_do_pending(); - irq_exit(); -} - -void set_perf_event_pending(void) -{ -#ifdef CONFIG_X86_LOCAL_APIC - if (!x86_pmu.apic || !x86_pmu_initialized()) - return; - - apic->send_IPI_self(LOCAL_PENDING_VECTOR); -#endif -} - void perf_events_lapic_init(void) { if (!x86_pmu.apic || !x86_pmu_initialized()) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 17be5ec7cbb..c375c79065f 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1023,9 +1023,9 @@ apicinterrupt ERROR_APIC_VECTOR \ apicinterrupt SPURIOUS_APIC_VECTOR \ spurious_interrupt smp_spurious_interrupt -#ifdef CONFIG_PERF_EVENTS -apicinterrupt LOCAL_PENDING_VECTOR \ - perf_pending_interrupt smp_perf_pending_interrupt +#ifdef CONFIG_IRQ_WORK +apicinterrupt IRQ_WORK_VECTOR \ + irq_work_interrupt smp_irq_work_interrupt #endif /* diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 91fd0c70a18..44edb03fc9e 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -67,10 +67,10 @@ static int show_other_interrupts(struct seq_file *p, int prec) for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs); seq_printf(p, " Performance monitoring interrupts\n"); - seq_printf(p, "%*s: ", prec, "PND"); + seq_printf(p, "%*s: ", prec, "IWI"); for_each_online_cpu(j) - seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs); - seq_printf(p, " Performance pending work\n"); + seq_printf(p, "%10u ", irq_stats(j)->apic_irq_work_irqs); + seq_printf(p, " IRQ work interrupts\n"); #endif if (x86_platform_ipi_callback) { seq_printf(p, "%*s: ", prec, "PLT"); @@ -185,7 +185,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu) sum += irq_stats(cpu)->apic_timer_irqs; sum += irq_stats(cpu)->irq_spurious_count; sum += irq_stats(cpu)->apic_perf_irqs; - sum += irq_stats(cpu)->apic_pending_irqs; + sum += irq_stats(cpu)->apic_irq_work_irqs; #endif if (x86_platform_ipi_callback) sum += irq_stats(cpu)->x86_platform_ipis; diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c new file mode 100644 index 00000000000..ca8f703a1e7 --- /dev/null +++ b/arch/x86/kernel/irq_work.c @@ -0,0 +1,30 @@ +/* + * x86 specific code for irq_work + * + * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra + */ + +#include +#include +#include +#include + +void smp_irq_work_interrupt(struct pt_regs *regs) +{ + irq_enter(); + ack_APIC_irq(); + inc_irq_stat(apic_irq_work_irqs); + irq_work_run(); + irq_exit(); +} + +void arch_irq_work_raise(void) +{ +#ifdef CONFIG_X86_LOCAL_APIC + if (!cpu_has_apic) + return; + + apic->send_IPI_self(IRQ_WORK_VECTOR); + apic_wait_icr_idle(); +#endif +} diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 990ae7cfc57..713969b9266 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -224,9 +224,9 @@ static void __init apic_intr_init(void) alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); - /* Performance monitoring interrupts: */ -# ifdef CONFIG_PERF_EVENTS - 
alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt); + /* IRQ work interrupts: */ +# ifdef CONFIG_IRQ_WORK + alloc_intr_gate(IRQ_WORK_VECTOR, irq_work_interrupt); # endif #endif -- cgit v1.2.3-18-g5258 From e82b8e4ea4f3dffe6e7939f90e78da675fcc450e Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Mon, 4 Oct 2010 17:03:20 -0700 Subject: x86: Add IRQ_TIME_ACCOUNTING This patch adds IRQ_TIME_ACCOUNTING option on x86 and runtime enables it when TSC is enabled. This change just enables fine grained irq time accounting, isn't used yet. Following patches use it for different purposes. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Peter Zijlstra LKML-Reference: <1286237003-12406-6-git-send-email-venki@google.com> Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 11 +++++++++++ arch/x86/kernel/tsc.c | 8 ++++++++ 2 files changed, 19 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cea0cd9a316..f4c70c246ff 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -795,6 +795,17 @@ config SCHED_MC making when dealing with multi-core CPU chips at a cost of slightly increased overhead in some places. If unsure say N here. +config IRQ_TIME_ACCOUNTING + bool "Fine granularity task level IRQ time accounting" + default n + ---help--- + Select this option to enable fine granularity task irq time + accounting. This is done by reading a timestamp on each + transitions between softirq and hardirq state, so there can be a + small performance impact. + + If in doubt, say N here. + source "kernel/Kconfig.preempt" config X86_UP_APIC diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 26a863a9c2a..a1c2cd76853 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -104,10 +104,14 @@ int __init notsc_setup(char *str) __setup("notsc", notsc_setup); +static int no_sched_irq_time; + static int __init tsc_setup(char *str) { if (!strcmp(str, "reliable")) tsc_clocksource_reliable = 1; + if (!strncmp(str, "noirqtime", 9)) + no_sched_irq_time = 1; return 1; } @@ -801,6 +805,7 @@ void mark_tsc_unstable(char *reason) if (!tsc_unstable) { tsc_unstable = 1; sched_clock_stable = 0; + disable_sched_clock_irqtime(); printk(KERN_INFO "Marking TSC unstable due to %s\n", reason); /* Change only the rating, when not registered */ if (clocksource_tsc.mult) @@ -987,6 +992,9 @@ void __init tsc_init(void) /* now allow native_sched_clock() to use rdtsc */ tsc_disabled = 0; + if (!no_sched_irq_time) + enable_sched_clock_irqtime(); + lpj = ((u64)tsc_khz * 1000); do_div(lpj, HZ); lpj_fine = lpj; -- cgit v1.2.3-18-g5258 From 9717967c4b704ce344c954afb5bb160aa9c01c34 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 18 Oct 2010 13:47:48 -0700 Subject: x86: ioapic: Call free_irte only if interrupt remapping enabled On a system that support intr-rempping when booting with "intremap=off" [ 177.895501] BUG: unable to handle kernel NULL pointer dereference at 00000000000000f8 [ 177.913316] IP: [] free_irte+0x47/0xc0 ... [ 178.173326] Call Trace: [ 178.173574] [] destroy_irq+0x3a/0x75 [ 178.192934] [] arch_teardown_msi_irq+0xe/0x10 [ 178.193418] [] arch_teardown_msi_irqs+0x56/0x7f [ 178.213021] [] free_msi_irqs+0x8d/0xeb Call free_irte only when interrupt remapping is enabled. 
Signed-off-by: Yinghai Lu LKML-Reference: <4CBCB274.7010108@kernel.org> Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/io_apic.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 20e47e04539..8ae808d110f 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3109,7 +3109,8 @@ void destroy_irq(unsigned int irq) irq_set_status_flags(irq, IRQ_NOREQUEST|IRQ_NOPROBE); - free_irte(irq); + if (intr_remapping_enabled) + free_irte(irq); raw_spin_lock_irqsave(&vector_lock, flags); __clear_irq_vector(irq, cfg); raw_spin_unlock_irqrestore(&vector_lock, flags); -- cgit v1.2.3-18-g5258 From 9581d442b9058d3699b4be568b6e5eae38a41493 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 19 Oct 2010 16:46:55 +0200 Subject: KVM: Fix fs/gs reload oops with invalid ldt kvm reloads the host's fs and gs blindly, however the underlying segment descriptors may be invalid due to the user modifying the ldt after loading them. Fix by using the safe accessors (loadsegment() and load_gs_index()) instead of home grown unsafe versions. This is CVE-2010-3698. KVM-Stable-Tag. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 24 ------------------------ arch/x86/kvm/svm.c | 15 ++++++++++----- arch/x86/kvm/vmx.c | 24 +++++++++--------------- 3 files changed, 19 insertions(+), 44 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 502e53f999c..c52e2eb40a1 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -652,20 +652,6 @@ static inline struct kvm_mmu_page *page_header(hpa_t shadow_page) return (struct kvm_mmu_page *)page_private(page); } -static inline u16 kvm_read_fs(void) -{ - u16 seg; - asm("mov %%fs, %0" : "=g"(seg)); - return seg; -} - -static inline u16 kvm_read_gs(void) -{ - u16 seg; - asm("mov %%gs, %0" : "=g"(seg)); - return seg; -} - static inline u16 kvm_read_ldt(void) { u16 ldt; @@ -673,16 +659,6 @@ static inline u16 kvm_read_ldt(void) return ldt; } -static inline void kvm_load_fs(u16 sel) -{ - asm("mov %0, %%fs" : : "rm"(sel)); -} - -static inline void kvm_load_gs(u16 sel) -{ - asm("mov %0, %%gs" : : "rm"(sel)); -} - static inline void kvm_load_ldt(u16 sel) { asm("lldt %0" : : "rm"(sel)); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 81ed28cb36e..8a3f9f64f86 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3163,8 +3163,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) sync_lapic_to_cr8(vcpu); save_host_msrs(vcpu); - fs_selector = kvm_read_fs(); - gs_selector = kvm_read_gs(); + savesegment(fs, fs_selector); + savesegment(gs, gs_selector); ldt_selector = kvm_read_ldt(); svm->vmcb->save.cr2 = vcpu->arch.cr2; /* required for live migration with NPT */ @@ -3251,10 +3251,15 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; - kvm_load_fs(fs_selector); - kvm_load_gs(gs_selector); - kvm_load_ldt(ldt_selector); load_host_msrs(vcpu); + loadsegment(fs, fs_selector); +#ifdef CONFIG_X86_64 + load_gs_index(gs_selector); + wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs); +#else + loadsegment(gs, gs_selector); +#endif + kvm_load_ldt(ldt_selector); reload_tss(vcpu); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 49b25eee25a..7bddfab1201 100644 --- a/arch/x86/kvm/vmx.c +++ 
b/arch/x86/kvm/vmx.c @@ -803,7 +803,7 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) */ vmx->host_state.ldt_sel = kvm_read_ldt(); vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel; - vmx->host_state.fs_sel = kvm_read_fs(); + savesegment(fs, vmx->host_state.fs_sel); if (!(vmx->host_state.fs_sel & 7)) { vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel); vmx->host_state.fs_reload_needed = 0; @@ -811,7 +811,7 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) vmcs_write16(HOST_FS_SELECTOR, 0); vmx->host_state.fs_reload_needed = 1; } - vmx->host_state.gs_sel = kvm_read_gs(); + savesegment(gs, vmx->host_state.gs_sel); if (!(vmx->host_state.gs_sel & 7)) vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel); else { @@ -841,27 +841,21 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) static void __vmx_load_host_state(struct vcpu_vmx *vmx) { - unsigned long flags; - if (!vmx->host_state.loaded) return; ++vmx->vcpu.stat.host_state_reload; vmx->host_state.loaded = 0; if (vmx->host_state.fs_reload_needed) - kvm_load_fs(vmx->host_state.fs_sel); + loadsegment(fs, vmx->host_state.fs_sel); if (vmx->host_state.gs_ldt_reload_needed) { kvm_load_ldt(vmx->host_state.ldt_sel); - /* - * If we have to reload gs, we must take care to - * preserve our gs base. - */ - local_irq_save(flags); - kvm_load_gs(vmx->host_state.gs_sel); #ifdef CONFIG_X86_64 - wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE)); + load_gs_index(vmx->host_state.gs_sel); + wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs); +#else + loadsegment(gs, vmx->host_state.gs_sel); #endif - local_irq_restore(flags); } reload_tss(); #ifdef CONFIG_X86_64 @@ -2589,8 +2583,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ - vmcs_write16(HOST_FS_SELECTOR, kvm_read_fs()); /* 22.2.4 */ - vmcs_write16(HOST_GS_SELECTOR, kvm_read_gs()); /* 22.2.4 */ + vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */ + vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */ vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ #ifdef CONFIG_X86_64 rdmsrl(MSR_FS_BASE, a); -- cgit v1.2.3-18-g5258 From 44235dcde416104b8e1db7606c283f4c0149c760 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 14 Oct 2010 17:04:59 -0700 Subject: x86, mm: Fix bogus whitespace in sync_global_pgds() Whitespace cleanup only. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: H. 
Peter Anvin --- arch/x86/mm/init_64.c | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 74f0f358b07..1ad7c0ff5d2 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -103,28 +103,28 @@ __setup("noexec32=", nonx32_setup); */ void sync_global_pgds(unsigned long start, unsigned long end) { - unsigned long address; - - for (address = start; address <= end; address += PGDIR_SIZE) { - const pgd_t *pgd_ref = pgd_offset_k(address); - unsigned long flags; - struct page *page; - - if (pgd_none(*pgd_ref)) - continue; - - spin_lock_irqsave(&pgd_lock, flags); - list_for_each_entry(page, &pgd_list, lru) { - pgd_t *pgd; - pgd = (pgd_t *)page_address(page) + pgd_index(address); - if (pgd_none(*pgd)) - set_pgd(pgd, *pgd_ref); - else - BUG_ON(pgd_page_vaddr(*pgd) - != pgd_page_vaddr(*pgd_ref)); - } - spin_unlock_irqrestore(&pgd_lock, flags); - } + unsigned long address; + + for (address = start; address <= end; address += PGDIR_SIZE) { + const pgd_t *pgd_ref = pgd_offset_k(address); + unsigned long flags; + struct page *page; + + if (pgd_none(*pgd_ref)) + continue; + + spin_lock_irqsave(&pgd_lock, flags); + list_for_each_entry(page, &pgd_list, lru) { + pgd_t *pgd; + pgd = (pgd_t *)page_address(page) + pgd_index(address); + if (pgd_none(*pgd)) + set_pgd(pgd, *pgd_ref); + else + BUG_ON(pgd_page_vaddr(*pgd) + != pgd_page_vaddr(*pgd_ref)); + } + spin_unlock_irqrestore(&pgd_lock, flags); + } } /* -- cgit v1.2.3-18-g5258 From 617d34d9e5d8326ec8f188c616aa06ac59d083fe Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 21 Sep 2010 12:01:51 -0700 Subject: x86, mm: Hold mm->page_table_lock while doing vmalloc_sync Take mm->page_table_lock while syncing the vmalloc region. This prevents a race with the Xen pagetable pin/unpin code, which expects that the page_table_lock is already held. If this race occurs, then Xen can see an inconsistent page type (a page can either be read/write or a pagetable page, and pin/unpin converts it between them), which will cause either the pin or the set_p[gm]d to fail; either will crash the kernel. vmalloc_sync_all() should be called rarely, so this extra use of page_table_lock should not interfere with its normal users. The mm pointer is stashed in the pgd page's index field, as that won't be otherwise used for pgds. Reported-by: Ian Campbell Originally-by: Jan Beulich LKML-Reference: <4CB88A4C.1080305@goop.org> Signed-off-by: Jeremy Fitzhardinge Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/pgtable.h | 2 ++ arch/x86/mm/fault.c | 11 ++++++++++- arch/x86/mm/init_64.c | 7 +++++++ arch/x86/mm/pgtable.c | 20 +++++++++++++++++--- 4 files changed, 36 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 2d0a33bd297..ada823a13c7 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -28,6 +28,8 @@ extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; extern spinlock_t pgd_lock; extern struct list_head pgd_list; +extern struct mm_struct *pgd_page_get_mm(struct page *page); + #ifdef CONFIG_PARAVIRT #include #else /* !CONFIG_PARAVIRT */ diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index caec22906d7..6c27c39f8a3 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -229,7 +229,16 @@ void vmalloc_sync_all(void) spin_lock_irqsave(&pgd_lock, flags); list_for_each_entry(page, &pgd_list, lru) { - if (!vmalloc_sync_one(page_address(page), address)) + spinlock_t *pgt_lock; + int ret; + + pgt_lock = &pgd_page_get_mm(page)->page_table_lock; + + spin_lock(pgt_lock); + ret = vmalloc_sync_one(page_address(page), address); + spin_unlock(pgt_lock); + + if (!ret) break; } spin_unlock_irqrestore(&pgd_lock, flags); diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 1ad7c0ff5d2..4d323fb770c 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -116,12 +116,19 @@ void sync_global_pgds(unsigned long start, unsigned long end) spin_lock_irqsave(&pgd_lock, flags); list_for_each_entry(page, &pgd_list, lru) { pgd_t *pgd; + spinlock_t *pgt_lock; + pgd = (pgd_t *)page_address(page) + pgd_index(address); + pgt_lock = &pgd_page_get_mm(page)->page_table_lock; + spin_lock(pgt_lock); + if (pgd_none(*pgd)) set_pgd(pgd, *pgd_ref); else BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); + + spin_unlock(pgt_lock); } spin_unlock_irqrestore(&pgd_lock, flags); } diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 5c4ee422590..c70e57dbb49 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -87,7 +87,19 @@ static inline void pgd_list_del(pgd_t *pgd) #define UNSHARED_PTRS_PER_PGD \ (SHARED_KERNEL_PMD ? 
KERNEL_PGD_BOUNDARY : PTRS_PER_PGD) -static void pgd_ctor(pgd_t *pgd) + +static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm) +{ + BUILD_BUG_ON(sizeof(virt_to_page(pgd)->index) < sizeof(mm)); + virt_to_page(pgd)->index = (pgoff_t)mm; +} + +struct mm_struct *pgd_page_get_mm(struct page *page) +{ + return (struct mm_struct *)page->index; +} + +static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd) { /* If the pgd points to a shared pagetable level (either the ptes in non-PAE, or shared PMD in PAE), then just copy the @@ -105,8 +117,10 @@ static void pgd_ctor(pgd_t *pgd) } /* list required to sync kernel mapping updates */ - if (!SHARED_KERNEL_PMD) + if (!SHARED_KERNEL_PMD) { + pgd_set_mm(pgd, mm); pgd_list_add(pgd); + } } static void pgd_dtor(pgd_t *pgd) @@ -272,7 +286,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm) */ spin_lock_irqsave(&pgd_lock, flags); - pgd_ctor(pgd); + pgd_ctor(mm, pgd); pgd_prepopulate_pmd(mm, pgd, pmds); spin_unlock_irqrestore(&pgd_lock, flags); -- cgit v1.2.3-18-g5258 From 3234282f33b29d349bcada40204fc7c8fda7fe72 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 19 Oct 2010 14:52:26 +0100 Subject: x86, asm: Fix CFI macro invocations to deal with shortcomings in gas gas prior to (perhaps) 2.16.90 has problems with passing non- parenthesized expressions containing spaces to macros. Spaces, however, get inserted by cpp between any macro expanding to a number and a subsequent + or -. For the +, current x86 gas then removes the space again (future gas may not do so), but for the - the space gets retained and is then considered a separator between macro arguments. Fix the respective definitions for both the - and + cases, so that they neither contain spaces nor make cpp insert any (the latter by adding seemingly redundant parentheses). Signed-off-by: Jan Beulich LKML-Reference: <4CBDBEBA020000780001E05A@vpn.id2.novell.com> Cc: Alexander van Heukelum Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/calling.h | 52 ++++++++++++++++++++------------------- arch/x86/include/asm/entry_arch.h | 19 +++----------- arch/x86/include/asm/segment.h | 32 ++++++++++++------------ arch/x86/kernel/asm-offsets_32.c | 4 +-- arch/x86/kernel/entry_32.S | 6 ++--- arch/x86/kernel/entry_64.S | 20 +++------------ 6 files changed, 55 insertions(+), 78 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h index 0e63c9a2a8d..30af5a83216 100644 --- a/arch/x86/include/asm/calling.h +++ b/arch/x86/include/asm/calling.h @@ -48,36 +48,38 @@ For 32-bit we have the following conventions - kernel is built with /* - * 64-bit system call stack frame layout defines and helpers, - * for assembly code: + * 64-bit system call stack frame layout defines and helpers, for + * assembly code (note that the seemingly unnecessary parentheses + * are to prevent cpp from inserting spaces in expressions that get + * passed to macros): */ -#define R15 0 -#define R14 8 -#define R13 16 -#define R12 24 -#define RBP 32 -#define RBX 40 +#define R15 (0) +#define R14 (8) +#define R13 (16) +#define R12 (24) +#define RBP (32) +#define RBX (40) /* arguments: interrupts/non tracing syscalls only save up to here: */ -#define R11 48 -#define R10 56 -#define R9 64 -#define R8 72 -#define RAX 80 -#define RCX 88 -#define RDX 96 -#define RSI 104 -#define RDI 112 -#define ORIG_RAX 120 /* + error_code */ +#define R11 (48) +#define R10 (56) +#define R9 (64) +#define R8 (72) +#define RAX (80) +#define RCX (88) +#define RDX (96) +#define RSI (104) +#define RDI (112) +#define ORIG_RAX (120) /* + error_code */ /* end of arguments */ /* cpu exception frame or undefined in case of fast syscall: */ -#define RIP 128 -#define CS 136 -#define EFLAGS 144 -#define RSP 152 -#define SS 160 +#define RIP (128) +#define CS (136) +#define EFLAGS (144) +#define RSP (152) +#define SS (160) #define ARGOFFSET R11 #define SWFRAME ORIG_RAX @@ -111,7 +113,7 @@ For 32-bit we have the following conventions - kernel is built with .endif .endm -#define ARG_SKIP 9*8 +#define ARG_SKIP (9*8) .macro RESTORE_ARGS skiprax=0, addskip=0, skiprcx=0, skipr11=0, \ skipr8910=0, skiprdx=0 @@ -169,7 +171,7 @@ For 32-bit we have the following conventions - kernel is built with .endif .endm -#define REST_SKIP 6*8 +#define REST_SKIP (6*8) .macro SAVE_REST subq $REST_SKIP, %rsp diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h index 8e8ec663a98..4d2966e7e85 100644 --- a/arch/x86/include/asm/entry_arch.h +++ b/arch/x86/include/asm/entry_arch.h @@ -16,22 +16,11 @@ BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR) BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR) BUILD_INTERRUPT(reboot_interrupt,REBOOT_VECTOR) -BUILD_INTERRUPT3(invalidate_interrupt0,INVALIDATE_TLB_VECTOR_START+0, - smp_invalidate_interrupt) -BUILD_INTERRUPT3(invalidate_interrupt1,INVALIDATE_TLB_VECTOR_START+1, - smp_invalidate_interrupt) -BUILD_INTERRUPT3(invalidate_interrupt2,INVALIDATE_TLB_VECTOR_START+2, - smp_invalidate_interrupt) -BUILD_INTERRUPT3(invalidate_interrupt3,INVALIDATE_TLB_VECTOR_START+3, - smp_invalidate_interrupt) -BUILD_INTERRUPT3(invalidate_interrupt4,INVALIDATE_TLB_VECTOR_START+4, - smp_invalidate_interrupt) -BUILD_INTERRUPT3(invalidate_interrupt5,INVALIDATE_TLB_VECTOR_START+5, - smp_invalidate_interrupt) -BUILD_INTERRUPT3(invalidate_interrupt6,INVALIDATE_TLB_VECTOR_START+6, - smp_invalidate_interrupt) 
-BUILD_INTERRUPT3(invalidate_interrupt7,INVALIDATE_TLB_VECTOR_START+7, +.irpc idx, "01234567" +BUILD_INTERRUPT3(invalidate_interrupt\idx, + (INVALIDATE_TLB_VECTOR_START)+\idx, smp_invalidate_interrupt) +.endr #endif BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR) diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index 14e0ed86a6f..231f1c1d660 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h @@ -73,31 +73,31 @@ #define GDT_ENTRY_DEFAULT_USER_DS 15 -#define GDT_ENTRY_KERNEL_BASE 12 +#define GDT_ENTRY_KERNEL_BASE (12) -#define GDT_ENTRY_KERNEL_CS (GDT_ENTRY_KERNEL_BASE + 0) +#define GDT_ENTRY_KERNEL_CS (GDT_ENTRY_KERNEL_BASE+0) -#define GDT_ENTRY_KERNEL_DS (GDT_ENTRY_KERNEL_BASE + 1) +#define GDT_ENTRY_KERNEL_DS (GDT_ENTRY_KERNEL_BASE+1) -#define GDT_ENTRY_TSS (GDT_ENTRY_KERNEL_BASE + 4) -#define GDT_ENTRY_LDT (GDT_ENTRY_KERNEL_BASE + 5) +#define GDT_ENTRY_TSS (GDT_ENTRY_KERNEL_BASE+4) +#define GDT_ENTRY_LDT (GDT_ENTRY_KERNEL_BASE+5) -#define GDT_ENTRY_PNPBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 6) -#define GDT_ENTRY_APMBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 11) +#define GDT_ENTRY_PNPBIOS_BASE (GDT_ENTRY_KERNEL_BASE+6) +#define GDT_ENTRY_APMBIOS_BASE (GDT_ENTRY_KERNEL_BASE+11) -#define GDT_ENTRY_ESPFIX_SS (GDT_ENTRY_KERNEL_BASE + 14) -#define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS * 8) +#define GDT_ENTRY_ESPFIX_SS (GDT_ENTRY_KERNEL_BASE+14) +#define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS*8) -#define GDT_ENTRY_PERCPU (GDT_ENTRY_KERNEL_BASE + 15) +#define GDT_ENTRY_PERCPU (GDT_ENTRY_KERNEL_BASE+15) #ifdef CONFIG_SMP #define __KERNEL_PERCPU (GDT_ENTRY_PERCPU * 8) #else #define __KERNEL_PERCPU 0 #endif -#define GDT_ENTRY_STACK_CANARY (GDT_ENTRY_KERNEL_BASE + 16) +#define GDT_ENTRY_STACK_CANARY (GDT_ENTRY_KERNEL_BASE+16) #ifdef CONFIG_CC_STACKPROTECTOR -#define __KERNEL_STACK_CANARY (GDT_ENTRY_STACK_CANARY * 8) +#define __KERNEL_STACK_CANARY (GDT_ENTRY_STACK_CANARY*8) #else #define __KERNEL_STACK_CANARY 0 #endif @@ -182,10 +182,10 @@ #endif -#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS * 8) -#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS * 8) -#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS* 8 + 3) -#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS* 8 + 3) +#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS*8) +#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS*8) +#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8+3) +#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8+3) #ifndef CONFIG_PARAVIRT #define get_kernel_rpl() 0 #endif diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index dfdbf640389..1a4088dda37 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -99,9 +99,7 @@ void foo(void) DEFINE(PAGE_SIZE_asm, PAGE_SIZE); DEFINE(PAGE_SHIFT_asm, PAGE_SHIFT); - DEFINE(PTRS_PER_PTE, PTRS_PER_PTE); - DEFINE(PTRS_PER_PMD, PTRS_PER_PMD); - DEFINE(PTRS_PER_PGD, PTRS_PER_PGD); + DEFINE(THREAD_SIZE_asm, THREAD_SIZE); OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx); diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 9fb188d7bc7..f73a4b881aa 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -382,20 +382,20 @@ sysenter_past_esp: * enough kernel state to call TRACE_IRQS_OFF can be called - but * we immediately enable interrupts at that point anyway. 
*/ - pushl_cfi $(__USER_DS) + pushl_cfi $__USER_DS /*CFI_REL_OFFSET ss, 0*/ pushl_cfi %ebp CFI_REL_OFFSET esp, 0 pushfl_cfi orl $X86_EFLAGS_IF, (%esp) - pushl_cfi $(__USER_CS) + pushl_cfi $__USER_CS /*CFI_REL_OFFSET cs, 0*/ /* * Push current_thread_info()->sysenter_return to the stack. * A tiny bit of offset fixup is necessary - 4*4 means the 4 words * pushed above; +8 corresponds to copy_thread's esp0 setting. */ - pushl_cfi (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp) + pushl_cfi TI_sysenter_return-THREAD_SIZE_asm+8+4*4(%esp) CFI_REL_OFFSET eip, 0 pushl_cfi %eax
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 8851a2bb8c0..9cc9a71f8fb 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -963,22 +963,10 @@ apicinterrupt X86_PLATFORM_IPI_VECTOR \ x86_platform_ipi smp_x86_platform_ipi #ifdef CONFIG_SMP -apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \ - invalidate_interrupt0 smp_invalidate_interrupt -apicinterrupt INVALIDATE_TLB_VECTOR_START+1 \ - invalidate_interrupt1 smp_invalidate_interrupt -apicinterrupt INVALIDATE_TLB_VECTOR_START+2 \ - invalidate_interrupt2 smp_invalidate_interrupt -apicinterrupt INVALIDATE_TLB_VECTOR_START+3 \ - invalidate_interrupt3 smp_invalidate_interrupt -apicinterrupt INVALIDATE_TLB_VECTOR_START+4 \ - invalidate_interrupt4 smp_invalidate_interrupt -apicinterrupt INVALIDATE_TLB_VECTOR_START+5 \ - invalidate_interrupt5 smp_invalidate_interrupt -apicinterrupt INVALIDATE_TLB_VECTOR_START+6 \ - invalidate_interrupt6 smp_invalidate_interrupt -apicinterrupt INVALIDATE_TLB_VECTOR_START+7 \ - invalidate_interrupt7 smp_invalidate_interrupt +.irpc idx, "01234567" +apicinterrupt (INVALIDATE_TLB_VECTOR_START)+\idx \ + invalidate_interrupt\idx smp_invalidate_interrupt +.endr #endif apicinterrupt THRESHOLD_APIC_VECTOR \ -- cgit v1.2.3-18-g5258
From a68c439b1966c91f0ef474e2bf275d6792312726 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 6 Oct 2010 12:27:53 +0200 Subject: apic, x86: Check if EILVT APIC registers are available (AMD only) This patch implements checks for the availability of LVT entries (APIC500-530) and reserves them if used. The check becomes necessary since we want to let the BIOS provide the LVT offsets. The offsets should be determined by the subsystems using them, such as those for the MCE threshold or IBS. On K8 only offset 0 (APIC500) and MCE interrupts are supported. Beginning with family 10h at least 4 offsets are available. Since offsets must be consistent for all cores, we keep track of the LVT offsets in software and reserve the offset for the same vector also to be used on other cores. An offset is freed by setting the entry to APIC_EILVT_MASKED. If the BIOS is right, there should be no conflicts. Otherwise a "[Firmware Bug]: ..." error message is generated. However, if software does not properly determine the offsets, it is not necessarily a BIOS bug. 
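For illustration, the intended calling convention (as exported by the follow-up patch further down) lets a subsystem claim the BIOS-provided offset and bail out gracefully on conflicts; this is only a sketch, and my_read_bios_lvt_offset() plus MY_VECTOR are placeholders rather than real kernel symbols:

	int offset = my_read_bios_lvt_offset();	/* e.g. read from an MSR or PCI config space */

	if (offset < 0 ||
	    setup_APIC_eilvt(offset, MY_VECTOR, APIC_EILVT_MSG_FIX, 0)) {
		/*
		 * Offset invalid, or already reserved for a different vector
		 * on another core; a "[Firmware Bug]" message has been logged.
		 */
		return -EINVAL;
	}
	/* The offset is now reserved for this vector consistently on all cores. */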
Signed-off-by: Robert Richter LKML-Reference: <1286360874-1471-2-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/apicdef.h | 1 + arch/x86/kernel/apic/apic.c | 83 +++++++++++++++++++++++++++++++++++++----- 2 files changed, 75 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h index 7fe3b3060f0..a859ca461fb 100644 --- a/arch/x86/include/asm/apicdef.h +++ b/arch/x86/include/asm/apicdef.h @@ -131,6 +131,7 @@ #define APIC_EILVTn(n) (0x500 + 0x10 * n) #define APIC_EILVT_NR_AMD_K8 1 /* # of extended interrupts */ #define APIC_EILVT_NR_AMD_10H 4 +#define APIC_EILVT_NR_MAX APIC_EILVT_NR_AMD_10H #define APIC_EILVT_LVTOFF(x) (((x) >> 4) & 0xF) #define APIC_EILVT_MSG_FIX 0x0 #define APIC_EILVT_MSG_SMI 0x2 diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 8cf86fb3b4e..2bfeafd24f5 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -52,6 +52,7 @@ #include #include #include +#include unsigned int num_processors; @@ -370,24 +371,88 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) } /* - * Setup extended LVT, AMD specific (K8, family 10h) + * Setup extended LVT, AMD specific * - * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and - * MCE interrupts are supported. Thus MCE offset must be set to 0. + * Software should use the LVT offsets the BIOS provides. The offsets + * are determined by the subsystems using it like those for MCE + * threshold or IBS. On K8 only offset 0 (APIC500) and MCE interrupts + * are supported. Beginning with family 10h at least 4 offsets are + * available. * - * If mask=1, the LVT entry does not generate interrupts while mask=0 - * enables the vector. See also the BKDGs. + * Since the offsets must be consistent for all cores, we keep track + * of the LVT offsets in software and reserve the offset for the same + * vector also to be used on other cores. An offset is freed by + * setting the entry to APIC_EILVT_MASKED. + * + * If the BIOS is right, there should be no conflicts. Otherwise a + * "[Firmware Bug]: ..." error message is generated. However, if + * software does not properly determines the offsets, it is not + * necessarily a BIOS bug. */ #define APIC_EILVT_LVTOFF_MCE 0 #define APIC_EILVT_LVTOFF_IBS 1 -static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask) +static atomic_t eilvt_offsets[APIC_EILVT_NR_MAX]; + +static inline int eilvt_entry_is_changeable(unsigned int old, unsigned int new) +{ + return (old & APIC_EILVT_MASKED) + || (new == APIC_EILVT_MASKED) + || ((new & ~APIC_EILVT_MASKED) == old); +} + +static unsigned int reserve_eilvt_offset(int offset, unsigned int new) +{ + unsigned int rsvd; /* 0: uninitialized */ + + if (offset >= APIC_EILVT_NR_MAX) + return ~0; + + rsvd = atomic_read(&eilvt_offsets[offset]) & ~APIC_EILVT_MASKED; + do { + if (rsvd && + !eilvt_entry_is_changeable(rsvd, new)) + /* may not change if vectors are different */ + return rsvd; + rsvd = atomic_cmpxchg(&eilvt_offsets[offset], rsvd, new); + } while (rsvd != new); + + return new; +} + +/* + * If mask=1, the LVT entry does not generate interrupts while mask=0 + * enables the vector. See also the BKDGs. 
+ */ + +static int setup_APIC_eilvt(u8 offset, u8 vector, u8 msg_type, u8 mask) { - unsigned long reg = (lvt_off << 4) + APIC_EILVTn(0); - unsigned int v = (mask << 16) | (msg_type << 8) | vector; + unsigned long reg = APIC_EILVTn(offset); + unsigned int new, old, reserved; + + new = (mask << 16) | (msg_type << 8) | vector; + old = apic_read(reg); + reserved = reserve_eilvt_offset(offset, new); - apic_write(reg, v); + if (reserved != new) { + pr_err(FW_BUG "cpu %d, try to setup vector 0x%x, but " + "vector 0x%x was already reserved by another core, " + "APIC%lX=0x%x\n", + smp_processor_id(), new, reserved, reg, old); + return -EINVAL; + } + + if (!eilvt_entry_is_changeable(old, new)) { + pr_err(FW_BUG "cpu %d, try to setup vector 0x%x but " + "register already in use, APIC%lX=0x%x\n", + smp_processor_id(), new, reg, old); + return -EBUSY; + } + + apic_write(reg, new); + + return 0; } u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask) -- cgit v1.2.3-18-g5258 From 27afdf2008da0b8878a73e32e4eb12381b84e224 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 6 Oct 2010 12:27:54 +0200 Subject: apic, x86: Use BIOS settings for IBS and MCE threshold interrupt LVT offsets We want the BIOS to setup the EILVT APIC registers. The offsets were hardcoded and BIOS settings were overwritten by the OS. Now, the subsystems for MCE threshold and IBS determine the LVT offset from the registers the BIOS has setup. If the BIOS setup is buggy on a family 10h system, a workaround enables IBS. If the OS determines an invalid register setup, a "[Firmware Bug]: " error message is reported. We need this change also for upcomming cpu families. Signed-off-by: Robert Richter LKML-Reference: <1286360874-1471-3-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/apic.h | 4 +- arch/x86/kernel/apic/apic.c | 19 +---- arch/x86/kernel/cpu/mcheck/mce_amd.c | 27 ++++++- arch/x86/oprofile/op_model_amd.c | 145 ++++++++++++++++++++++++++++++----- 4 files changed, 154 insertions(+), 41 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 1fa03e04ae4..286de34b0ed 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -252,9 +252,7 @@ static inline int apic_is_clustered_box(void) } #endif -extern u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask); -extern u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask); - +extern int setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask); #else /* !CONFIG_X86_LOCAL_APIC */ static inline void lapic_shutdown(void) { } diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 2bfeafd24f5..850657d1b0e 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -390,9 +390,6 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) * necessarily a BIOS bug. */ -#define APIC_EILVT_LVTOFF_MCE 0 -#define APIC_EILVT_LVTOFF_IBS 1 - static atomic_t eilvt_offsets[APIC_EILVT_NR_MAX]; static inline int eilvt_entry_is_changeable(unsigned int old, unsigned int new) @@ -426,7 +423,7 @@ static unsigned int reserve_eilvt_offset(int offset, unsigned int new) * enables the vector. See also the BKDGs. 
*/ -static int setup_APIC_eilvt(u8 offset, u8 vector, u8 msg_type, u8 mask) +int setup_APIC_eilvt(u8 offset, u8 vector, u8 msg_type, u8 mask) { unsigned long reg = APIC_EILVTn(offset); unsigned int new, old, reserved; @@ -454,19 +451,7 @@ static int setup_APIC_eilvt(u8 offset, u8 vector, u8 msg_type, u8 mask) return 0; } - -u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask) -{ - setup_APIC_eilvt(APIC_EILVT_LVTOFF_MCE, vector, msg_type, mask); - return APIC_EILVT_LVTOFF_MCE; -} - -u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask) -{ - setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask); - return APIC_EILVT_LVTOFF_IBS; -} -EXPORT_SYMBOL_GPL(setup_APIC_eilvt_ibs); +EXPORT_SYMBOL_GPL(setup_APIC_eilvt); /* * Program the next event, relative to now diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 39aaee5c1ab..80c482382d5 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -131,7 +131,8 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) u32 low = 0, high = 0, address = 0; unsigned int bank, block; struct thresh_restart tr; - u8 lvt_off; + int lvt_off = -1; + u8 offset; for (bank = 0; bank < NR_BANKS; ++bank) { for (block = 0; block < NR_BLOCKS; ++block) { @@ -162,8 +163,28 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) if (shared_bank[bank] && c->cpu_core_id) break; #endif - lvt_off = setup_APIC_eilvt_mce(THRESHOLD_APIC_VECTOR, - APIC_EILVT_MSG_FIX, 0); + offset = (high & MASK_LVTOFF_HI) >> 20; + if (lvt_off < 0) { + if (setup_APIC_eilvt(offset, + THRESHOLD_APIC_VECTOR, + APIC_EILVT_MSG_FIX, 0)) { + pr_err(FW_BUG "cpu %d, failed to " + "setup threshold interrupt " + "for bank %d, block %d " + "(MSR%08X=0x%x%08x)", + smp_processor_id(), bank, block, + address, high, low); + continue; + } + lvt_off = offset; + } else if (lvt_off != offset) { + pr_err(FW_BUG "cpu %d, invalid threshold " + "interrupt offset %d for bank %d," + "block %d (MSR%08X=0x%x%08x)", + smp_processor_id(), lvt_off, bank, + block, address, high, low); + continue; + } high &= ~MASK_LVTOFF_HI; high |= lvt_off << 20; diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index b67a6b5aa8d..42fb46f8388 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -64,15 +64,22 @@ static u64 ibs_op_ctl; * IBS cpuid feature detection */ -#define IBS_CPUID_FEATURES 0x8000001b +#define IBS_CPUID_FEATURES 0x8000001b /* * Same bit mask as for IBS cpuid feature flags (Fn8000_001B_EAX), but * bit 0 is used to indicate the existence of IBS. 
*/ -#define IBS_CAPS_AVAIL (1LL<<0) -#define IBS_CAPS_RDWROPCNT (1LL<<3) -#define IBS_CAPS_OPCNT (1LL<<4) +#define IBS_CAPS_AVAIL (1U<<0) +#define IBS_CAPS_RDWROPCNT (1U<<3) +#define IBS_CAPS_OPCNT (1U<<4) + +/* + * IBS APIC setup + */ +#define IBSCTL 0x1cc +#define IBSCTL_LVT_OFFSET_VALID (1ULL<<8) +#define IBSCTL_LVT_OFFSET_MASK 0x0F /* * IBS randomization macros @@ -266,6 +273,74 @@ static void op_amd_stop_ibs(void) wrmsrl(MSR_AMD64_IBSOPCTL, 0); } +static inline int eilvt_is_available(int offset) +{ + /* check if we may assign a vector */ + return !setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 1); +} + +static inline int ibs_eilvt_valid(void) +{ + u64 val; + int offset; + + rdmsrl(MSR_AMD64_IBSCTL, val); + if (!(val & IBSCTL_LVT_OFFSET_VALID)) { + pr_err(FW_BUG "cpu %d, invalid IBS " + "interrupt offset %d (MSR%08X=0x%016llx)", + smp_processor_id(), offset, + MSR_AMD64_IBSCTL, val); + return 0; + } + + offset = val & IBSCTL_LVT_OFFSET_MASK; + + if (eilvt_is_available(offset)) + return !0; + + pr_err(FW_BUG "cpu %d, IBS interrupt offset %d " + "not available (MSR%08X=0x%016llx)", + smp_processor_id(), offset, + MSR_AMD64_IBSCTL, val); + + return 0; +} + +static inline int get_ibs_offset(void) +{ + u64 val; + + rdmsrl(MSR_AMD64_IBSCTL, val); + if (!(val & IBSCTL_LVT_OFFSET_VALID)) + return -EINVAL; + + return val & IBSCTL_LVT_OFFSET_MASK; +} + +static void setup_APIC_ibs(void) +{ + int offset; + + offset = get_ibs_offset(); + if (offset < 0) + goto failed; + + if (!setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 0)) + return; +failed: + pr_warn("oprofile: IBS APIC setup failed on cpu #%d\n", + smp_processor_id()); +} + +static void clear_APIC_ibs(void) +{ + int offset; + + offset = get_ibs_offset(); + if (offset >= 0) + setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_FIX, 1); +} + #ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX static void op_mux_switch_ctrl(struct op_x86_model_spec const *model, @@ -376,13 +451,13 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, } if (ibs_caps) - setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_NMI, 0); + setup_APIC_ibs(); } static void op_amd_cpu_shutdown(void) { if (ibs_caps) - setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_FIX, 1); + clear_APIC_ibs(); } static int op_amd_check_ctrs(struct pt_regs * const regs, @@ -445,16 +520,11 @@ static void op_amd_stop(struct op_msrs const * const msrs) op_amd_stop_ibs(); } -static int __init_ibs_nmi(void) +static int setup_ibs_ctl(int ibs_eilvt_off) { -#define IBSCTL_LVTOFFSETVAL (1 << 8) -#define IBSCTL 0x1cc struct pci_dev *cpu_cfg; int nodes; u32 value = 0; - u8 ibs_eilvt_off; - - ibs_eilvt_off = setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_FIX, 1); nodes = 0; cpu_cfg = NULL; @@ -466,21 +536,60 @@ static int __init_ibs_nmi(void) break; ++nodes; pci_write_config_dword(cpu_cfg, IBSCTL, ibs_eilvt_off - | IBSCTL_LVTOFFSETVAL); + | IBSCTL_LVT_OFFSET_VALID); pci_read_config_dword(cpu_cfg, IBSCTL, &value); - if (value != (ibs_eilvt_off | IBSCTL_LVTOFFSETVAL)) { + if (value != (ibs_eilvt_off | IBSCTL_LVT_OFFSET_VALID)) { pci_dev_put(cpu_cfg); printk(KERN_DEBUG "Failed to setup IBS LVT offset, " - "IBSCTL = 0x%08x", value); - return 1; + "IBSCTL = 0x%08x\n", value); + return -EINVAL; } } while (1); if (!nodes) { - printk(KERN_DEBUG "No CPU node configured for IBS"); - return 1; + printk(KERN_DEBUG "No CPU node configured for IBS\n"); + return -ENODEV; + } + + return 0; +} + +static int force_ibs_eilvt_setup(void) +{ + int i; + int ret; + + /* find the next free available EILVT entry */ + for (i = 1; i < 4; i++) { + if 
(!eilvt_is_available(i)) + continue; + ret = setup_ibs_ctl(i); + if (ret) + return ret; + return 0; } + printk(KERN_DEBUG "No EILVT entry available\n"); + + return -EBUSY; +} + +static int __init_ibs_nmi(void) +{ + int ret; + + if (ibs_eilvt_valid()) + return 0; + + ret = force_ibs_eilvt_setup(); + if (ret) + return ret; + + if (!ibs_eilvt_valid()) + return -EFAULT; + + pr_err(FW_BUG "workaround enabled for IBS LVT offset\n"); + return 0; } -- cgit v1.2.3-18-g5258 From f01f7c56a1425b9749a99af821e1de334fb64d7e Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 19 Oct 2010 22:17:37 +0000 Subject: x86, mm: Fix incorrect data type in vmalloc_sync_all() arch/x86/mm/fault.c: In function 'vmalloc_sync_all': arch/x86/mm/fault.c:238: warning: assignment makes integer from pointer without a cast introduced by 617d34d9e5d8326ec8f188c616aa06ac59d083fe. Signed-off-by: Borislav Petkov LKML-Reference: <20101020103642.GA3135@kryptos.osrc.amd.com> Signed-off-by: H. Peter Anvin --- arch/x86/mm/fault.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 6c27c39f8a3..0cdb8d493f6 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -230,7 +230,7 @@ void vmalloc_sync_all(void) spin_lock_irqsave(&pgd_lock, flags); list_for_each_entry(page, &pgd_list, lru) { spinlock_t *pgt_lock; - int ret; + pmd_t *ret; pgt_lock = &pgd_page_get_mm(page)->page_table_lock; -- cgit v1.2.3-18-g5258 From b40827fa7268fda8a62490728a61c2856f33830b Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sat, 28 Aug 2010 15:58:33 +0200 Subject: x86-32, mm: Add an initial page table for core bootstrapping This patch adds an initial page table with low mappings used exclusively for booting APs/resuming after ACPI suspend/machine restart. After this, there's no need to add low mappings to swapper_pg_dir and zap them later or create own swsusp PGD page solely for ACPI sleep needs - we have initial_page_table for that. Signed-off-by: Borislav Petkov LKML-Reference: <20101020070526.GA9588@liondog.tnic> Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/pgtable_32.h | 2 +- arch/x86/include/asm/tlbflush.h | 2 -- arch/x86/include/asm/trampoline.h | 3 --- arch/x86/kernel/acpi/sleep.c | 7 ++++- arch/x86/kernel/head32.c | 1 + arch/x86/kernel/head_32.S | 55 ++++++++++++++++++--------------------- arch/x86/kernel/reboot.c | 10 ++----- arch/x86/kernel/setup.c | 18 ++++++++++++- arch/x86/kernel/smpboot.c | 16 +++--------- arch/x86/kernel/trampoline.c | 16 ------------ arch/x86/mm/init_32.c | 45 -------------------------------- 11 files changed, 56 insertions(+), 119 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h index f686f49e8b7..8abde9ec90b 100644 --- a/arch/x86/include/asm/pgtable_32.h +++ b/arch/x86/include/asm/pgtable_32.h @@ -26,7 +26,7 @@ struct mm_struct; struct vm_area_struct; extern pgd_t swapper_pg_dir[1024]; -extern pgd_t trampoline_pg_dir[1024]; +extern pgd_t initial_page_table[1024]; static inline void pgtable_cache_init(void) { } static inline void check_pgt_cache(void) { } diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 7f3eba08e7d..169be8938b9 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -172,6 +172,4 @@ static inline void flush_tlb_kernel_range(unsigned long start, flush_tlb_all(); } -extern void zap_low_mappings(bool early); - #endif /* _ASM_X86_TLBFLUSH_H */ diff --git a/arch/x86/include/asm/trampoline.h b/arch/x86/include/asm/trampoline.h index 4dde797c057..f4500fb3b48 100644 --- a/arch/x86/include/asm/trampoline.h +++ b/arch/x86/include/asm/trampoline.h @@ -13,16 +13,13 @@ extern unsigned char *trampoline_base; extern unsigned long init_rsp; extern unsigned long initial_code; -extern unsigned long initial_page_table; extern unsigned long initial_gs; #define TRAMPOLINE_SIZE roundup(trampoline_end - trampoline_data, PAGE_SIZE) extern unsigned long setup_trampoline(void); -extern void __init setup_trampoline_page_table(void); extern void __init reserve_trampoline_memory(void); #else -static inline void setup_trampoline_page_table(void) {} static inline void reserve_trampoline_memory(void) {} #endif /* CONFIG_X86_TRAMPOLINE */ diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 33cec152070..b35e1ab8ba0 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -12,6 +12,11 @@ #include #include +#ifdef CONFIG_X86_32 +#include +#include +#endif + #include "realmode/wakeup.h" #include "sleep.h" @@ -90,7 +95,7 @@ int acpi_save_state_mem(void) #ifndef CONFIG_64BIT header->pmode_entry = (u32)&wakeup_pmode_return; - header->pmode_cr3 = (u32)(swsusp_pg_dir - __PAGE_OFFSET); + header->pmode_cr3 = (u32)__pa(&initial_page_table); saved_magic = 0x12345678; #else /* CONFIG_64BIT */ header->trampoline_segment = setup_trampoline() >> 4; diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index 784360c0625..8b9c2019fc0 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -17,6 +17,7 @@ #include #include #include +#include static void __init i386_default_early_setup(void) { diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index fa8c1b8e09f..bcece91dd31 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -183,13 +183,12 @@ default_entry: #ifdef CONFIG_X86_PAE /* - * In PAE mode swapper_pg_dir is statically defined to contain enough - * entries to cover the VMSPLIT option (that is the top 1, 2 or 3 - * entries). 
The identity mapping is handled by pointing two PGD - * entries to the first kernel PMD. + * In PAE mode initial_page_table is statically defined to contain + * enough entries to cover the VMSPLIT option (that is the top 1, 2 or 3 + * entries). The identity mapping is handled by pointing two PGD entries + * to the first kernel PMD. * - * Note the upper half of each PMD or PTE are always zero at - * this stage. + * Note the upper half of each PMD or PTE are always zero at this stage. */ #define KPMDS (((-__PAGE_OFFSET) >> 30) & 3) /* Number of kernel PMDs */ @@ -197,7 +196,7 @@ default_entry: xorl %ebx,%ebx /* %ebx is kept at zero */ movl $pa(__brk_base), %edi - movl $pa(swapper_pg_pmd), %edx + movl $pa(initial_pg_pmd), %edx movl $PTE_IDENT_ATTR, %eax 10: leal PDE_IDENT_ATTR(%edi),%ecx /* Create PMD entry */ @@ -226,14 +225,14 @@ default_entry: movl %eax, pa(max_pfn_mapped) /* Do early initialization of the fixmap area */ - movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR,%eax - movl %eax,pa(swapper_pg_pmd+0x1000*KPMDS-8) + movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax + movl %eax,pa(initial_pg_pmd+0x1000*KPMDS-8) #else /* Not PAE */ page_pde_offset = (__PAGE_OFFSET >> 20); movl $pa(__brk_base), %edi - movl $pa(swapper_pg_dir), %edx + movl $pa(initial_page_table), %edx movl $PTE_IDENT_ATTR, %eax 10: leal PDE_IDENT_ATTR(%edi),%ecx /* Create PDE entry */ @@ -257,8 +256,8 @@ page_pde_offset = (__PAGE_OFFSET >> 20); movl %eax, pa(max_pfn_mapped) /* Do early initialization of the fixmap area */ - movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR,%eax - movl %eax,pa(swapper_pg_dir+0xffc) + movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax + movl %eax,pa(initial_page_table+0xffc) #endif jmp 3f /* @@ -334,7 +333,7 @@ ENTRY(startup_32_smp) /* * Enable paging */ - movl pa(initial_page_table), %eax + movl $pa(initial_page_table), %eax movl %eax,%cr3 /* set the page table pointer.. */ movl %cr0,%eax orl $X86_CR0_PG,%eax @@ -614,8 +613,6 @@ ignore_int: .align 4 ENTRY(initial_code) .long i386_start_kernel -ENTRY(initial_page_table) - .long pa(swapper_pg_dir) /* * BSS section @@ -623,20 +620,18 @@ ENTRY(initial_page_table) __PAGE_ALIGNED_BSS .align PAGE_SIZE_asm #ifdef CONFIG_X86_PAE -swapper_pg_pmd: +initial_pg_pmd: .fill 1024*KPMDS,4,0 #else -ENTRY(swapper_pg_dir) +ENTRY(initial_page_table) .fill 1024,4,0 #endif -swapper_pg_fixmap: +initial_pg_fixmap: .fill 1024,4,0 -#ifdef CONFIG_X86_TRAMPOLINE -ENTRY(trampoline_pg_dir) - .fill 1024,4,0 -#endif ENTRY(empty_zero_page) .fill 4096,1,0 +ENTRY(swapper_pg_dir) + .fill 1024,4,0 /* * This starts the data section. @@ -645,20 +640,20 @@ ENTRY(empty_zero_page) __PAGE_ALIGNED_DATA /* Page-aligned for the benefit of paravirt? 
*/ .align PAGE_SIZE_asm -ENTRY(swapper_pg_dir) - .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0 /* low identity map */ +ENTRY(initial_page_table) + .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0 /* low identity map */ # if KPMDS == 3 - .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0 - .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x1000),0 - .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x2000),0 + .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0 + .long pa(initial_pg_pmd+PGD_IDENT_ATTR+0x1000),0 + .long pa(initial_pg_pmd+PGD_IDENT_ATTR+0x2000),0 # elif KPMDS == 2 .long 0,0 - .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0 - .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x1000),0 + .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0 + .long pa(initial_pg_pmd+PGD_IDENT_ATTR+0x1000),0 # elif KPMDS == 1 .long 0,0 .long 0,0 - .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0 + .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0 # else # error "Kernel PMDs should be 1, 2 or 3" # endif diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 7a4cf14223b..f7f53dcd3e0 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -371,16 +371,10 @@ void machine_real_restart(const unsigned char *code, int length) CMOS_WRITE(0x00, 0x8f); spin_unlock(&rtc_lock); - /* Remap the kernel at virtual address zero, as well as offset zero - from the kernel segment. This assumes the kernel segment starts at - virtual address PAGE_OFFSET. */ - memcpy(swapper_pg_dir, swapper_pg_dir + KERNEL_PGD_BOUNDARY, - sizeof(swapper_pg_dir [0]) * KERNEL_PGD_PTRS); - /* - * Use `swapper_pg_dir' as our page directory. + * Switch back to the initial page table. */ - load_cr3(swapper_pg_dir); + load_cr3(initial_page_table); /* Write 0x1234 to absolute memory location 0x472. The BIOS reads this on booting to tell it to "Bypass memory test (also warm diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 322b24fbeaf..af6cf2bfcee 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -728,6 +728,17 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_X86_32 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); visws_early_detect(); + + /* + * copy kernel address range established so far and switch + * to the proper swapper page table + */ + clone_pgd_range(swapper_pg_dir + KERNEL_PGD_BOUNDARY, + initial_page_table + KERNEL_PGD_BOUNDARY, + KERNEL_PGD_PTRS); + + load_cr3(swapper_pg_dir); + __flush_tlb_all(); #else printk(KERN_INFO "Command line: %s\n", boot_command_line); #endif @@ -1009,7 +1020,12 @@ void __init setup_arch(char **cmdline_p) paging_init(); x86_init.paging.pagetable_setup_done(swapper_pg_dir); - setup_trampoline_page_table(); +#ifdef CONFIG_X86_32 + /* sync back kernel address range */ + clone_pgd_range(initial_page_table + KERNEL_PGD_BOUNDARY, + swapper_pg_dir + KERNEL_PGD_BOUNDARY, + KERNEL_PGD_PTRS); +#endif tboot_probe(); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 63a1a5596ac..e63bb518585 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -298,22 +298,16 @@ notrace static void __cpuinit start_secondary(void *unused) * fragile that we want to limit the things done here to the * most necessary things. */ + cpu_init(); + preempt_disable(); + smp_callin(); #ifdef CONFIG_X86_32 - /* - * Switch away from the trampoline page-table - * - * Do this before cpu_init() because it needs to access per-cpu - * data which may not be mapped in the trampoline page-table. 
- */ + /* switch away from the initial page table */ load_cr3(swapper_pg_dir); __flush_tlb_all(); #endif - cpu_init(); - preempt_disable(); - smp_callin(); - /* otherwise gcc will move up smp_processor_id before the cpu_init */ barrier(); /* @@ -772,7 +766,6 @@ do_rest: #ifdef CONFIG_X86_32 /* Stack for startup_32 can be just as for start_secondary onwards */ irq_ctx_init(cpu); - initial_page_table = __pa(&trampoline_pg_dir); #else clear_tsk_thread_flag(c_idle.idle, TIF_FORK); initial_gs = per_cpu_offset(cpu); @@ -921,7 +914,6 @@ int __cpuinit native_cpu_up(unsigned int cpu) per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; err = do_boot_cpu(apicid, cpu); - if (err) { pr_debug("do_boot_cpu failed %d\n", err); return -EIO; diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c index e2a59525739..f1488a344cd 100644 --- a/arch/x86/kernel/trampoline.c +++ b/arch/x86/kernel/trampoline.c @@ -38,19 +38,3 @@ unsigned long __trampinit setup_trampoline(void) memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE); return virt_to_phys(trampoline_base); } - -void __init setup_trampoline_page_table(void) -{ -#ifdef CONFIG_X86_32 - /* Copy kernel address range */ - clone_pgd_range(trampoline_pg_dir + KERNEL_PGD_BOUNDARY, - swapper_pg_dir + KERNEL_PGD_BOUNDARY, - KERNEL_PGD_PTRS); - - /* Initialize low mappings */ - clone_pgd_range(trampoline_pg_dir, - swapper_pg_dir + KERNEL_PGD_BOUNDARY, - min_t(unsigned long, KERNEL_PGD_PTRS, - KERNEL_PGD_BOUNDARY)); -#endif -} diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 558f2d33207..1aeac2d9df8 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -548,48 +548,6 @@ static void __init pagetable_init(void) permanent_kmaps_init(pgd_base); } -#ifdef CONFIG_ACPI_SLEEP -/* - * ACPI suspend needs this for resume, because things like the intel-agp - * driver might have split up a kernel 4MB mapping. - */ -char swsusp_pg_dir[PAGE_SIZE] - __attribute__ ((aligned(PAGE_SIZE))); - -static inline void save_pg_dir(void) -{ - copy_page(swsusp_pg_dir, swapper_pg_dir); -} -#else /* !CONFIG_ACPI_SLEEP */ -static inline void save_pg_dir(void) -{ -} -#endif /* !CONFIG_ACPI_SLEEP */ - -void zap_low_mappings(bool early) -{ - int i; - - /* - * Zap initial low-memory mappings. - * - * Note that "pgd_clear()" doesn't do it for - * us, because pgd_clear() is a no-op on i386. - */ - for (i = 0; i < KERNEL_PGD_BOUNDARY; i++) { -#ifdef CONFIG_X86_PAE - set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page))); -#else - set_pgd(swapper_pg_dir+i, __pgd(0)); -#endif - } - - if (early) - __flush_tlb(); - else - flush_tlb_all(); -} - pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP); EXPORT_SYMBOL_GPL(__supported_pte_mask); @@ -958,9 +916,6 @@ void __init mem_init(void) if (boot_cpu_data.wp_works_ok < 0) test_wp_bit(); - - save_pg_dir(); - zap_low_mappings(true); } #ifdef CONFIG_MEMORY_HOTPLUG -- cgit v1.2.3-18-g5258 From 932967202182743c01a2eee4bdfa2c42697bc586 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 20 Oct 2010 11:07:03 +0800 Subject: x86: Spread tlb flush vector between nodes Currently flush tlb vector allocation is based on below equation: sender = smp_processor_id() % 8 This isn't optimal, CPUs from different node can have the same vector, this causes a lot of lock contention. Instead, we can assign the same vectors to CPUs from the same node, while different node has different vectors. This has below advantages: a. if there is lock contention, the lock contention is between CPUs from one node. 
This should be much cheaper than the contention between nodes. b. completely avoid lock contention between nodes. This especially benefits kswapd, which is the biggest user of tlb flush, since kswapd sets its affinity to a specific node. In my test, this could reduce > 20% CPU overhead in an extreme case. The test machine has 4 nodes and each node has 16 CPUs. I then bind each node's kswapd to the first CPU of the node. I run a workload with 4 sequential mmap file read threads. The files are empty sparse files. This workload will trigger a lot of page reclaim and tlb flush. The kswapd binding makes it easy to trigger the extreme tlb flush lock contention, because otherwise kswapd keeps migrating between CPUs of a node and I can't get a stable result. Sure, in a real workload we won't always see such heavy tlb flush lock contention, but it's possible. [ hpa: folded in fix from Eric Dumazet to use this_cpu_read() ] Signed-off-by: Shaohua Li LKML-Reference: <1287544023.4571.8.camel@sli10-conroe.sh.intel.com> Cc: Eric Dumazet Signed-off-by: H. Peter Anvin --- arch/x86/mm/tlb.c | 48 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 47 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index c03f14ab666..49358481c73 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -52,6 +53,8 @@ union smp_flush_state { want false sharing in the per cpu data segment. */ static union smp_flush_state flush_state[NUM_INVALIDATE_TLB_VECTORS]; +static DEFINE_PER_CPU_READ_MOSTLY(int, tlb_vector_offset); + /* * We cannot call mmdrop() because we are in interrupt context, * instead update mm->cpu_vm_mask. @@ -173,7 +176,7 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask, union smp_flush_state *f; /* Caller has disabled preemption */ - sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS; + sender = this_cpu_read(tlb_vector_offset); f = &flush_state[sender]; /* @@ -218,6 +221,47 @@ void native_flush_tlb_others(const struct cpumask *cpumask, flush_tlb_others_ipi(cpumask, mm, va); } +static void __cpuinit calculate_tlb_offset(void) +{ + int cpu, node, nr_node_vecs; + /* + * we are changing tlb_vector_offset for each CPU in runtime, but this + * will not cause inconsistency, as the write is atomic under X86. we + * might see more lock contentions in a short time, but after all CPU's + * tlb_vector_offset are changed, everything should go normal + * + * Note: if NUM_INVALIDATE_TLB_VECTORS % nr_online_nodes !=0, we might + waste some vectors.
+ **/ + if (nr_online_nodes > NUM_INVALIDATE_TLB_VECTORS) + nr_node_vecs = 1; + else + nr_node_vecs = NUM_INVALIDATE_TLB_VECTORS/nr_online_nodes; + + for_each_online_node(node) { + int node_offset = (node % NUM_INVALIDATE_TLB_VECTORS) * + nr_node_vecs; + int cpu_offset = 0; + for_each_cpu(cpu, cpumask_of_node(node)) { + per_cpu(tlb_vector_offset, cpu) = node_offset + + cpu_offset; + cpu_offset++; + cpu_offset = cpu_offset % nr_node_vecs; + } + } +} + +static int tlb_cpuhp_notify(struct notifier_block *n, + unsigned long action, void *hcpu) +{ + switch (action & 0xf) { + case CPU_ONLINE: + case CPU_DEAD: + calculate_tlb_offset(); + } + return NOTIFY_OK; +} + static int __cpuinit init_smp_flush(void) { int i; @@ -225,6 +269,8 @@ static int __cpuinit init_smp_flush(void) for (i = 0; i < ARRAY_SIZE(flush_state); i++) raw_spin_lock_init(&flush_state[i].tlbstate_lock); + calculate_tlb_offset(); + hotcpu_notifier(tlb_cpuhp_notify, 0); return 0; } core_initcall(init_smp_flush); -- cgit v1.2.3-18-g5258 From 66f2b061546974b96b7b238a92ce89a87ecf0754 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Wed, 20 Oct 2010 15:55:35 -0700 Subject: x86, mm: Enable ARCH_DMA_ADDR_T_64BIT with X86_64 || HIGHMEM64G Set CONFIG_ARCH_DMA_ADDR_T_64BIT when we set dma_addr_t to 64 bits in ; this allows Kconfig decisions based on this property. Signed-off-by: FUJITA Tomonori LKML-Reference: <201010202255.o9KMtZXu009370@imap1.linux-foundation.org> Acked-by: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cea0cd9a316..2924f4e7779 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1148,6 +1148,9 @@ config X86_PAE config ARCH_PHYS_ADDR_T_64BIT def_bool X86_64 || X86_PAE +config ARCH_DMA_ADDR_T_64BIT + def_bool X86_64 || HIGHMEM64G + config DIRECT_GBPAGES bool "Enable 1GB pages for kernel pagetables" if EMBEDDED default y -- cgit v1.2.3-18-g5258 From 5bba6c56dc99ff88f79a79572e29ecf445710878 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 21 Oct 2010 09:36:07 -0400 Subject: X86/PCI: Remove the dependency on isapnp_disable. This looks to be vestigial dependency that had never been used even in the original code base (2.6.18) from which this driver was up-ported. Without this fix, with the CONFIG_ISAPNP, we get this compile failure: arch/x86/pci/xen.c: In function 'pci_xen_init': arch/x86/pci/xen.c:138: error: 'isapnp_disable' undeclared (first use in this function) arch/x86/pci/xen.c:138: error: (Each undeclared identifier is reported only once arch/x86/pci/xen.c:138: error: for each function it appears in.) 
Reported-by: Li Zefan Tested-by: Li Zefan Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/pci/xen.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index b19c873d8d0..4e371065ce4 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -133,11 +133,6 @@ int __init pci_xen_init(void) acpi_noirq = 1; #endif -#ifdef CONFIG_ISAPNP - /* Stop isapnp from probing */ - isapnp_disable = 1; -#endif - #ifdef CONFIG_PCI_MSI x86_msi.setup_msi_irqs = xen_setup_msi_irqs; x86_msi.teardown_msi_irq = xen_teardown_msi_irq; -- cgit v1.2.3-18-g5258 From 260586d2b444909380137de6c6423e5b44edf4db Mon Sep 17 00:00:00 2001 From: Daniel Drake Date: Tue, 5 Oct 2010 15:55:21 +0100 Subject: Add OLPC XO-1 rfkill driver Add a software rfkill switch for the WLAN interface in the OLPC XO-1 laptop. It uses the OLPC embedded controller to cut/restore power to the Marvell WLAN chip on the motherboard. Signed-off-by: Daniel Drake Signed-off-by: Matthew Garrett --- arch/x86/include/asm/olpc.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/olpc.h b/arch/x86/include/asm/olpc.h index 101229b0d8e..42a978c0c1b 100644 --- a/arch/x86/include/asm/olpc.h +++ b/arch/x86/include/asm/olpc.h @@ -89,6 +89,8 @@ extern int olpc_ec_mask_unset(uint8_t bits); /* EC commands */ #define EC_FIRMWARE_REV 0x08 +#define EC_WLAN_ENTER_RESET 0x35 +#define EC_WLAN_LEAVE_RESET 0x25 /* SCI source values */ -- cgit v1.2.3-18-g5258 From 76fac077db6b34e2c6383a7b4f3f4f7b7d06d8ce Mon Sep 17 00:00:00 2001 From: Alok Kataria Date: Mon, 11 Oct 2010 14:37:08 -0700 Subject: x86, kexec: Make sure to stop all CPUs before exiting the kernel x86 smp_ops now has a new op, stop_other_cpus, which takes a "wait" parameter; this allows the caller to specify whether it wants to wait until all the cpus have processed the stop IPI. This is required specifically for the kexec case, where we should wait for all the cpus to be stopped before starting the new kernel. We now wait for the cpus to stop in all cases except for panic/kdump, where we expect things to be broken and we are doing our best to make things work anyway. This patch fixes a legitimate regression introduced during 2.6.30 by commit 4ef702c10b5df18ab04921fc252c26421d4d6c75. Signed-off-by: Alok N Kataria LKML-Reference: <1286833028.1372.20.camel@ank32.eng.vmware.com> Cc: Eric W. Biederman Cc: Jeremy Fitzhardinge Cc: v2.6.30-36 Signed-off-by: H.
Peter Anvin --- arch/x86/include/asm/smp.h | 9 +++++++-- arch/x86/kernel/reboot.c | 2 +- arch/x86/kernel/smp.c | 15 +++++++++------ arch/x86/xen/enlighten.c | 2 +- arch/x86/xen/smp.c | 6 +++--- 5 files changed, 21 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 4cfc9082406..4c2f63c7fc1 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -50,7 +50,7 @@ struct smp_ops { void (*smp_prepare_cpus)(unsigned max_cpus); void (*smp_cpus_done)(unsigned max_cpus); - void (*smp_send_stop)(void); + void (*stop_other_cpus)(int wait); void (*smp_send_reschedule)(int cpu); int (*cpu_up)(unsigned cpu); @@ -73,7 +73,12 @@ extern struct smp_ops smp_ops; static inline void smp_send_stop(void) { - smp_ops.smp_send_stop(); + smp_ops.stop_other_cpus(0); +} + +static inline void stop_other_cpus(void) +{ + smp_ops.stop_other_cpus(1); } static inline void smp_prepare_boot_cpu(void) diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index e3af342fe83..76a0d715a03 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -641,7 +641,7 @@ void native_machine_shutdown(void) /* O.K Now that I'm on the appropriate processor, * stop all of the others. */ - smp_send_stop(); + stop_other_cpus(); #endif lapic_shutdown(); diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index d801210945d..513deac7228 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -159,10 +159,10 @@ asmlinkage void smp_reboot_interrupt(void) irq_exit(); } -static void native_smp_send_stop(void) +static void native_stop_other_cpus(int wait) { unsigned long flags; - unsigned long wait; + unsigned long timeout; if (reboot_force) return; @@ -179,9 +179,12 @@ static void native_smp_send_stop(void) if (num_online_cpus() > 1) { apic->send_IPI_allbutself(REBOOT_VECTOR); - /* Don't wait longer than a second */ - wait = USEC_PER_SEC; - while (num_online_cpus() > 1 && wait--) + /* + * Don't wait longer than a second if the caller + * didn't ask us to wait. 
+ */ + timeout = USEC_PER_SEC; + while (num_online_cpus() > 1 && (wait || timeout--)) udelay(1); } @@ -227,7 +230,7 @@ struct smp_ops smp_ops = { .smp_prepare_cpus = native_smp_prepare_cpus, .smp_cpus_done = native_smp_cpus_done, - .smp_send_stop = native_smp_send_stop, + .stop_other_cpus = native_stop_other_cpus, .smp_send_reschedule = native_smp_send_reschedule, .cpu_up = native_cpu_up, diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 7d46c844141..44f80861382 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1018,7 +1018,7 @@ static void xen_reboot(int reason) struct sched_shutdown r = { .reason = reason }; #ifdef CONFIG_SMP - smp_send_stop(); + stop_other_cpus(); #endif if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r)) diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 25f232b18a8..f4d01003146 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -400,9 +400,9 @@ static void stop_self(void *v) BUG(); } -static void xen_smp_send_stop(void) +static void xen_stop_other_cpus(int wait) { - smp_call_function(stop_self, NULL, 0); + smp_call_function(stop_self, NULL, wait); } static void xen_smp_send_reschedule(int cpu) @@ -470,7 +470,7 @@ static const struct smp_ops xen_smp_ops __initdata = { .cpu_disable = xen_cpu_disable, .play_dead = xen_play_dead, - .smp_send_stop = xen_smp_send_stop, + .stop_other_cpus = xen_stop_other_cpus, .smp_send_reschedule = xen_smp_send_reschedule, .send_call_func_ipi = xen_smp_send_call_function_ipi, -- cgit v1.2.3-18-g5258 From 07bd8516a2f967aa67904c68ab97bb896a448b09 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 22 Oct 2010 08:22:35 +0100 Subject: x86, asm: Restore parentheses around one pushl_cfi argument These were (intentionally) stripped by "fix CFI macro invocations to deal with shortcomings in gas" to expose problems with unexpected splitting of arguments by older gas also on newer versions, but as it turns out there is at least one distro (Ubuntu 6.06) where even not having *any* spaces in a macro argument doesn't reliably prevent splitting into multiple arguments. Signed-off-by: Jan Beulich Acked-by: Alexander van Heukelum LKML-Reference: <4CC157DB020000780001E8A2@vpn.id2.novell.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_32.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index f73a4b881aa..59e175e8959 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -395,7 +395,7 @@ sysenter_past_esp: * A tiny bit of offset fixup is necessary - 4*4 means the 4 words * pushed above; +8 corresponds to copy_thread's esp0 setting. */ - pushl_cfi TI_sysenter_return-THREAD_SIZE_asm+8+4*4(%esp) + pushl_cfi (TI_sysenter_return-THREAD_SIZE_asm+8+4*4)(%esp) CFI_REL_OFFSET eip, 0 pushl_cfi %eax -- cgit v1.2.3-18-g5258 From b39f88acd7d989b6b247ba87c480fc24ed71d9c5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 19 Oct 2010 14:08:29 +0200 Subject: perf, x86: Extract PEBS/BTS buffer free routines So that we may grow additional call-sites.. 
Signed-off-by: Peter Zijlstra Acked-by: Stephane Eranian LKML-Reference: <20101019134808.196793164@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_ds.c | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 4977f9c400e..1bc13518dd5 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -74,6 +74,28 @@ static void fini_debug_store_on_cpu(int cpu) wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0); } +static void release_pebs_buffer(int cpu) +{ + struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; + + if (!ds || !x86_pmu.pebs) + return; + + kfree((void *)(unsigned long)ds->pebs_buffer_base); + ds->pebs_buffer_base = 0; +} + +static void release_bts_buffer(int cpu) +{ + struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; + + if (!ds || !x86_pmu.bts) + return; + + kfree((void *)(unsigned long)ds->bts_buffer_base); + ds->bts_buffer_base = 0; +} + static void release_ds_buffers(void) { int cpu; @@ -82,7 +104,6 @@ static void release_ds_buffers(void) return; get_online_cpus(); - for_each_online_cpu(cpu) fini_debug_store_on_cpu(cpu); @@ -92,13 +113,12 @@ static void release_ds_buffers(void) if (!ds) continue; - per_cpu(cpu_hw_events, cpu).ds = NULL; + release_pebs_buffer(cpu); + release_bts_buffer(cpu); - kfree((void *)(unsigned long)ds->pebs_buffer_base); - kfree((void *)(unsigned long)ds->bts_buffer_base); + per_cpu(cpu_hw_events, cpu).ds = NULL; kfree(ds); } - put_online_cpus(); } -- cgit v1.2.3-18-g5258 From 5ee25c87318fa3722026fd77089fa7ba0db8d447 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 19 Oct 2010 14:15:04 +0200 Subject: perf, x86: Extract PEBS/BTS allocation functions Mostly a cleanup.. it reduces code indentation and makes the code flow of reserve_ds_buffers() clearer. 
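[ Editorial sketch, not part of the original patch: the extracted allocators all follow the same shape -- allocate a raw buffer, then derive the record capacity and interrupt threshold from the per-record size. The helper and field names below are invented stand-ins for the real struct debug_store fields, shown only to make that arithmetic explicit. ]

static void ds_region_setup(u64 *base, u64 *index, u64 *abs_max, u64 *thresh,
			    void *buf, size_t bufsz, size_t rec_size,
			    unsigned int thresh_records)
{
	size_t max = bufsz / rec_size;			/* records that fit */

	*base    = (u64)(unsigned long)buf;
	*index   = *base;				/* next record goes here */
	*abs_max = *base + max * rec_size;		/* hard end of the buffer */
	*thresh  = *base + thresh_records * rec_size;	/* PMI trigger point */
}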
Signed-off-by: Peter Zijlstra Acked-by: Stephane Eranian LKML-Reference: <20101019134808.253453452@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_ds.c | 90 +++++++++++++++++++------------ 1 file changed, 56 insertions(+), 34 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 1bc13518dd5..14d98bd5205 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -74,6 +74,32 @@ static void fini_debug_store_on_cpu(int cpu) wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0); } +static int alloc_pebs_buffer(int cpu) +{ + struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; + int max, thresh = 1; /* always use a single PEBS record */ + void *buffer; + + if (!x86_pmu.pebs) + return 0; + + buffer = kzalloc(PEBS_BUFFER_SIZE, GFP_KERNEL); + if (unlikely(!buffer)) + return -ENOMEM; + + max = PEBS_BUFFER_SIZE / x86_pmu.pebs_record_size; + + ds->pebs_buffer_base = (u64)(unsigned long)buffer; + ds->pebs_index = ds->pebs_buffer_base; + ds->pebs_absolute_maximum = ds->pebs_buffer_base + + max * x86_pmu.pebs_record_size; + + ds->pebs_interrupt_threshold = ds->pebs_buffer_base + + thresh * x86_pmu.pebs_record_size; + + return 0; +} + static void release_pebs_buffer(int cpu) { struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; @@ -85,6 +111,32 @@ static void release_pebs_buffer(int cpu) ds->pebs_buffer_base = 0; } +static int alloc_bts_buffer(int cpu) +{ + struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; + int max, thresh; + void *buffer; + + if (!x86_pmu.bts) + return 0; + + buffer = kzalloc(BTS_BUFFER_SIZE, GFP_KERNEL); + if (unlikely(!buffer)) + return -ENOMEM; + + max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE; + thresh = max / 16; + + ds->bts_buffer_base = (u64)(unsigned long)buffer; + ds->bts_index = ds->bts_buffer_base; + ds->bts_absolute_maximum = ds->bts_buffer_base + + max * BTS_RECORD_SIZE; + ds->bts_interrupt_threshold = ds->bts_absolute_maximum - + thresh * BTS_RECORD_SIZE; + + return 0; +} + static void release_bts_buffer(int cpu) { struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; @@ -133,8 +185,6 @@ static int reserve_ds_buffers(void) for_each_possible_cpu(cpu) { struct debug_store *ds; - void *buffer; - int max, thresh; err = -ENOMEM; ds = kzalloc(sizeof(*ds), GFP_KERNEL); @@ -142,39 +192,11 @@ static int reserve_ds_buffers(void) break; per_cpu(cpu_hw_events, cpu).ds = ds; - if (x86_pmu.bts) { - buffer = kzalloc(BTS_BUFFER_SIZE, GFP_KERNEL); - if (unlikely(!buffer)) - break; - - max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE; - thresh = max / 16; - - ds->bts_buffer_base = (u64)(unsigned long)buffer; - ds->bts_index = ds->bts_buffer_base; - ds->bts_absolute_maximum = ds->bts_buffer_base + - max * BTS_RECORD_SIZE; - ds->bts_interrupt_threshold = ds->bts_absolute_maximum - - thresh * BTS_RECORD_SIZE; - } + if (alloc_bts_buffer(cpu)) + break; - if (x86_pmu.pebs) { - buffer = kzalloc(PEBS_BUFFER_SIZE, GFP_KERNEL); - if (unlikely(!buffer)) - break; - - max = PEBS_BUFFER_SIZE / x86_pmu.pebs_record_size; - - ds->pebs_buffer_base = (u64)(unsigned long)buffer; - ds->pebs_index = ds->pebs_buffer_base; - ds->pebs_absolute_maximum = ds->pebs_buffer_base + - max * x86_pmu.pebs_record_size; - /* - * Always use single record PEBS - */ - ds->pebs_interrupt_threshold = ds->pebs_buffer_base + - x86_pmu.pebs_record_size; - } + if (alloc_pebs_buffer(cpu)) + break; err = 0; } -- cgit v1.2.3-18-g5258 From 65af94baca56beb3514d6cfce782634db9cf676d 
Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 19 Oct 2010 14:37:23 +0200 Subject: perf, x86: Extract DS alloc/free functions Again, mostly a cleanup to unclutter the reserve_ds_buffer() code. Signed-off-by: Peter Zijlstra Acked-by: Stephane Eranian LKML-Reference: <20101019134808.304495776@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_ds.c | 40 ++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 14d98bd5205..3c86f4d2f02 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -148,6 +148,30 @@ static void release_bts_buffer(int cpu) ds->bts_buffer_base = 0; } +static int alloc_ds_buffer(int cpu) +{ + struct debug_store *ds; + + ds = kzalloc(sizeof(*ds), GFP_KERNEL); + if (unlikely(!ds)) + return -ENOMEM; + + per_cpu(cpu_hw_events, cpu).ds = ds; + + return 0; +} + +static void release_ds_buffer(int cpu) +{ + struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; + + if (!ds) + return; + + per_cpu(cpu_hw_events, cpu).ds = NULL; + kfree(ds); +} + static void release_ds_buffers(void) { int cpu; @@ -160,16 +184,9 @@ static void release_ds_buffers(void) fini_debug_store_on_cpu(cpu); for_each_possible_cpu(cpu) { - struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; - - if (!ds) - continue; - release_pebs_buffer(cpu); release_bts_buffer(cpu); - - per_cpu(cpu_hw_events, cpu).ds = NULL; - kfree(ds); + release_ds_buffer(cpu); } put_online_cpus(); } @@ -184,13 +201,8 @@ static int reserve_ds_buffers(void) get_online_cpus(); for_each_possible_cpu(cpu) { - struct debug_store *ds; - - err = -ENOMEM; - ds = kzalloc(sizeof(*ds), GFP_KERNEL); - if (unlikely(!ds)) + if (alloc_ds_buffer(cpu)) break; - per_cpu(cpu_hw_events, cpu).ds = ds; if (alloc_bts_buffer(cpu)) break; -- cgit v1.2.3-18-g5258 From 5553be2620ac901c21a25657bd5b59f73254e6d5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 19 Oct 2010 14:38:11 +0200 Subject: perf, x86: Fixup the precise_ip computation In case we don't have PEBS, the LBR fixup doesn't make sense. Signed-off-by: Peter Zijlstra Acked-by: Stephane Eranian LKML-Reference: <20101019134808.354429461@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index fe73c1844a9..f369c53315a 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -497,12 +497,13 @@ static int x86_pmu_hw_config(struct perf_event *event) int precise = 0; /* Support for constant skid */ - if (x86_pmu.pebs) + if (x86_pmu.pebs) { precise++; - /* Support for IP fixup */ - if (x86_pmu.lbr_nr) - precise++; + /* Support for IP fixup */ + if (x86_pmu.lbr_nr) + precise++; + } if (event->attr.precise_ip > precise) return -EOPNOTSUPP; -- cgit v1.2.3-18-g5258 From 6809b6ea73f7291f2e495d40397f1172c9caa77e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 19 Oct 2010 14:22:50 +0200 Subject: perf, x86: Less disastrous PEBS/BTS buffer allocation failure Currently PEBS/BTS buffers are allocated when we instantiate the first event, when this fails everything fails. This is a problem because esp. BTS tries to allocate a rather large buffer (64K), which can easily fail. 
This patch changes the logic such that when either buffer allocation fails, we simply don't allow events that would use these facilities, but continue functioning for all other events. This logic comes from a much larger patch proposed by Stephane. Suggested-by: Stephane Eranian Signed-off-by: Peter Zijlstra Acked-by: Stephane Eranian LKML-Reference: <20101019134808.354429461@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 5 +-- arch/x86/kernel/cpu/perf_event_intel_ds.c | 58 +++++++++++++++++++++++-------- 2 files changed, 47 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index f369c53315a..61e78f65106 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -238,6 +238,7 @@ struct x86_pmu { * Intel DebugStore bits */ int bts, pebs; + int bts_active, pebs_active; int pebs_record_size; void (*drain_pebs)(struct pt_regs *regs); struct event_constraint *pebs_constraints; @@ -478,7 +479,7 @@ static int x86_setup_perfctr(struct perf_event *event) if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) && (hwc->sample_period == 1)) { /* BTS is not supported by this architecture. */ - if (!x86_pmu.bts) + if (!x86_pmu.bts_active) return -EOPNOTSUPP; /* BTS is currently only allowed for user-mode. */ @@ -497,7 +498,7 @@ static int x86_pmu_hw_config(struct perf_event *event) int precise = 0; /* Support for constant skid */ - if (x86_pmu.pebs) { + if (x86_pmu.pebs_active) { precise++; /* Support for IP fixup */ diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 3c86f4d2f02..05c7db68277 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -193,36 +193,66 @@ static void release_ds_buffers(void) static int reserve_ds_buffers(void) { - int cpu, err = 0; + int bts_err = 0, pebs_err = 0; + int cpu; + + x86_pmu.bts_active = 0; + x86_pmu.pebs_active = 0; if (!x86_pmu.bts && !x86_pmu.pebs) return 0; + if (!x86_pmu.bts) + bts_err = 1; + + if (!x86_pmu.pebs) + pebs_err = 1; + get_online_cpus(); for_each_possible_cpu(cpu) { - if (alloc_ds_buffer(cpu)) - break; + if (alloc_ds_buffer(cpu)) { + bts_err = 1; + pebs_err = 1; + } - if (alloc_bts_buffer(cpu)) - break; + if (!bts_err && alloc_bts_buffer(cpu)) + bts_err = 1; + + if (!pebs_err && alloc_pebs_buffer(cpu)) + pebs_err = 1; - if (alloc_pebs_buffer(cpu)) + if (bts_err && pebs_err) break; + } + + if (bts_err) { + for_each_possible_cpu(cpu) + release_bts_buffer(cpu); + } - err = 0; + if (pebs_err) { + for_each_possible_cpu(cpu) + release_pebs_buffer(cpu); } - if (err) - release_ds_buffers(); - else { + if (bts_err && pebs_err) { + for_each_possible_cpu(cpu) + release_ds_buffer(cpu); + } else { + if (x86_pmu.bts && !bts_err) + x86_pmu.bts_active = 1; + + if (x86_pmu.pebs && !pebs_err) + x86_pmu.pebs_active = 1; + for_each_online_cpu(cpu) init_debug_store_on_cpu(cpu); } put_online_cpus(); - return err; + return 0; } /* @@ -287,7 +317,7 @@ static int intel_pmu_drain_bts_buffer(void) if (!event) return 0; - if (!ds) + if (!x86_pmu.bts_active) return 0; at = (struct bts_record *)(unsigned long)ds->bts_buffer_base; @@ -557,7 +587,7 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs) struct pebs_record_core *at, *top; int n; - if (!ds || !x86_pmu.pebs) + if (!x86_pmu.pebs_active) return; at = (struct pebs_record_core *)(unsigned long)ds->pebs_buffer_base; @@ -599,7 +629,7 @@ static void 
intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) u64 status = 0; int bit, n; - if (!ds || !x86_pmu.pebs) + if (!x86_pmu.pebs_active) return; at = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base; -- cgit v1.2.3-18-g5258 From f80c9e304b8e8062230b0cda2c2fdd586149c771 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 19 Oct 2010 14:50:02 +0200 Subject: perf, x86: Clean up reserve_ds_buffers() signature Now that reserve_ds_buffers() never fails, change it to return void and remove all code dealing with the error return. Signed-off-by: Peter Zijlstra Acked-by: Stephane Eranian LKML-Reference: <20101019134808.462621937@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 9 +++------ arch/x86/kernel/cpu/perf_event_intel_ds.c | 9 +++------ 2 files changed, 6 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 61e78f65106..a333bf9189f 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -382,7 +382,7 @@ static void release_pmc_hardware(void) {} #endif -static int reserve_ds_buffers(void); +static void reserve_ds_buffers(void); static void release_ds_buffers(void); static void hw_perf_event_destroy(struct perf_event *event) @@ -546,11 +546,8 @@ static int __x86_pmu_event_init(struct perf_event *event) if (atomic_read(&active_events) == 0) { if (!reserve_pmc_hardware()) err = -EBUSY; - else { - err = reserve_ds_buffers(); - if (err) - release_pmc_hardware(); - } + else + reserve_ds_buffers(); } if (!err) atomic_inc(&active_events); diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 05c7db68277..8a7f81cbd61 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -191,7 +191,7 @@ static void release_ds_buffers(void) put_online_cpus(); } -static int reserve_ds_buffers(void) +static void reserve_ds_buffers(void) { int bts_err = 0, pebs_err = 0; int cpu; @@ -200,7 +200,7 @@ static int reserve_ds_buffers(void) x86_pmu.pebs_active = 0; if (!x86_pmu.bts && !x86_pmu.pebs) - return 0; + return; if (!x86_pmu.bts) bts_err = 1; @@ -251,8 +251,6 @@ static int reserve_ds_buffers(void) } put_online_cpus(); - - return 0; } /* @@ -714,9 +712,8 @@ static void intel_ds_init(void) #else /* CONFIG_CPU_SUP_INTEL */ -static int reserve_ds_buffers(void) +static void reserve_ds_buffers(void) { - return 0; } static void release_ds_buffers(void) -- cgit v1.2.3-18-g5258 From 96681fc3c9e7d1f89ab64e5eec40b6467c97680f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 19 Oct 2010 14:55:33 +0200 Subject: perf, x86: Use NUMA aware allocations for PEBS/BTS/DS allocations For performance reasons its best to use memory node local memory for per-cpu buffers. This logic comes from a much larger patch proposed by Stephane. 
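[ Editorial sketch, not from the patch itself: the node-local allocation pattern in isolation. The helper name is hypothetical; the patch applies the same calls directly inside alloc_pebs_buffer(), alloc_bts_buffer() and alloc_ds_buffer(). ]

/* Allocate a zeroed per-cpu buffer on the CPU's home memory node. */
static void *alloc_cpu_local_buffer(int cpu, size_t size)
{
	int node = cpu_to_node(cpu);	/* memory node this CPU belongs to */

	return kmalloc_node(size, GFP_KERNEL | __GFP_ZERO, node);
}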
Suggested-by: Stephane Eranian Signed-off-by: Peter Zijlstra Acked-by: Stephane Eranian LKML-Reference: <20101019134808.514465326@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_ds.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 8a7f81cbd61..b7dcd9f2b8a 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -77,13 +77,14 @@ static void fini_debug_store_on_cpu(int cpu) static int alloc_pebs_buffer(int cpu) { struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; + int node = cpu_to_node(cpu); int max, thresh = 1; /* always use a single PEBS record */ void *buffer; if (!x86_pmu.pebs) return 0; - buffer = kzalloc(PEBS_BUFFER_SIZE, GFP_KERNEL); + buffer = kmalloc_node(PEBS_BUFFER_SIZE, GFP_KERNEL | __GFP_ZERO, node); if (unlikely(!buffer)) return -ENOMEM; @@ -114,13 +115,14 @@ static void release_pebs_buffer(int cpu) static int alloc_bts_buffer(int cpu) { struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; + int node = cpu_to_node(cpu); int max, thresh; void *buffer; if (!x86_pmu.bts) return 0; - buffer = kzalloc(BTS_BUFFER_SIZE, GFP_KERNEL); + buffer = kmalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_ZERO, node); if (unlikely(!buffer)) return -ENOMEM; @@ -150,9 +152,10 @@ static void release_bts_buffer(int cpu) static int alloc_ds_buffer(int cpu) { + int node = cpu_to_node(cpu); struct debug_store *ds; - ds = kzalloc(sizeof(*ds), GFP_KERNEL); + ds = kmalloc_node(sizeof(*ds), GFP_KERNEL | __GFP_ZERO, node); if (unlikely(!ds)) return -ENOMEM; -- cgit v1.2.3-18-g5258 From b2a33c172890b231444803b0bb38c25ac5a0f274 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sun, 5 Sep 2010 21:00:20 +0200 Subject: [CPUFREQ] arch/x86/kernel/cpu/cpufreq: Fix unsigned return type In each case, the function has an unsigned return type, but returns a negative constant to indicate an error condition. Each function is only called once. For nforce2_detect_chipset, the result is only compared to 0, and for longrun_determine_freqs, the result is stored in a variable of type (signed) int. Thus, for both functions, unsigned can be dropped from the return type. A sematic match that finds this problem is as follows: (http://coccinelle.lip6.fr/) // @exists@ identifier f; constant C; @@ unsigned f(...) { <+... 
* return -C; ...+> } // Signed-off-by: Julia Lawall Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c | 2 +- arch/x86/kernel/cpu/cpufreq/longrun.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c b/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c index 733093d6043..141abebc451 100644 --- a/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c +++ b/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c @@ -393,7 +393,7 @@ static struct cpufreq_driver nforce2_driver = { * Detects nForce2 A2 and C1 stepping * */ -static unsigned int nforce2_detect_chipset(void) +static int nforce2_detect_chipset(void) { nforce2_dev = pci_get_subsys(PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE2, diff --git a/arch/x86/kernel/cpu/cpufreq/longrun.c b/arch/x86/kernel/cpu/cpufreq/longrun.c index fc09f142d94..77945bfbded 100644 --- a/arch/x86/kernel/cpu/cpufreq/longrun.c +++ b/arch/x86/kernel/cpu/cpufreq/longrun.c @@ -165,7 +165,7 @@ static unsigned int longrun_get(unsigned int cpu) * TMTA rules: * performance_pctg = (target_freq - low_freq)/(high_freq - low_freq) */ -static unsigned int __cpuinit longrun_determine_freqs(unsigned int *low_freq, +static int __cpuinit longrun_determine_freqs(unsigned int *low_freq, unsigned int *high_freq) { u32 msr_lo, msr_hi; -- cgit v1.2.3-18-g5258 From a69a0612c4cb7b08570d1b25b542cef478a2d79a Mon Sep 17 00:00:00 2001 From: Rakib Mullick Date: Thu, 21 Oct 2010 17:28:44 +0600 Subject: [CPUFREQ]: x86, cpufreq: Mark longrun_get_policy with __cpuinit. This patch fixes the following warning. The function longrun_cpu_init() is marked with __cpuinit which calls longrun_get_policy() which is a __init function. So make longrun_get_policy with __cpuinit. WARNING: arch/x86/kernel/cpu/cpufreq/longrun.o(.cpuinit.text+0x4c5): Section mismatch in reference from the function longrun_cpu_init() to the function .init.text:longrun_get_policy() The function __cpuinit longrun_cpu_init() references a function __init longrun_get_policy(). If longrun_get_policy is only used by longrun_cpu_init then annotate longrun_get_policy with a matching annotation. Signed-off-by: Rakib Mullick Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/longrun.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/longrun.c b/arch/x86/kernel/cpu/cpufreq/longrun.c index 77945bfbded..d9f51367666 100644 --- a/arch/x86/kernel/cpu/cpufreq/longrun.c +++ b/arch/x86/kernel/cpu/cpufreq/longrun.c @@ -35,7 +35,7 @@ static unsigned int longrun_low_freq, longrun_high_freq; * Reads the current LongRun policy by access to MSR_TMTA_LONGRUN_FLAGS * and MSR_TMTA_LONGRUN_CTRL */ -static void __init longrun_get_policy(struct cpufreq_policy *policy) +static void __cpuinit longrun_get_policy(struct cpufreq_policy *policy) { u32 msr_lo, msr_hi; -- cgit v1.2.3-18-g5258 From 5e941c093989dfb6b67148d2410d79b1be8debfe Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 26 Aug 2010 15:31:36 -0700 Subject: x86: add RESERVE_BRK_ARRAY() helper Useful when converting static arrays into boottime brk allocated objects. 
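[ Editorial sketch, not part of the patch: what a converted array looks like in use. "frob_table" and frob_init() are made-up names for illustration; the real conversions appear in the Xen p2m patches that follow. ]

/*
 * Before: a static array that unconditionally occupies bss, e.g.
 *
 *	static unsigned long frob_table[1024] __page_aligned_bss;
 *
 * After: only brk space is reserved at build time; the memory is handed
 * out by extend_brk() during early boot, and any unused brk space is
 * returned to the free pool.
 */
static RESERVE_BRK_ARRAY(unsigned long, frob_table, 1024);

static void __init frob_init(void)
{
	frob_table = extend_brk(sizeof(*frob_table) * 1024, PAGE_SIZE);
}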
Signed-off-by: Jeremy Fitzhardinge --- arch/x86/include/asm/setup.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index ef292c792d7..d6763b139a8 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -93,6 +93,11 @@ void *extend_brk(size_t size, size_t align); : : "i" (sz)); \ } +/* Helper for reserving space for arrays of things */ +#define RESERVE_BRK_ARRAY(type, name, entries) \ + type *name; \ + RESERVE_BRK(name, sizeof(type) * entries) + #ifdef __i386__ void __init i386_start_kernel(void); -- cgit v1.2.3-18-g5258 From a171ce6e7b4d967b9f9b8ba7c076a8a6d26e432b Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 26 Aug 2010 15:04:48 -0700 Subject: xen: dynamically allocate p2m space Use early brk mechanism to allocate p2m tables, to save memory when booting non-Xen. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/mmu.c | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 42086ac406a..ecbdcf0d45d 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -174,18 +174,16 @@ DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */ #define TOP_ENTRIES (MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE) /* Placeholder for holes in the address space */ -static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] __page_aligned_data = - { [ 0 ... P2M_ENTRIES_PER_PAGE-1 ] = ~0UL }; +static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_ENTRIES_PER_PAGE); /* Array of pointers to pages containing p2m entries */ -static unsigned long *p2m_top[TOP_ENTRIES] __page_aligned_data = - { [ 0 ... TOP_ENTRIES - 1] = &p2m_missing[0] }; +static RESERVE_BRK_ARRAY(unsigned long *, p2m_top, TOP_ENTRIES); /* Arrays of p2m arrays expressed in mfns used for save/restore */ -static unsigned long p2m_top_mfn[TOP_ENTRIES] __page_aligned_bss; +static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, TOP_ENTRIES); -static unsigned long p2m_top_mfn_list[TOP_ENTRIES / P2M_ENTRIES_PER_PAGE] - __page_aligned_bss; +static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn_list, + (TOP_ENTRIES / P2M_ENTRIES_PER_PAGE)); static inline unsigned p2m_top_index(unsigned long pfn) { @@ -209,7 +207,7 @@ void xen_build_mfn_list_list(void) p2m_top_mfn[topidx] = virt_to_mfn(p2m_top[topidx]); } - for (idx = 0; idx < ARRAY_SIZE(p2m_top_mfn_list); idx++) { + for (idx = 0; idx < TOP_ENTRIES/P2M_ENTRIES_PER_PAGE; idx++) { unsigned topidx = idx * P2M_ENTRIES_PER_PAGE; p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]); } @@ -230,6 +228,22 @@ void __init xen_build_dynamic_phys_to_machine(void) unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list; unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages); unsigned pfn; + unsigned i; + + p2m_missing = extend_brk(sizeof(*p2m_missing) * P2M_ENTRIES_PER_PAGE, + PAGE_SIZE); + for (i = 0; i < P2M_ENTRIES_PER_PAGE; i++) + p2m_missing[i] = ~0UL; + + p2m_top = extend_brk(sizeof(*p2m_top) * TOP_ENTRIES, + PAGE_SIZE); + for (i = 0; i < TOP_ENTRIES; i++) + p2m_top[i] = p2m_missing; + + p2m_top_mfn = extend_brk(sizeof(*p2m_top_mfn) * TOP_ENTRIES, PAGE_SIZE); + p2m_top_mfn_list = extend_brk(sizeof(*p2m_top_mfn_list) * + (TOP_ENTRIES / P2M_ENTRIES_PER_PAGE), + PAGE_SIZE); for (pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) { unsigned topidx = p2m_top_index(pfn); -- cgit v1.2.3-18-g5258 From a2e875298729540300a9a0324ee66e3b7883a912 
Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 26 Aug 2010 16:08:31 -0700 Subject: xen: allocate p2m size based on actual max size Allocate p2m tables based on the actual runtime maximum pfn rather than the static config-time limit. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/mmu.c | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index ecbdcf0d45d..151813d9755 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -169,25 +169,27 @@ DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */ */ #define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK) +static unsigned long max_p2m_pfn __read_mostly = MAX_DOMAIN_PAGES; -#define P2M_ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) -#define TOP_ENTRIES (MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE) +#define P2M_ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) +#define TOP_ENTRIES(pages) ((pages) / P2M_ENTRIES_PER_PAGE) +#define MAX_TOP_ENTRIES TOP_ENTRIES(MAX_DOMAIN_PAGES) /* Placeholder for holes in the address space */ static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_ENTRIES_PER_PAGE); /* Array of pointers to pages containing p2m entries */ -static RESERVE_BRK_ARRAY(unsigned long *, p2m_top, TOP_ENTRIES); +static RESERVE_BRK_ARRAY(unsigned long *, p2m_top, MAX_TOP_ENTRIES); /* Arrays of p2m arrays expressed in mfns used for save/restore */ -static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, TOP_ENTRIES); +static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, MAX_TOP_ENTRIES); static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn_list, - (TOP_ENTRIES / P2M_ENTRIES_PER_PAGE)); + (MAX_TOP_ENTRIES / P2M_ENTRIES_PER_PAGE)); static inline unsigned p2m_top_index(unsigned long pfn) { - BUG_ON(pfn >= MAX_DOMAIN_PAGES); + BUG_ON(pfn >= max_p2m_pfn); return pfn / P2M_ENTRIES_PER_PAGE; } @@ -201,13 +203,15 @@ void xen_build_mfn_list_list(void) { unsigned pfn, idx; - for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_ENTRIES_PER_PAGE) { + for (pfn = 0; pfn < max_p2m_pfn; pfn += P2M_ENTRIES_PER_PAGE) { unsigned topidx = p2m_top_index(pfn); p2m_top_mfn[topidx] = virt_to_mfn(p2m_top[topidx]); } - for (idx = 0; idx < TOP_ENTRIES/P2M_ENTRIES_PER_PAGE; idx++) { + for (idx = 0; + idx < TOP_ENTRIES(max_p2m_pfn)/P2M_ENTRIES_PER_PAGE; + idx++) { unsigned topidx = idx * P2M_ENTRIES_PER_PAGE; p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]); } @@ -230,19 +234,22 @@ void __init xen_build_dynamic_phys_to_machine(void) unsigned pfn; unsigned i; + max_p2m_pfn = max_pfn; + p2m_missing = extend_brk(sizeof(*p2m_missing) * P2M_ENTRIES_PER_PAGE, PAGE_SIZE); for (i = 0; i < P2M_ENTRIES_PER_PAGE; i++) p2m_missing[i] = ~0UL; - p2m_top = extend_brk(sizeof(*p2m_top) * TOP_ENTRIES, + p2m_top = extend_brk(sizeof(*p2m_top) * TOP_ENTRIES(max_pfn), PAGE_SIZE); - for (i = 0; i < TOP_ENTRIES; i++) + for (i = 0; i < TOP_ENTRIES(max_pfn); i++) p2m_top[i] = p2m_missing; - p2m_top_mfn = extend_brk(sizeof(*p2m_top_mfn) * TOP_ENTRIES, PAGE_SIZE); + p2m_top_mfn = extend_brk(sizeof(*p2m_top_mfn) * TOP_ENTRIES(max_pfn), + PAGE_SIZE); p2m_top_mfn_list = extend_brk(sizeof(*p2m_top_mfn_list) * - (TOP_ENTRIES / P2M_ENTRIES_PER_PAGE), + (TOP_ENTRIES(max_pfn) / P2M_ENTRIES_PER_PAGE), PAGE_SIZE); for (pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) { @@ -258,7 +265,7 @@ unsigned long get_phys_to_machine(unsigned long pfn) { unsigned topidx, idx; - if (unlikely(pfn >= MAX_DOMAIN_PAGES)) + if (unlikely(pfn >= 
max_p2m_pfn)) return INVALID_P2M_ENTRY; topidx = p2m_top_index(pfn); @@ -304,7 +311,7 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) { unsigned topidx, idx; - if (unlikely(pfn >= MAX_DOMAIN_PAGES)) { + if (unlikely(pfn >= max_p2m_pfn)) { BUG_ON(mfn != INVALID_P2M_ENTRY); return true; } -- cgit v1.2.3-18-g5258 From f0991802bb4368e33848e7f823caa487d23555fb Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 26 Aug 2010 16:16:28 -0700 Subject: xen: use early_brk for level2_kernel_pgt Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/mmu.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 151813d9755..71c6af6c89a 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1843,13 +1843,15 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, return pgd; } #else /* !CONFIG_X86_64 */ -static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss; +static RESERVE_BRK_ARRAY(pmd_t, level2_kernel_pgt, PTRS_PER_PMD); __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) { pmd_t *kernel_pmd; + level2_kernel_pgt = extend_brk(sizeof(pmd_t *) * PTRS_PER_PMD, PAGE_SIZE); + max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) + xen_start_info->nr_pt_frames * PAGE_SIZE + 512*1024); -- cgit v1.2.3-18-g5258 From 764f0138b9f54aa96761810055a74fce1e58c300 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 26 Aug 2010 16:23:51 -0700 Subject: xen: allocate level1_ident_pgt Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/mmu.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 71c6af6c89a..3de42d1e475 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -138,7 +138,8 @@ static inline void check_zero(void) * large enough to allocate page table pages to allocate the rest. * Each page can map 2MB. 
*/ -static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss; +#define LEVEL1_IDENT_ENTRIES (PTRS_PER_PTE * 4) +static RESERVE_BRK_ARRAY(pte_t, level1_ident_pgt, LEVEL1_IDENT_ENTRIES); #ifdef CONFIG_X86_64 /* l3 pud for userspace vsyscall mapping */ @@ -1718,6 +1719,9 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) unsigned ident_pte; unsigned long pfn; + level1_ident_pgt = extend_brk(sizeof(pte_t) * LEVEL1_IDENT_ENTRIES, + PAGE_SIZE); + ident_pte = 0; pfn = 0; for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) { @@ -1728,7 +1732,7 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) pte_page = m2v(pmd[pmdidx].pmd); else { /* Check for free pte pages */ - if (ident_pte == ARRAY_SIZE(level1_ident_pgt)) + if (ident_pte == LEVEL1_IDENT_ENTRIES) break; pte_page = &level1_ident_pgt[ident_pte]; -- cgit v1.2.3-18-g5258 From 1e17fc7eff56d23a835d5d33e71d813aa9eb8ecc Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 3 Sep 2010 15:04:08 -0700 Subject: xen: remove noise about registering vcpu info Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/enlighten.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 7d46c844141..ee304b52d8b 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -135,9 +135,6 @@ static void xen_vcpu_setup(int cpu) info.mfn = arbitrary_virt_to_mfn(vcpup); info.offset = offset_in_page(vcpup); - printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %llx, offset %d\n", - cpu, vcpup, info.mfn, info.offset); - /* Check to see if the hypervisor will put the vcpu_info structure where we want it, which allows direct access via a percpu-variable. */ @@ -151,9 +148,6 @@ static void xen_vcpu_setup(int cpu) /* This cpu is using the registered vcpu info, even if later ones fail to. 
*/ per_cpu(xen_vcpu, cpu) = vcpup; - - printk(KERN_DEBUG "cpu %d using vcpu_info at %p\n", - cpu, vcpup); } } @@ -873,8 +867,6 @@ void xen_setup_vcpu_info_placement(void) /* xen_vcpu_setup managed to place the vcpu_info within the percpu area for all cpus, so make use of it */ if (have_vcpu_info_placement) { - printk(KERN_INFO "Xen: using vcpu_info placement\n"); - pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct); pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct); pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct); -- cgit v1.2.3-18-g5258 From b7eb4ad39134ee5b09634a710e50c2990f533231 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 26 Aug 2010 17:06:58 -0700 Subject: xen: set shared_info->arch.max_pfn to max_p2m_pfn Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 3de42d1e475..909ad637c38 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -224,7 +224,7 @@ void xen_setup_mfn_list_list(void) HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = virt_to_mfn(p2m_top_mfn_list); - HYPERVISOR_shared_info->arch.max_pfn = xen_start_info->nr_pages; + HYPERVISOR_shared_info->arch.max_pfn = max_p2m_mfn; } /* Set up p2m_top to point to the domain-builder provided p2m pages */ -- cgit v1.2.3-18-g5258 From 1f2d9dd309feb08fdbc711fa03841650dfff87d8 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 26 Aug 2010 17:11:35 -0700 Subject: xen: set the actual extent of the mfn_list_list Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 909ad637c38..fcff8c82979 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -224,7 +224,7 @@ void xen_setup_mfn_list_list(void) HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = virt_to_mfn(p2m_top_mfn_list); - HYPERVISOR_shared_info->arch.max_pfn = max_p2m_mfn; + HYPERVISOR_shared_info->arch.max_pfn = max_p2m_pfn; } /* Set up p2m_top to point to the domain-builder provided p2m pages */ -- cgit v1.2.3-18-g5258 From bbbf61eff92c7c236f57ee1953ad84055443717e Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 26 Aug 2010 17:12:17 -0700 Subject: xen: make install_p2mtop_page() static Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/mmu.c | 4 ++-- arch/x86/xen/mmu.h | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index fcff8c82979..00969099b05 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -275,8 +275,8 @@ unsigned long get_phys_to_machine(unsigned long pfn) } EXPORT_SYMBOL_GPL(get_phys_to_machine); -/* install a new p2m_top page */ -bool install_p2mtop_page(unsigned long pfn, unsigned long *p) +/* install a new p2m_top page */ +static bool install_p2mtop_page(unsigned long pfn, unsigned long *p) { unsigned topidx = p2m_top_index(pfn); unsigned long **pfnp, *mfnp; diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h index fa938c4aa2f..537bb9aab77 100644 --- a/arch/x86/xen/mmu.h +++ b/arch/x86/xen/mmu.h @@ -12,7 +12,6 @@ enum pt_level { bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn); -bool install_p2mtop_page(unsigned long pfn, unsigned long *p); void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); -- cgit v1.2.3-18-g5258 From 
58e05027b530ff081ecea68e38de8d59db8f87e0 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 27 Aug 2010 13:28:48 -0700 Subject: xen: convert p2m to a 3 level tree Make the p2m structure a 3 level tree which covers the full possible physical space. The p2m structure contains mappings from the domain's pfns to system-wide mfns. The structure has 3 levels and two roots. The first root is for the domain's own use, and is linked with virtual addresses. The second is all mfn references, and is used by Xen on save/restore to allow it to update the p2m mapping for the domain. At boot, the domain builder provides a simple flat p2m array for all the initially present pages. We construct the two levels above that using the early_brk allocator. After early boot time, set_phys_to_machine() will allocate any missing levels using the normal kernel allocator (at GFP_KERNEL, so it must be called in a normal blocking context). Because the early_brk() API requires us to pre-reserve the maximum amount of memory we could allocate, there is still a CONFIG_XEN_MAX_DOMAIN_MEMORY config option, but its only negative side-effect is to increase the kernel's apparent bss size. However, since all unused brk memory is returned to the heap, there's no real downside to making it large. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/Kconfig | 11 +- arch/x86/xen/mmu.c | 318 +++++++++++++++++++++++++++++++++++++++------------ 2 files changed, 246 insertions(+), 83 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 68128a1b401..90a7f5ad691 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -19,15 +19,12 @@ config XEN_PVHVM depends on X86_LOCAL_APIC config XEN_MAX_DOMAIN_MEMORY - int "Maximum allowed size of a domain in gigabytes" - default 8 if X86_32 - default 32 if X86_64 + int + default 128 depends on XEN help - The pseudo-physical to machine address array is sized - according to the maximum possible memory size of a Xen - domain. This array uses 1 page per gigabyte, so there's no - need to be too stingy here. + This only affects the sizing of some bss arrays, the unused + portions of which are freed. config XEN_SAVE_RESTORE bool diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 00969099b05..d4c7265cf0a 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -170,51 +170,162 @@ DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */ */ #define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK) -static unsigned long max_p2m_pfn __read_mostly = MAX_DOMAIN_PAGES; +/* + * Xen leaves the responsibility for maintaining p2m mappings to the + * guests themselves, but it must also access and update the p2m array + * during suspend/resume when all the pages are reallocated. + * + * The p2m table is logically a flat array, but we implement it as a + * three-level tree to allow the address space to be sparse. + * + * Xen + * | + * p2m_top p2m_top_mfn + * / \ / \ + * p2m_mid p2m_mid p2m_mid_mfn p2m_mid_mfn + * / \ / \ / / + * p2m p2m p2m p2m p2m p2m p2m ... + * + * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the + * maximum representable pseudo-physical address space is: + * P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages + * + * P2M_PER_PAGE depends on the architecture, as a mfn is always + * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to + * 512 and 1024 entries respectively. 
+ */ -#define P2M_ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) -#define TOP_ENTRIES(pages) ((pages) / P2M_ENTRIES_PER_PAGE) -#define MAX_TOP_ENTRIES TOP_ENTRIES(MAX_DOMAIN_PAGES) +static unsigned long max_p2m_pfn __read_mostly; -/* Placeholder for holes in the address space */ -static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_ENTRIES_PER_PAGE); +#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) +#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *)) +#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **)) - /* Array of pointers to pages containing p2m entries */ -static RESERVE_BRK_ARRAY(unsigned long *, p2m_top, MAX_TOP_ENTRIES); +#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE) -/* Arrays of p2m arrays expressed in mfns used for save/restore */ -static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, MAX_TOP_ENTRIES); +/* Placeholders for holes in the address space */ +static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE); +static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE); +static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE); -static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn_list, - (MAX_TOP_ENTRIES / P2M_ENTRIES_PER_PAGE)); +static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE); +static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE); + +RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); +RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); static inline unsigned p2m_top_index(unsigned long pfn) { - BUG_ON(pfn >= max_p2m_pfn); - return pfn / P2M_ENTRIES_PER_PAGE; + BUG_ON(pfn >= MAX_P2M_PFN); + return pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE); +} + +static inline unsigned p2m_mid_index(unsigned long pfn) +{ + return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE; } static inline unsigned p2m_index(unsigned long pfn) { - return pfn % P2M_ENTRIES_PER_PAGE; + return pfn % P2M_PER_PAGE; } -/* Build the parallel p2m_top_mfn structures */ +static void p2m_top_init(unsigned long ***top) +{ + unsigned i; + + for (i = 0; i < P2M_TOP_PER_PAGE; i++) + top[i] = p2m_mid_missing; +} + +static void p2m_top_mfn_init(unsigned long *top) +{ + unsigned i; + + for (i = 0; i < P2M_TOP_PER_PAGE; i++) + top[i] = virt_to_mfn(p2m_mid_missing_mfn); +} + +static void p2m_mid_init(unsigned long **mid) +{ + unsigned i; + + for (i = 0; i < P2M_MID_PER_PAGE; i++) + mid[i] = p2m_missing; +} + +static void p2m_mid_mfn_init(unsigned long *mid) +{ + unsigned i; + + for (i = 0; i < P2M_MID_PER_PAGE; i++) + mid[i] = virt_to_mfn(p2m_missing); +} + +static void p2m_init(unsigned long *p2m) +{ + unsigned i; + + for (i = 0; i < P2M_MID_PER_PAGE; i++) + p2m[i] = INVALID_P2M_ENTRY; +} + +/* + * Build the parallel p2m_top_mfn and p2m_mid_mfn structures + * + * This is called both at boot time, and after resuming from suspend: + * - At boot time we're called very early, and must use extend_brk() + * to allocate memory. + * + * - After resume we're called from within stop_machine, but the mfn + * tree should alreay be completely allocated. 
+ */ void xen_build_mfn_list_list(void) { - unsigned pfn, idx; + unsigned pfn, i; - for (pfn = 0; pfn < max_p2m_pfn; pfn += P2M_ENTRIES_PER_PAGE) { - unsigned topidx = p2m_top_index(pfn); + /* Pre-initialize p2m_top_mfn to be completely missing */ + if (p2m_top_mfn == NULL) { + p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_mid_mfn_init(p2m_mid_missing_mfn); - p2m_top_mfn[topidx] = virt_to_mfn(p2m_top[topidx]); + p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_top_mfn_init(p2m_top_mfn); } - for (idx = 0; - idx < TOP_ENTRIES(max_p2m_pfn)/P2M_ENTRIES_PER_PAGE; - idx++) { - unsigned topidx = idx * P2M_ENTRIES_PER_PAGE; - p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]); + for (pfn = 0; pfn < max_p2m_pfn; pfn += P2M_PER_PAGE) { + unsigned topidx = p2m_top_index(pfn); + unsigned mididx = p2m_mid_index(pfn); + unsigned long **mid; + unsigned long mid_mfn; + unsigned long *mid_mfn_p; + + mid = p2m_top[topidx]; + + /* Don't bother allocating any mfn mid levels if + they're just missing */ + if (mid[mididx] == p2m_missing) + continue; + + mid_mfn = p2m_top_mfn[topidx]; + mid_mfn_p = mfn_to_virt(mid_mfn); + + if (mid_mfn_p == p2m_mid_missing_mfn) { + /* + * XXX boot-time only! We should never find + * missing parts of the mfn tree after + * runtime. extend_brk() will BUG if we call + * it too late. + */ + mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_mid_mfn_init(mid_mfn_p); + + mid_mfn = virt_to_mfn(mid_mfn_p); + + p2m_top_mfn[topidx] = mid_mfn; + } + + mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]); } } @@ -223,7 +334,7 @@ void xen_setup_mfn_list_list(void) BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info); HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = - virt_to_mfn(p2m_top_mfn_list); + virt_to_mfn(p2m_top_mfn); HYPERVISOR_shared_info->arch.max_pfn = max_p2m_pfn; } @@ -233,99 +344,154 @@ void __init xen_build_dynamic_phys_to_machine(void) unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list; unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages); unsigned pfn; - unsigned i; max_p2m_pfn = max_pfn; - p2m_missing = extend_brk(sizeof(*p2m_missing) * P2M_ENTRIES_PER_PAGE, - PAGE_SIZE); - for (i = 0; i < P2M_ENTRIES_PER_PAGE; i++) - p2m_missing[i] = ~0UL; + p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_init(p2m_missing); - p2m_top = extend_brk(sizeof(*p2m_top) * TOP_ENTRIES(max_pfn), - PAGE_SIZE); - for (i = 0; i < TOP_ENTRIES(max_pfn); i++) - p2m_top[i] = p2m_missing; + p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_mid_init(p2m_mid_missing); - p2m_top_mfn = extend_brk(sizeof(*p2m_top_mfn) * TOP_ENTRIES(max_pfn), - PAGE_SIZE); - p2m_top_mfn_list = extend_brk(sizeof(*p2m_top_mfn_list) * - (TOP_ENTRIES(max_pfn) / P2M_ENTRIES_PER_PAGE), - PAGE_SIZE); + p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_top_init(p2m_top); - for (pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) { + /* + * The domain builder gives us a pre-constructed p2m array in + * mfn_list for all the pages initially given to us, so we just + * need to graft that into our tree structure. 
+ */ + for (pfn = 0; pfn < max_pfn; pfn += P2M_PER_PAGE) { unsigned topidx = p2m_top_index(pfn); + unsigned mididx = p2m_mid_index(pfn); + + if (p2m_top[topidx] == p2m_mid_missing) { + unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_mid_init(mid); - p2m_top[topidx] = &mfn_list[pfn]; + p2m_top[topidx] = mid; + } + + p2m_top[topidx][mididx] = &mfn_list[pfn]; } + /* Allocate and initialize top and mid mfn levels */ xen_build_mfn_list_list(); } unsigned long get_phys_to_machine(unsigned long pfn) { - unsigned topidx, idx; + unsigned topidx, mididx, idx; - if (unlikely(pfn >= max_p2m_pfn)) + if (unlikely(pfn >= MAX_P2M_PFN)) return INVALID_P2M_ENTRY; topidx = p2m_top_index(pfn); + mididx = p2m_mid_index(pfn); idx = p2m_index(pfn); - return p2m_top[topidx][idx]; + + return p2m_top[topidx][mididx][idx]; } EXPORT_SYMBOL_GPL(get_phys_to_machine); -/* install a new p2m_top page */ -static bool install_p2mtop_page(unsigned long pfn, unsigned long *p) +static void *alloc_p2m_page(void) { - unsigned topidx = p2m_top_index(pfn); - unsigned long **pfnp, *mfnp; - unsigned i; + return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT); +} - pfnp = &p2m_top[topidx]; - mfnp = &p2m_top_mfn[topidx]; +static void free_p2m_page(void *p) +{ + free_page((unsigned long)p); +} - for (i = 0; i < P2M_ENTRIES_PER_PAGE; i++) - p[i] = INVALID_P2M_ENTRY; +/* + * Fully allocate the p2m structure for a given pfn. We need to check + * that both the top and mid levels are allocated, and make sure the + * parallel mfn tree is kept in sync. We may race with other cpus, so + * the new pages are installed with cmpxchg; if we lose the race then + * simply free the page we allocated and use the one that's there. + */ +static bool alloc_p2m(unsigned long pfn) +{ + unsigned topidx, mididx; + unsigned long ***top_p, **mid; + unsigned long *top_mfn_p, *mid_mfn; - if (cmpxchg(pfnp, p2m_missing, p) == p2m_missing) { - *mfnp = virt_to_mfn(p); - return true; + topidx = p2m_top_index(pfn); + mididx = p2m_mid_index(pfn); + + top_p = &p2m_top[topidx]; + mid = *top_p; + + if (mid == p2m_mid_missing) { + /* Mid level is missing, allocate a new one */ + mid = alloc_p2m_page(); + if (!mid) + return false; + + p2m_mid_init(mid); + + if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing) + free_p2m_page(mid); } - return false; -} + top_mfn_p = &p2m_top_mfn[topidx]; + mid_mfn = mfn_to_virt(*top_mfn_p); -static void alloc_p2m(unsigned long pfn) -{ - unsigned long *p; + if (mid_mfn == p2m_mid_missing_mfn) { + /* Separately check the mid mfn level */ + unsigned long missing_mfn; + unsigned long mid_mfn_mfn; + + mid_mfn = alloc_p2m_page(); + if (!mid_mfn) + return false; + + p2m_mid_mfn_init(mid_mfn); + + missing_mfn = virt_to_mfn(p2m_mid_missing_mfn); + mid_mfn_mfn = virt_to_mfn(mid_mfn); + if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn) + free_p2m_page(mid_mfn); + } - p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL); - BUG_ON(p == NULL); + if (p2m_top[topidx][mididx] == p2m_missing) { + /* p2m leaf page is missing */ + unsigned long *p2m; - if (!install_p2mtop_page(pfn, p)) - free_page((unsigned long)p); + p2m = alloc_p2m_page(); + if (!p2m) + return false; + + p2m_init(p2m); + + if (cmpxchg(&mid[mididx], p2m_missing, p2m) != p2m_missing) + free_p2m_page(p2m); + else + mid_mfn[mididx] = virt_to_mfn(p2m); + } + + return true; } /* Try to install p2m mapping; fail if intermediate bits missing */ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) { - unsigned topidx, idx; + unsigned topidx, mididx, 
idx; - if (unlikely(pfn >= max_p2m_pfn)) { + if (unlikely(pfn >= MAX_P2M_PFN)) { BUG_ON(mfn != INVALID_P2M_ENTRY); return true; } topidx = p2m_top_index(pfn); - if (p2m_top[topidx] == p2m_missing) { - if (mfn == INVALID_P2M_ENTRY) - return true; - return false; - } - + mididx = p2m_mid_index(pfn); idx = p2m_index(pfn); - p2m_top[topidx][idx] = mfn; + + if (p2m_top[topidx][mididx] == p2m_missing) + return mfn == INVALID_P2M_ENTRY; + + p2m_top[topidx][mididx][idx] = mfn; return true; } @@ -338,7 +504,7 @@ void set_phys_to_machine(unsigned long pfn, unsigned long mfn) } if (unlikely(!__set_phys_to_machine(pfn, mfn))) { - alloc_p2m(pfn); + WARN(!alloc_p2m(pfn), "Can't allocate p2m for %lx, %lx", pfn, mfn); if (!__set_phys_to_machine(pfn, mfn)) BUG(); -- cgit v1.2.3-18-g5258 From c3798062f100c3e1d4ae1241bc536f3b1f28a6ca Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 27 Aug 2010 13:42:04 -0700 Subject: xen: add return value to set_phys_to_machine() set_phys_to_machine() can return false on failure, which means a memory allocation failure for the p2m structure. It can only fail if setting the mfn for a pfn in previously unused address space. It is guaranteed to succeed if you're setting a mapping to INVALID_P2M_ENTRY or updating the mfn for an existing pfn. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/include/asm/xen/page.h | 2 +- arch/x86/xen/mmu.c | 13 ++++++++----- 2 files changed, 9 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index bf5f7d32bd0..e40ca6e67bb 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -37,7 +37,7 @@ typedef struct xpaddr { extern unsigned long get_phys_to_machine(unsigned long pfn); -extern void set_phys_to_machine(unsigned long pfn, unsigned long mfn); +extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn); static inline unsigned long pfn_to_mfn(unsigned long pfn) { diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index d4c7265cf0a..b9651343723 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -282,7 +282,7 @@ static void p2m_init(unsigned long *p2m) */ void xen_build_mfn_list_list(void) { - unsigned pfn, i; + unsigned pfn; /* Pre-initialize p2m_top_mfn to be completely missing */ if (p2m_top_mfn == NULL) { @@ -496,19 +496,22 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) return true; } -void set_phys_to_machine(unsigned long pfn, unsigned long mfn) +bool set_phys_to_machine(unsigned long pfn, unsigned long mfn) { if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) { BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY); - return; + return true; } if (unlikely(!__set_phys_to_machine(pfn, mfn))) { - WARN(!alloc_p2m(pfn), "Can't allocate p2m for %lx, %lx", pfn, mfn); + if (!alloc_p2m(pfn)) + return false; if (!__set_phys_to_machine(pfn, mfn)) - BUG(); + return false; } + + return true; } unsigned long arbitrary_virt_to_mfn(void *vaddr) -- cgit v1.2.3-18-g5258 From 33a847502b0338351cebd8fc0c68ac796cfadbbd Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 27 Aug 2010 15:18:19 -0700 Subject: xen: defer building p2m mfn structures until kernel is mapped When building mfn parts of p2m structure, we rely on being able to use mfn_to_virt, which in turn requires kernel to be mapped into the linear area (which is distinct from the kernel image mapping on 64-bit). 
Defer calling xen_build_mfn_list_list() until after xen_setup_kernel_pagetable(); Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/enlighten.c | 3 +++ arch/x86/xen/mmu.c | 3 --- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index ee304b52d8b..d8873014b5e 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1178,6 +1178,9 @@ asmlinkage void __init xen_start_kernel(void) xen_raw_console_write("mapping kernel into physical memory\n"); pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages); + /* Allocate and initialize top and mid mfn levels for p2m structure */ + xen_build_mfn_list_list(); + init_mm.pgd = pgd; /* keep using Xen gdt for now; no urgent need to change it */ diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index b9651343723..9b43bb398d3 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -374,9 +374,6 @@ void __init xen_build_dynamic_phys_to_machine(void) p2m_top[topidx][mididx] = &mfn_list[pfn]; } - - /* Allocate and initialize top and mid mfn levels */ - xen_build_mfn_list_list(); } unsigned long get_phys_to_machine(unsigned long pfn) -- cgit v1.2.3-18-g5258 From cfd8951e082a589637f9de3c33efd3218fdb3c03 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 31 Aug 2010 14:06:22 -0700 Subject: xen: don't map missing memory When setting up a pte for a missing pfn (no matching mfn), just create an empty pte rather than a junk mapping. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/include/asm/xen/page.h | 9 ++++++++- arch/x86/xen/mmu.c | 15 ++++++++++++++- 2 files changed, 22 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index e40ca6e67bb..875f5a08a6c 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -41,10 +41,17 @@ extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn); static inline unsigned long pfn_to_mfn(unsigned long pfn) { + unsigned long mfn; + if (xen_feature(XENFEAT_auto_translated_physmap)) return pfn; - return get_phys_to_machine(pfn) & ~FOREIGN_FRAME_BIT; + mfn = get_phys_to_machine(pfn); + + if (mfn != INVALID_P2M_ENTRY) + mfn &= ~FOREIGN_FRAME_BIT; + + return mfn; } static inline int phys_to_machine_mapping_valid(unsigned long pfn) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 9b43bb398d3..4c63b7f452d 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -745,7 +745,20 @@ static pteval_t pte_pfn_to_mfn(pteval_t val) if (val & _PAGE_PRESENT) { unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; pteval_t flags = val & PTE_FLAGS_MASK; - val = ((pteval_t)pfn_to_mfn(pfn) << PAGE_SHIFT) | flags; + unsigned long mfn = pfn_to_mfn(pfn); + + /* + * If there's no mfn for the pfn, then just create an + * empty non-present pte. Unfortunately this loses + * information about the original pfn, so + * pte_mfn_to_pfn is asymmetric. + */ + if (unlikely(mfn == INVALID_P2M_ENTRY)) { + mfn = 0; + flags = 0; + } + + val = ((pteval_t)mfn << PAGE_SHIFT) | flags; } return val; -- cgit v1.2.3-18-g5258 From 35ae11fd146384d222f3bb1f17eed1970cc92c36 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Fri, 6 Feb 2009 19:09:48 -0800 Subject: xen: Use host-provided E820 map Rather than simply using a flat memory map from Xen, use its provided E820 map. This allows the domain builder to tell the domain to reserve space for more pages than those initially provided at domain-build time. 
It also allows the host to specify holes in the address space (for PCI-passthrough, for example). Signed-off-by: Ian Campbell Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/setup.c | 38 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 328b0030542..dd2eb2a9303 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -19,6 +19,7 @@ #include #include +#include #include #include #include @@ -107,13 +108,46 @@ static unsigned long __init xen_return_unused_memory(unsigned long max_pfn, char * __init xen_memory_setup(void) { + static struct e820entry map[E820MAX] __initdata; + unsigned long max_pfn = xen_start_info->nr_pages; + unsigned long long mem_end; + int rc; + struct xen_memory_map memmap; + int i; max_pfn = min(MAX_DOMAIN_PAGES, max_pfn); + mem_end = PFN_PHYS(max_pfn); + + memmap.nr_entries = E820MAX; + set_xen_guest_handle(memmap.buffer, map); + + rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap); + if (rc == -ENOSYS) { + memmap.nr_entries = 1; + map[0].addr = 0ULL; + map[0].size = mem_end; + /* 8MB slack (to balance backend allocations). */ + map[0].size += 8ULL << 20; + map[0].type = E820_RAM; + rc = 0; + } + BUG_ON(rc); e820.nr_map = 0; - - e820_add_region(0, PFN_PHYS((u64)max_pfn), E820_RAM); + for (i = 0; i < memmap.nr_entries; i++) { + unsigned long long end = map[i].addr + map[i].size; + if (map[i].type == E820_RAM) { + if (map[i].addr > mem_end) + continue; + if (end > mem_end) { + /* Truncate region to max_mem. */ + map[i].size -= end - mem_end; + } + } + if (map[i].size > 0) + e820_add_region(map[i].addr, map[i].size, map[i].type); + } /* * Even though this is normal, usable memory under Xen, reserve -- cgit v1.2.3-18-g5258 From 42ee1471e9b879479a15debac752314a596c738e Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 30 Aug 2010 16:41:02 -0700 Subject: xen: implement "extra" memory to reserve space for pages not present at boot When using the e820 map to get the initial pseudo-physical address space, look for either Xen-provided memory which doesn't lie within an E820 region, or an E820 RAM region which extends beyond the Xen-provided memory range. Count these pages, and add them to a new "extra memory" range. This range has an E820 RAM range to describe it - so the kernel will allocate page structures for it - but it is also marked reserved so that the kernel will not attempt to use it. The balloon driver can then add this range as a set of currently ballooned-out pages, which can be used to extend the domain beyond its original size. 
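The mechanics are small enough to sketch up front (an illustrative outline only; the helper name below is made up, and the real code is xen_add_extra_mem() in the diff that follows):

	/*
	 * Illustrative sketch: describe the extra range as E820_RAM so the
	 * kernel allocates struct pages for it, but reserve it straight away
	 * so nothing touches it until the balloon driver populates it.
	 */
	static void __init sketch_add_extra_mem(unsigned long pages)
	{
		u64 start = xen_extra_mem_start + xen_extra_mem_size;
		u64 size = (u64)pages * PAGE_SIZE;

		if (!pages)
			return;

		e820_add_region(start, size, E820_RAM);          /* visible as RAM */
		reserve_early(start, start + size, "XEN EXTRA"); /* but not usable */

		xen_extra_mem_size += size;
	}

Marking the range as RAM first and reserving it second is what gives us page structures for it without ever letting the allocator hand it out.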
Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/setup.c | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index dd2eb2a9303..f9a99eaddcd 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -34,6 +34,26 @@ extern void xen_sysenter_target(void); extern void xen_syscall_target(void); extern void xen_syscall32_target(void); +/* Amount of extra memory space we add to the e820 ranges */ +phys_addr_t xen_extra_mem_start, xen_extra_mem_size; + +static __init void xen_add_extra_mem(unsigned long pages) +{ + u64 size = (u64)pages * PAGE_SIZE; + + if (!pages) + return; + + e820_add_region(xen_extra_mem_start + xen_extra_mem_size, size, E820_RAM); + sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); + + reserve_early(xen_extra_mem_start + xen_extra_mem_size, + xen_extra_mem_start + xen_extra_mem_size + size, + "XEN EXTRA"); + + xen_extra_mem_size += size; +} + static unsigned long __init xen_release_chunk(phys_addr_t start_addr, phys_addr_t end_addr) { @@ -105,7 +125,6 @@ static unsigned long __init xen_return_unused_memory(unsigned long max_pfn, /** * machine_specific_memory_setup - Hook for machine specific memory setup. **/ - char * __init xen_memory_setup(void) { static struct e820entry map[E820MAX] __initdata; @@ -114,6 +133,7 @@ char * __init xen_memory_setup(void) unsigned long long mem_end; int rc; struct xen_memory_map memmap; + unsigned long extra_pages = 0; int i; max_pfn = min(MAX_DOMAIN_PAGES, max_pfn); @@ -135,6 +155,7 @@ char * __init xen_memory_setup(void) BUG_ON(rc); e820.nr_map = 0; + xen_extra_mem_start = mem_end; for (i = 0; i < memmap.nr_entries; i++) { unsigned long long end = map[i].addr + map[i].size; if (map[i].type == E820_RAM) { @@ -143,6 +164,8 @@ char * __init xen_memory_setup(void) if (end > mem_end) { /* Truncate region to max_mem. */ map[i].size -= end - mem_end; + + extra_pages += PFN_DOWN(end - mem_end); } } if (map[i].size > 0) @@ -169,7 +192,9 @@ char * __init xen_memory_setup(void) sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); - xen_return_unused_memory(xen_start_info->nr_pages, &e820); + extra_pages += xen_return_unused_memory(xen_start_info->nr_pages, &e820); + + xen_add_extra_mem(extra_pages); return "Xen"; } -- cgit v1.2.3-18-g5258 From 36bc251b87f88147e9d8346e4b431f42353c3d38 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 2 Sep 2010 17:07:03 -0700 Subject: xen: make sure xen_extra_mem_start is beyond all non-RAM e820 If Xen gives us non-RAM E820 entries (dom0 only, typically), then make sure the extra RAM region is beyond them. It's OK for the extra space to grow into E820 regions, however. 
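Per memory-map entry the intent is simply this (a minimal sketch of the idea, not the literal hunk below):

	/* Push the start of the extra region past the end of every non-RAM
	 * entry the host reports, so the extra RAM never lands in a hole
	 * reserved for e.g. PCI passthrough.  Overlapping RAM entries is
	 * harmless and deliberately not prevented. */
	if (map[i].type != E820_RAM && end > xen_extra_mem_start)
		xen_extra_mem_start = end;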
Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/setup.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index f9a99eaddcd..eac010008cd 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -167,7 +167,8 @@ char * __init xen_memory_setup(void) extra_pages += PFN_DOWN(end - mem_end); } - } + } else if (map[i].type != E820_RAM) + xen_extra_mem_start = end; if (map[i].size > 0) e820_add_region(map[i].addr, map[i].size, map[i].type); } -- cgit v1.2.3-18-g5258 From b5b43ced7a6e79d30df3232b37dc82c5d8dfa843 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 2 Sep 2010 17:10:12 -0700 Subject: xen: add extra pages for E820 RAM regions, even if beyond mem_end If an entire E820 RAM region is beyond mem_end, still add its pages to the extra area so that space can be used by the kernel. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/setup.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index eac010008cd..1e85e26efa6 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -158,9 +158,8 @@ char * __init xen_memory_setup(void) xen_extra_mem_start = mem_end; for (i = 0; i < memmap.nr_entries; i++) { unsigned long long end = map[i].addr + map[i].size; + if (map[i].type == E820_RAM) { - if (map[i].addr > mem_end) - continue; if (end > mem_end) { /* Truncate region to max_mem. */ map[i].size -= end - mem_end; @@ -169,7 +168,9 @@ char * __init xen_memory_setup(void) } } else if (map[i].type != E820_RAM) xen_extra_mem_start = end; - if (map[i].size > 0) + + if ((map[i].type != E820_RAM || map[i].addr < mem_end) && + map[i].size > 0) e820_add_region(map[i].addr, map[i].size, map[i].type); } -- cgit v1.2.3-18-g5258 From 698bb8d14a5b577b6841acaccdf5095d3b7c7389 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 14 Sep 2010 10:19:14 -0700 Subject: xen: limit extra memory to a certain ratio of base If extra memory is very much larger than the base memory size then all of the base memory can be filled with structures reserved to describe the extra memory, leaving no space for anything else. Even at the maximum ratio there will be little space for anything else, but this change is intended to at least allow the system to boot rather than crash mysteriously. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/setup.c | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 1e85e26efa6..6c9039e92f8 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -37,6 +37,18 @@ extern void xen_syscall32_target(void); /* Amount of extra memory space we add to the e820 ranges */ phys_addr_t xen_extra_mem_start, xen_extra_mem_size; +/* + * The maximum amount of extra memory compared to the base size. The + * main scaling factor is the size of struct page. At extreme ratios + * of base:extra, all the base memory can be filled with page + * structures for the extra memory, leaving no space for anything + * else. + * + * 10x seems like a reasonable balance between scaling flexibility and + * leaving a practically usable system. 
+ */ +#define EXTRA_MEM_RATIO (10) + static __init void xen_add_extra_mem(unsigned long pages) { u64 size = (u64)pages * PAGE_SIZE; @@ -134,6 +146,7 @@ char * __init xen_memory_setup(void) int rc; struct xen_memory_map memmap; unsigned long extra_pages = 0; + unsigned long extra_limit; int i; max_pfn = min(MAX_DOMAIN_PAGES, max_pfn); @@ -196,6 +209,25 @@ char * __init xen_memory_setup(void) extra_pages += xen_return_unused_memory(xen_start_info->nr_pages, &e820); + /* + * Clamp the amount of extra memory to a EXTRA_MEM_RATIO + * factor the base size. On non-highmem systems, the base + * size is the full initial memory allocation; on highmem it + * is limited to the max size of lowmem, so that it doesn't + * get completely filled. + * + * In principle there could be a problem in lowmem systems if + * the initial memory is also very large with respect to + * lowmem, but we won't try to deal with that here. + */ + extra_limit = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)), + max_pfn + extra_pages); + + if (extra_limit >= max_pfn) + extra_pages = extra_limit - max_pfn; + else + extra_pages = 0; + xen_add_extra_mem(extra_pages); return "Xen"; -- cgit v1.2.3-18-g5258 From 2f7acb208523a3bf5f1830f01c29f7feda045169 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 15 Sep 2010 13:32:49 -0700 Subject: xen: make sure xen_max_p2m_pfn is up to date Keep xen_max_p2m_pfn up to date with the end of the extra memory we're adding. It is possible that it will be too high since memory may be truncated by a "mem=" option on the kernel command line, but that won't matter. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/mmu.c | 8 ++++---- arch/x86/xen/setup.c | 2 ++ arch/x86/xen/xen-ops.h | 1 + 3 files changed, 7 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 4c63b7f452d..b2371671b11 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -195,7 +195,7 @@ DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */ * 512 and 1024 entries respectively. 
*/ -static unsigned long max_p2m_pfn __read_mostly; +unsigned long xen_max_p2m_pfn __read_mostly; #define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) #define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *)) @@ -293,7 +293,7 @@ void xen_build_mfn_list_list(void) p2m_top_mfn_init(p2m_top_mfn); } - for (pfn = 0; pfn < max_p2m_pfn; pfn += P2M_PER_PAGE) { + for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) { unsigned topidx = p2m_top_index(pfn); unsigned mididx = p2m_mid_index(pfn); unsigned long **mid; @@ -335,7 +335,7 @@ void xen_setup_mfn_list_list(void) HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = virt_to_mfn(p2m_top_mfn); - HYPERVISOR_shared_info->arch.max_pfn = max_p2m_pfn; + HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn; } /* Set up p2m_top to point to the domain-builder provided p2m pages */ @@ -345,7 +345,7 @@ void __init xen_build_dynamic_phys_to_machine(void) unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages); unsigned pfn; - max_p2m_pfn = max_pfn; + xen_max_p2m_pfn = max_pfn; p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE); p2m_init(p2m_missing); diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 6c9039e92f8..cad2fcd130e 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -64,6 +64,8 @@ static __init void xen_add_extra_mem(unsigned long pages) "XEN EXTRA"); xen_extra_mem_size += size; + + xen_max_p2m_pfn = PFN_DOWN(xen_extra_mem_start + xen_extra_mem_size); } static unsigned long __init xen_release_chunk(phys_addr_t start_addr, diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 7c8ab86163e..d505e98e03f 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -30,6 +30,7 @@ void xen_setup_machphys_mapping(void); pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn); void xen_ident_map_ISA(void); void xen_reserve_top(void); +extern unsigned long xen_max_p2m_pfn; char * __init xen_memory_setup(void); void __init xen_arch_setup(void); -- cgit v1.2.3-18-g5258 From 41f2e4771a4f1ba26c35438daf32917b9ef7858d Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 30 Mar 2010 11:47:40 -0700 Subject: xen: add support for PAT Convert Linux PAT entries into Xen ones when constructing ptes. Linux doesn't use _PAGE_PAT for ptes, so the only difference in the first 4 entries is that Linux uses _PAGE_PWT for WC, whereas Xen (and default) use it for WT. xen_pte_val does the inverse conversion. We hard-code assumptions about Linux's current PAT layout, but a warning on the wrmsr to MSR_IA32_CR_PAT should point out any problems. If necessary we could go to a more general table-based conversion between Linux and Xen PAT entries. hugetlbfs poses a problem at the moment, the x86 architecture uses the same flag for _PAGE_PAT and _PAGE_PSE, which changes meaning depending on which pagetable level we're using. At the moment this should be OK so long as nobody tries to do a pte_val on a hugetlbfs pte. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/enlighten.c | 5 +++++ arch/x86/xen/mmu.c | 53 +++++++++++++++++++++++++++++++++++++++++++++--- arch/x86/xen/xen-ops.h | 2 ++ 3 files changed, 57 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index d8873014b5e..b860e576ed0 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -829,6 +829,11 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) Xen console noise. 
*/ break; + case MSR_IA32_CR_PAT: + if (smp_processor_id() == 0) + xen_set_pat(((u64)high << 32) | low); + break; + default: ret = native_write_msr_safe(msr, low, high); } diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index b2371671b11..67b41017f7b 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -55,6 +55,7 @@ #include #include #include +#include #include #include @@ -780,10 +781,18 @@ static pteval_t iomap_pte(pteval_t val) pteval_t xen_pte_val(pte_t pte) { - if (xen_initial_domain() && (pte.pte & _PAGE_IOMAP)) - return pte.pte; + pteval_t pteval = pte.pte; - return pte_mfn_to_pfn(pte.pte); + /* If this is a WC pte, convert back from Xen WC to Linux WC */ + if ((pteval & (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT)) == _PAGE_PAT) { + WARN_ON(!pat_enabled); + pteval = (pteval & ~_PAGE_PAT) | _PAGE_PWT; + } + + if (xen_initial_domain() && (pteval & _PAGE_IOMAP)) + return pteval; + + return pte_mfn_to_pfn(pteval); } PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val); @@ -793,10 +802,48 @@ pgdval_t xen_pgd_val(pgd_t pgd) } PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val); +/* + * Xen's PAT setup is part of its ABI, though I assume entries 6 & 7 + * are reserved for now, to correspond to the Intel-reserved PAT + * types. + * + * We expect Linux's PAT set as follows: + * + * Idx PTE flags Linux Xen Default + * 0 WB WB WB + * 1 PWT WC WT WT + * 2 PCD UC- UC- UC- + * 3 PCD PWT UC UC UC + * 4 PAT WB WC WB + * 5 PAT PWT WC WP WT + * 6 PAT PCD UC- UC UC- + * 7 PAT PCD PWT UC UC UC + */ + +void xen_set_pat(u64 pat) +{ + /* We expect Linux to use a PAT setting of + * UC UC- WC WB (ignoring the PAT flag) */ + WARN_ON(pat != 0x0007010600070106ull); +} + pte_t xen_make_pte(pteval_t pte) { phys_addr_t addr = (pte & PTE_PFN_MASK); + /* If Linux is trying to set a WC pte, then map to the Xen WC. + * If _PAGE_PAT is set, then it probably means it is really + * _PAGE_PSE, so avoid fiddling with the PAT mapping and hope + * things work out OK... + * + * (We should never see kernel mappings with _PAGE_PSE set, + * but we could see hugetlbfs mappings, I think.). + */ + if (pat_enabled && !WARN_ON(pte & _PAGE_PAT)) { + if ((pte & (_PAGE_PCD | _PAGE_PWT)) == _PAGE_PWT) + pte = (pte & ~(_PAGE_PCD | _PAGE_PWT)) | _PAGE_PAT; + } + /* * Unprivileged domains are allowed to do IOMAPpings for * PCI passthrough, but not map ISA space. The ISA diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index d505e98e03f..64044747348 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -32,6 +32,8 @@ void xen_ident_map_ISA(void); void xen_reserve_top(void); extern unsigned long xen_max_p2m_pfn; +void xen_set_pat(u64); + char * __init xen_memory_setup(void); void __init xen_arch_setup(void); void __init xen_init_IRQ(void); -- cgit v1.2.3-18-g5258 From 3654581e47adc07072aebe239818485b68ea04f0 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 29 Sep 2010 16:54:33 -0700 Subject: xen: don't add extra_pages for RAM after mem_end If an E820 region is entirely beyond mem_end, don't attempt to truncate it and add the truncated pages to extra_pages, as they will be negative. Also, make sure the extra memory region starts after all BIOS provided E820 regions (and in the case of RAM regions, post-clipping). 
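Per entry, the clipping rules come down to roughly the following (an illustrative fragment reusing the local variables of xen_memory_setup(); the real loop is in the hunk below):

	if (map[i].type == E820_RAM &&
	    map[i].addr < mem_end && end > mem_end) {
		/* RAM straddling mem_end: truncate it and count the cut-off
		 * part as extra pages.  A region starting beyond mem_end is
		 * skipped here, so the delta can never go negative. */
		u64 delta = end - mem_end;

		map[i].size -= delta;
		extra_pages += PFN_DOWN(delta);
		end = mem_end;
	}

	/* Extra memory starts after everything, post-clipping */
	if (end > xen_extra_mem_start)
		xen_extra_mem_start = end;

	/* Add non-RAM regions as-is, and RAM only if it starts below mem_end */
	if ((map[i].type != E820_RAM || map[i].addr < mem_end) &&
	    map[i].size > 0)
		e820_add_region(map[i].addr, map[i].size, map[i].type);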
Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/setup.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index cad2fcd130e..7a4ab05cff8 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -52,20 +52,19 @@ phys_addr_t xen_extra_mem_start, xen_extra_mem_size; static __init void xen_add_extra_mem(unsigned long pages) { u64 size = (u64)pages * PAGE_SIZE; + u64 extra_start = xen_extra_mem_start + xen_extra_mem_size; if (!pages) return; - e820_add_region(xen_extra_mem_start + xen_extra_mem_size, size, E820_RAM); + e820_add_region(extra_start, size, E820_RAM); sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); - reserve_early(xen_extra_mem_start + xen_extra_mem_size, - xen_extra_mem_start + xen_extra_mem_size + size, - "XEN EXTRA"); + reserve_early(extra_start, extra_start + size, "XEN EXTRA"); xen_extra_mem_size += size; - xen_max_p2m_pfn = PFN_DOWN(xen_extra_mem_start + xen_extra_mem_size); + xen_max_p2m_pfn = PFN_DOWN(extra_start + size); } static unsigned long __init xen_release_chunk(phys_addr_t start_addr, @@ -175,15 +174,21 @@ char * __init xen_memory_setup(void) unsigned long long end = map[i].addr + map[i].size; if (map[i].type == E820_RAM) { - if (end > mem_end) { + if (map[i].addr < mem_end && end > mem_end) { /* Truncate region to max_mem. */ - map[i].size -= end - mem_end; + u64 delta = end - mem_end; - extra_pages += PFN_DOWN(end - mem_end); + map[i].size -= delta; + extra_pages += PFN_DOWN(delta); + + end = mem_end; } - } else if (map[i].type != E820_RAM) + } + + if (end > xen_extra_mem_start) xen_extra_mem_start = end; + /* If region is non-RAM or below mem_end, add what remains */ if ((map[i].type != E820_RAM || map[i].addr < mem_end) && map[i].size > 0) e820_add_region(map[i].addr, map[i].size, map[i].type); -- cgit v1.2.3-18-g5258 From 375b2a9ada6d105483aab22f1af1d727bc3c418d Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Thu, 21 Oct 2010 11:00:46 +0100 Subject: xen: correctly rebuild mfn list list after migration. Otherwise the second migration attempt fails because the mfn_list_list still refers to all the old mfns. We need to update the entires in both p2m_top_mfn and the mid_mfn pages which p2m_top_mfn refers to. In order to do this we need to keep track of the virtual addresses mapping the p2m_mid_mfn pages since we cannot rely on mfn_to_virt(p2m_top_mfn[idx]) since p2m_top_mfn[idx] will still contain the old MFN after a migration, which may now belong to another domain and hence have a different mapping in the m2p. Therefore add and maintain a third top level page, p2m_top_mfn_p[], which tracks the virtual addresses of the mfns contained in p2m_top_mfn[]. We also need to update the content of the p2m_mid_missing_mfn page on resume to refer to the page's new mfn. p2m_missing does not need updating since the migration process takes care of the leaf p2m pages for us. Signed-off-by: Ian Campbell Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/mmu.c | 50 +++++++++++++++++++++++++++++++++++++------------- 1 file changed, 37 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 67b41017f7b..e41683cf290 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -187,6 +187,8 @@ DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */ * / \ / \ / / * p2m p2m p2m p2m p2m p2m p2m ... * + * The p2m_mid_mfn pages are mapped by p2m_top_mfn_p. 
+ * * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the * maximum representable pseudo-physical address space is: * P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages @@ -211,6 +213,7 @@ static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE); static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE); static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE); +static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE); RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); @@ -247,6 +250,14 @@ static void p2m_top_mfn_init(unsigned long *top) top[i] = virt_to_mfn(p2m_mid_missing_mfn); } +static void p2m_top_mfn_p_init(unsigned long **top) +{ + unsigned i; + + for (i = 0; i < P2M_TOP_PER_PAGE; i++) + top[i] = p2m_mid_missing_mfn; +} + static void p2m_mid_init(unsigned long **mid) { unsigned i; @@ -283,33 +294,43 @@ static void p2m_init(unsigned long *p2m) */ void xen_build_mfn_list_list(void) { - unsigned pfn; + unsigned long pfn; /* Pre-initialize p2m_top_mfn to be completely missing */ if (p2m_top_mfn == NULL) { p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); p2m_mid_mfn_init(p2m_mid_missing_mfn); + p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_top_mfn_p_init(p2m_top_mfn_p); + p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); p2m_top_mfn_init(p2m_top_mfn); + } else { + /* Reinitialise, mfn's all change after migration */ + p2m_mid_mfn_init(p2m_mid_missing_mfn); } for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) { unsigned topidx = p2m_top_index(pfn); unsigned mididx = p2m_mid_index(pfn); unsigned long **mid; - unsigned long mid_mfn; unsigned long *mid_mfn_p; mid = p2m_top[topidx]; + mid_mfn_p = p2m_top_mfn_p[topidx]; /* Don't bother allocating any mfn mid levels if - they're just missing */ - if (mid[mididx] == p2m_missing) + * they're just missing, just update the stored mfn, + * since all could have changed over a migrate. 
+ */ + if (mid == p2m_mid_missing) { + BUG_ON(mididx); + BUG_ON(mid_mfn_p != p2m_mid_missing_mfn); + p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn); + pfn += (P2M_MID_PER_PAGE - 1) * P2M_PER_PAGE; continue; - - mid_mfn = p2m_top_mfn[topidx]; - mid_mfn_p = mfn_to_virt(mid_mfn); + } if (mid_mfn_p == p2m_mid_missing_mfn) { /* @@ -321,11 +342,10 @@ void xen_build_mfn_list_list(void) mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); p2m_mid_mfn_init(mid_mfn_p); - mid_mfn = virt_to_mfn(mid_mfn_p); - - p2m_top_mfn[topidx] = mid_mfn; + p2m_top_mfn_p[topidx] = mid_mfn_p; } + p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p); mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]); } } @@ -344,7 +364,7 @@ void __init xen_build_dynamic_phys_to_machine(void) { unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list; unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages); - unsigned pfn; + unsigned long pfn; xen_max_p2m_pfn = max_pfn; @@ -434,7 +454,9 @@ static bool alloc_p2m(unsigned long pfn) } top_mfn_p = &p2m_top_mfn[topidx]; - mid_mfn = mfn_to_virt(*top_mfn_p); + mid_mfn = p2m_top_mfn_p[topidx]; + + BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p); if (mid_mfn == p2m_mid_missing_mfn) { /* Separately check the mid mfn level */ @@ -446,11 +468,13 @@ static bool alloc_p2m(unsigned long pfn) return false; p2m_mid_mfn_init(mid_mfn); - + missing_mfn = virt_to_mfn(p2m_mid_missing_mfn); mid_mfn_mfn = virt_to_mfn(mid_mfn); if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn) free_p2m_page(mid_mfn); + else + p2m_top_mfn_p[topidx] = mid_mfn; } if (p2m_top[topidx][mididx] == p2m_missing) { -- cgit v1.2.3-18-g5258 From 9e9a5fcb04e3af077d1be32710298b852210d93f Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Thu, 2 Sep 2010 16:16:00 +0100 Subject: xen: use host E820 map for dom0 When running as initial domain, get the real physical memory map from xen using the XENMEM_machine_memory_map hypercall and use it to setup the e820 regions. Signed-off-by: Ian Campbell Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Stefano Stabellini Reviewed-by: Konrad Rzeszutek Wilk --- arch/x86/xen/setup.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 7a4ab05cff8..0ce9d58cb29 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -149,6 +149,7 @@ char * __init xen_memory_setup(void) unsigned long extra_pages = 0; unsigned long extra_limit; int i; + int op; max_pfn = min(MAX_DOMAIN_PAGES, max_pfn); mem_end = PFN_PHYS(max_pfn); @@ -156,7 +157,10 @@ char * __init xen_memory_setup(void) memmap.nr_entries = E820MAX; set_xen_guest_handle(memmap.buffer, map); - rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap); + op = xen_initial_domain() ? + XENMEM_machine_memory_map : + XENMEM_memory_map; + rc = HYPERVISOR_memory_op(op, &memmap); if (rc == -ENOSYS) { memmap.nr_entries = 1; map[0].addr = 0ULL; @@ -235,7 +239,8 @@ char * __init xen_memory_setup(void) else extra_pages = 0; - xen_add_extra_mem(extra_pages); + if (!xen_initial_domain()) + xen_add_extra_mem(extra_pages); return "Xen"; } -- cgit v1.2.3-18-g5258 From 42a1de56f35a9c87932f45439dc1b09c8da0cc95 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Thu, 24 Jun 2010 16:42:04 +0100 Subject: xen: implement xen_hvm_register_pirq xen_hvm_register_pirq allows the kernel to map a GSI into a Xen pirq and receive the interrupt as an event channel from that point on. 
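A hypothetical caller would use it like this once ACPI hands over a GSI (the GSI number here is made up for illustration):

	/* Hypothetical example: remap GSI 9 (a typical ACPI SCI) so it is
	 * delivered as a Xen event channel instead of going through the
	 * emulated IOAPIC.  Under the hood this issues PHYSDEVOP_map_pirq
	 * and then binds the resulting pirq with xen_map_pirq_gsi(). */
	int irq = xen_hvm_register_pirq(9, ACPI_LEVEL_SENSITIVE);

	if (irq < 0)
		printk(KERN_WARNING "xen: failed to remap GSI 9 as a pirq\n");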
Signed-off-by: Stefano Stabellini Reviewed-by: Konrad Rzeszutek Wilk --- arch/x86/pci/xen.c | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index 4e371065ce4..08e3cdccdfa 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -17,6 +17,44 @@ #include #include +#ifdef CONFIG_ACPI +static int xen_hvm_register_pirq(u32 gsi, int triggering) +{ + int rc, irq; + struct physdev_map_pirq map_irq; + int shareable = 0; + char *name; + + if (!xen_hvm_domain()) + return -1; + + map_irq.domid = DOMID_SELF; + map_irq.type = MAP_PIRQ_TYPE_GSI; + map_irq.index = gsi; + map_irq.pirq = -1; + + rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq); + if (rc) { + printk(KERN_WARNING "xen map irq failed %d\n", rc); + return -1; + } + + if (triggering == ACPI_EDGE_SENSITIVE) { + shareable = 0; + name = "ioapic-edge"; + } else { + shareable = 1; + name = "ioapic-level"; + } + + irq = xen_map_pirq_gsi(map_irq.pirq, gsi, shareable, name); + + printk(KERN_DEBUG "xen: --> irq=%d, pirq=%d\n", irq, map_irq.pirq); + + return irq; +} +#endif + #if defined(CONFIG_PCI_MSI) #include -- cgit v1.2.3-18-g5258 From 2f065aef17b8d50a51a72451d03c7d7304249fb5 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 24 Jun 2010 16:59:16 +0100 Subject: acpi: use indirect call to register gsi in different modes Rather than using a tree of conditionals, use function pointer for acpi_register_gsi. Signed-off-by: Jeremy Fitzhardinge Reviewed-by: Konrad Rzeszutek Wilk Acked-by: Rafael J. Wysocki --- arch/x86/kernel/acpi/boot.c | 59 ++++++++++++++++++++++++++++++++------------- 1 file changed, 42 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index c05872aa3ce..031f0c23891 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -513,35 +513,61 @@ int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi) return 0; } -/* - * success: return IRQ number (>=0) - * failure: return < 0 - */ -int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity) +static int acpi_register_gsi_pic(struct device *dev, u32 gsi, + int trigger, int polarity) { - unsigned int irq; - unsigned int plat_gsi = gsi; - #ifdef CONFIG_PCI /* * Make sure all (legacy) PCI IRQs are set as level-triggered. 
*/ - if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) { - if (trigger == ACPI_LEVEL_SENSITIVE) - eisa_set_level_irq(gsi); - } + if (trigger == ACPI_LEVEL_SENSITIVE) + eisa_set_level_irq(gsi); #endif + return gsi; +} + +static int acpi_register_gsi_ioapic(struct device *dev, u32 gsi, + int trigger, int polarity) +{ #ifdef CONFIG_X86_IO_APIC - if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC) { - plat_gsi = mp_register_gsi(dev, gsi, trigger, polarity); - } + gsi = mp_register_gsi(dev, gsi, trigger, polarity); #endif + + return gsi; +} + +static int (*__acpi_register_gsi)(struct device *dev, u32 gsi, int trigger, int polarity) = acpi_register_gsi_pic; + +/* + * success: return IRQ number (>=0) + * failure: return < 0 + */ +int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity) +{ + unsigned int irq; + unsigned int plat_gsi = gsi; + + plat_gsi = (*__acpi_register_gsi)(dev, gsi, trigger, polarity); irq = gsi_to_irq(plat_gsi); return irq; } +void __init acpi_set_irq_model_pic(void) +{ + acpi_irq_model = ACPI_IRQ_MODEL_PIC; + __acpi_register_gsi = acpi_register_gsi_pic; + acpi_ioapic = 0; +} + +void __init acpi_set_irq_model_ioapic(void) +{ + acpi_irq_model = ACPI_IRQ_MODEL_IOAPIC; + __acpi_register_gsi = acpi_register_gsi_ioapic; + acpi_ioapic = 1; +} + /* * ACPI based hotplug support for CPU */ @@ -1259,8 +1285,7 @@ static void __init acpi_process_madt(void) */ error = acpi_parse_madt_ioapic_entries(); if (!error) { - acpi_irq_model = ACPI_IRQ_MODEL_IOAPIC; - acpi_ioapic = 1; + acpi_set_irq_model_ioapic(); smp_found_config = 1; } -- cgit v1.2.3-18-g5258 From 90f6881e6430ea7b38b9e0f9837719b1935616e0 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 24 Jun 2010 17:05:41 +0100 Subject: xen: add xen hvm acpi_register_gsi variant Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Stefano Stabellini Reviewed-by: Konrad Rzeszutek Wilk Acked-by: Rafael J. 
Wysocki --- arch/x86/include/asm/acpi.h | 3 +++ arch/x86/kernel/acpi/boot.c | 3 ++- arch/x86/pci/xen.c | 6 ++++++ 3 files changed, 11 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 92091de1111..55d106b5e31 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -93,6 +93,9 @@ extern u8 acpi_sci_flags; extern int acpi_sci_override_gsi; void acpi_pic_sci_set_trigger(unsigned int, u16); +extern int (*__acpi_register_gsi)(struct device *dev, u32 gsi, + int trigger, int polarity); + static inline void disable_acpi(void) { acpi_disabled = 1; diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 031f0c23891..71232b941b6 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -537,7 +537,8 @@ static int acpi_register_gsi_ioapic(struct device *dev, u32 gsi, return gsi; } -static int (*__acpi_register_gsi)(struct device *dev, u32 gsi, int trigger, int polarity) = acpi_register_gsi_pic; +int (*__acpi_register_gsi)(struct device *dev, u32 gsi, + int trigger, int polarity) = acpi_register_gsi_pic; /* * success: return IRQ number (>=0) diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index 08e3cdccdfa..3a4ab0b4dcc 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -53,6 +53,12 @@ static int xen_hvm_register_pirq(u32 gsi, int triggering) return irq; } + +static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi, + int trigger, int polarity) +{ + return xen_hvm_register_pirq(gsi, trigger); +} #endif #if defined(CONFIG_PCI_MSI) -- cgit v1.2.3-18-g5258 From 3942b740e5183caad47a4a3fcb37a4509ce7af83 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Thu, 24 Jun 2010 17:50:18 +0100 Subject: xen: support GSI -> pirq remapping in PV on HVM guests Disable pcifront when running on HVM: it is meant to be used with pv guests that don't have PCI bus. Use acpi_register_gsi_xen_hvm to remap GSIs into pirqs. Signed-off-by: Stefano Stabellini Reviewed-by: Konrad Rzeszutek Wilk --- arch/x86/include/asm/xen/pci.h | 5 +++++ arch/x86/pci/xen.c | 16 ++++++++++++++++ 2 files changed, 21 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/xen/pci.h b/arch/x86/include/asm/xen/pci.h index 449c82f7167..f89a42aff28 100644 --- a/arch/x86/include/asm/xen/pci.h +++ b/arch/x86/include/asm/xen/pci.h @@ -3,10 +3,15 @@ #if defined(CONFIG_PCI_XEN) extern int __init pci_xen_init(void); +extern int __init pci_xen_hvm_init(void); #define pci_xen 1 #else #define pci_xen 0 #define pci_xen_init (0) +static inline int pci_xen_hvm_init(void) +{ + return -1; +} #endif #if defined(CONFIG_PCI_MSI) diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index 3a4ab0b4dcc..d5284c491ae 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -14,6 +14,7 @@ #include +#include #include #include @@ -184,3 +185,18 @@ int __init pci_xen_init(void) #endif return 0; } + +int __init pci_xen_hvm_init(void) +{ + if (!xen_feature(XENFEAT_hvm_pirqs)) + return 0; + +#ifdef CONFIG_ACPI + /* + * We don't want to change the actual ACPI delivery model, + * just how GSIs get registered. + */ + __acpi_register_gsi = acpi_register_gsi_xen_hvm; +#endif + return 0; +} -- cgit v1.2.3-18-g5258 From 809f9267bbaba7765cdb86a47f2e6e4bf4951b69 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Thu, 1 Jul 2010 17:10:39 +0100 Subject: xen: map MSIs into pirqs Map MSIs into pirqs, writing 0 in the MSI vector data field and the pirq number in the MSI destination id field. 
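Spelled out for a made-up pirq, the composed message ends up roughly like this (illustrative only; the real composition is xen_msi_compose_msg() in the diff below):

	/* Illustrative: for pirq 53 the guest writes an MSI message whose
	 * vector is 0 and whose destination id carries the pirq; Xen spots
	 * this encoding, recovers the pirq and delivers the interrupt as an
	 * event channel. */
	struct msi_msg msg = {
		.address_hi = MSI_ADDR_BASE_HI | MSI_ADDR_EXT_DEST_ID(53),
		.address_lo = MSI_ADDR_BASE_LO |
			      MSI_ADDR_DEST_MODE_PHYSICAL |
			      MSI_ADDR_REDIRECTION_CPU |
			      MSI_ADDR_DEST_ID(53),
		.data = MSI_DATA_TRIGGER_EDGE |
			MSI_DATA_LEVEL_ASSERT |
			(3 << 8) |		/* delivery mode: reserved */
			MSI_DATA_VECTOR(0),	/* vector deliberately 0 */
	};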
Signed-off-by: Stefano Stabellini Reviewed-by: Konrad Rzeszutek Wilk --- arch/x86/pci/xen.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index d5284c491ae..b5bd6420851 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -64,10 +64,62 @@ static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi, #if defined(CONFIG_PCI_MSI) #include +#include struct xen_pci_frontend_ops *xen_pci_frontend; EXPORT_SYMBOL_GPL(xen_pci_frontend); +static void xen_msi_compose_msg(struct pci_dev *pdev, unsigned int pirq, + struct msi_msg *msg) +{ + /* We set vector == 0 to tell the hypervisor we don't care about it, + * but we want a pirq setup instead. + * We use the dest_id field to pass the pirq that we want. */ + msg->address_hi = MSI_ADDR_BASE_HI | MSI_ADDR_EXT_DEST_ID(pirq); + msg->address_lo = + MSI_ADDR_BASE_LO | + MSI_ADDR_DEST_MODE_PHYSICAL | + MSI_ADDR_REDIRECTION_CPU | + MSI_ADDR_DEST_ID(pirq); + + msg->data = + MSI_DATA_TRIGGER_EDGE | + MSI_DATA_LEVEL_ASSERT | + /* delivery mode reserved */ + (3 << 8) | + MSI_DATA_VECTOR(0); +} + +static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) +{ + int irq, pirq, ret = 0; + struct msi_desc *msidesc; + struct msi_msg msg; + + list_for_each_entry(msidesc, &dev->msi_list, list) { + xen_allocate_pirq_msi((type == PCI_CAP_ID_MSIX) ? + "msi-x" : "msi", &irq, &pirq); + if (irq < 0 || pirq < 0) + goto error; + printk(KERN_DEBUG "xen: msi --> irq=%d, pirq=%d\n", irq, pirq); + xen_msi_compose_msg(dev, pirq, &msg); + ret = set_irq_msi(irq, msidesc); + if (ret < 0) + goto error_while; + write_msi_msg(irq, &msg); + } + return 0; + +error_while: + unbind_from_irqhandler(irq, NULL); +error: + if (ret == -ENODEV) + dev_err(&dev->dev, "Xen PCI frontend has not registered" \ + " MSI/MSI-X support!\n"); + + return ret; +} + /* * For MSI interrupts we have to use drivers/xen/event.s functions to * allocate an irq_desc and setup the right */ @@ -198,5 +250,10 @@ int __init pci_xen_hvm_init(void) */ __acpi_register_gsi = acpi_register_gsi_xen_hvm; #endif + +#ifdef CONFIG_PCI_MSI + x86_msi.setup_msi_irqs = xen_hvm_setup_msi_irqs; + x86_msi.teardown_msi_irq = xen_teardown_msi_irq; +#endif return 0; } -- cgit v1.2.3-18-g5258 From 6b0661a5e6fbfb159b78a39c0476905aa9b575fe Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Thu, 2 Sep 2010 15:47:32 +0100 Subject: xen: introduce XEN_DOM0 as a silent option Add XEN_DOM0 to arch/x86/xen/Kconfig as a silent compile time option that gets enabled when xen and basic x86, acpi and pci support are selected. Signed-off-by: Stefano Stabellini Reviewed-by: Konrad Rzeszutek Wilk --- arch/x86/xen/Kconfig | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 68128a1b401..a234b9a71ab 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -13,6 +13,16 @@ config XEN kernel to boot in a paravirtualized environment under the Xen hypervisor. +config XEN_DOM0 + def_bool y + depends on XEN && PCI_XEN && SWIOTLB_XEN + depends on X86_LOCAL_APIC && X86_IO_APIC && ACPI && PCI + +# Dummy symbol since people have come to rely on the PRIVILEGED_GUEST +# name in tools. 
+config XEN_PRIVILEGED_GUEST + def_bool XEN_DOM0 + config XEN_PVHVM def_bool y depends on XEN -- cgit v1.2.3-18-g5258 From 38aa66fcb79e0a46c24bba96b6f2b851a6ec2037 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 2 Sep 2010 14:51:39 +0100 Subject: xen: remap GSIs as pirqs when running as initial domain Implement xen_register_gsi to setup the correct triggering and polarity properties of a gsi. Implement xen_register_pirq to register a particular gsi as pirq and receive interrupts as events. Call xen_setup_pirqs to register all the legacy ISA irqs as pirqs. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Stefano Stabellini Reviewed-by: Konrad Rzeszutek Wilk --- arch/x86/include/asm/xen/pci.h | 7 +++ arch/x86/pci/xen.c | 135 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 142 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/xen/pci.h b/arch/x86/include/asm/xen/pci.h index f89a42aff28..2329b3eaf8d 100644 --- a/arch/x86/include/asm/xen/pci.h +++ b/arch/x86/include/asm/xen/pci.h @@ -13,6 +13,13 @@ static inline int pci_xen_hvm_init(void) return -1; } #endif +#if defined(CONFIG_XEN_DOM0) +void __init xen_setup_pirqs(void); +#else +static inline void __init xen_setup_pirqs(void) +{ +} +#endif #if defined(CONFIG_PCI_MSI) #if defined(CONFIG_PCI_XEN) diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index b5bd6420851..dd0b5fdb27b 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -257,3 +257,138 @@ int __init pci_xen_hvm_init(void) #endif return 0; } + +#ifdef CONFIG_XEN_DOM0 +static int xen_register_pirq(u32 gsi, int triggering) +{ + int rc, irq; + struct physdev_map_pirq map_irq; + int shareable = 0; + char *name; + + if (!xen_pv_domain()) + return -1; + + if (triggering == ACPI_EDGE_SENSITIVE) { + shareable = 0; + name = "ioapic-edge"; + } else { + shareable = 1; + name = "ioapic-level"; + } + + irq = xen_allocate_pirq(gsi, shareable, name); + + printk(KERN_DEBUG "xen: --> irq=%d\n", irq); + + if (irq < 0) + goto out; + + map_irq.domid = DOMID_SELF; + map_irq.type = MAP_PIRQ_TYPE_GSI; + map_irq.index = gsi; + map_irq.pirq = irq; + + rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq); + if (rc) { + printk(KERN_WARNING "xen map irq failed %d\n", rc); + return -1; + } + +out: + return irq; +} + +static int xen_register_gsi(u32 gsi, int triggering, int polarity) +{ + int rc, irq; + struct physdev_setup_gsi setup_gsi; + + if (!xen_pv_domain()) + return -1; + + printk(KERN_DEBUG "xen: registering gsi %u triggering %d polarity %d\n", + gsi, triggering, polarity); + + irq = xen_register_pirq(gsi, triggering); + + setup_gsi.gsi = gsi; + setup_gsi.triggering = (triggering == ACPI_EDGE_SENSITIVE ? 0 : 1); + setup_gsi.polarity = (polarity == ACPI_ACTIVE_HIGH ? 0 : 1); + + rc = HYPERVISOR_physdev_op(PHYSDEVOP_setup_gsi, &setup_gsi); + if (rc == -EEXIST) + printk(KERN_INFO "Already setup the GSI :%d\n", gsi); + else if (rc) { + printk(KERN_ERR "Failed to setup GSI :%d, err_code:%d\n", + gsi, rc); + } + + return irq; +} + +static __init void xen_setup_acpi_sci(void) +{ + int rc; + int trigger, polarity; + int gsi = acpi_sci_override_gsi; + + if (!gsi) + return; + + rc = acpi_get_override_irq(gsi, &trigger, &polarity); + if (rc) { + printk(KERN_WARNING "xen: acpi_get_override_irq failed for acpi" + " sci, rc=%d\n", rc); + return; + } + trigger = trigger ? ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE; + polarity = polarity ? 
ACPI_ACTIVE_LOW : ACPI_ACTIVE_HIGH; + + printk(KERN_INFO "xen: sci override: global_irq=%d trigger=%d " + "polarity=%d\n", gsi, trigger, polarity); + + gsi = xen_register_gsi(gsi, trigger, polarity); + printk(KERN_INFO "xen: acpi sci %d\n", gsi); + + return; +} + +static int acpi_register_gsi_xen(struct device *dev, u32 gsi, + int trigger, int polarity) +{ + return xen_register_gsi(gsi, trigger, polarity); +} + +static int __init pci_xen_initial_domain(void) +{ + xen_setup_acpi_sci(); + __acpi_register_gsi = acpi_register_gsi_xen; + + return 0; +} + +void __init xen_setup_pirqs(void) +{ + int irq; + + pci_xen_initial_domain(); + + if (0 == nr_ioapics) { + for (irq = 0; irq < NR_IRQS_LEGACY; irq++) + xen_allocate_pirq(irq, 0, "xt-pic"); + return; + } + + /* Pre-allocate legacy irqs */ + for (irq = 0; irq < NR_IRQS_LEGACY; irq++) { + int trigger, polarity; + + if (acpi_get_override_irq(irq, &trigger, &polarity) == -1) + continue; + + xen_register_pirq(irq, + trigger ? ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE); + } +} +#endif -- cgit v1.2.3-18-g5258 From f731e3ef02b4744f4d7ca2f63539b900e47db31f Mon Sep 17 00:00:00 2001 From: Qing He Date: Mon, 11 Oct 2010 15:30:09 +0100 Subject: xen: remap MSIs into pirqs when running as initial domain Implement xen_create_msi_irq to create an msi and remap it as pirq. Use xen_create_msi_irq to implement an initial domain specific version of setup_msi_irqs. Signed-off-by: Qing He Signed-off-by: Yunhong Jiang Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Stefano Stabellini Reviewed-by: Konrad Rzeszutek Wilk --- arch/x86/pci/xen.c | 55 ++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 37 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index dd0b5fdb27b..b3f4b30222f 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -135,14 +135,12 @@ static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) if (!v) return -ENOMEM; - if (!xen_initial_domain()) { - if (type == PCI_CAP_ID_MSIX) - ret = xen_pci_frontend_enable_msix(dev, &v, nvec); - else - ret = xen_pci_frontend_enable_msi(dev, &v); - if (ret) - goto error; - } + if (type == PCI_CAP_ID_MSIX) + ret = xen_pci_frontend_enable_msix(dev, &v, nvec); + else + ret = xen_pci_frontend_enable_msi(dev, &v); + if (ret) + goto error; i = 0; list_for_each_entry(msidesc, &dev->msi_list, list) { irq = xen_allocate_pirq(v[i], 0, /* not sharable */ @@ -172,23 +170,40 @@ error: static void xen_teardown_msi_irqs(struct pci_dev *dev) { - /* Only do this when were are in non-privileged mode.*/ - if (!xen_initial_domain()) { - struct msi_desc *msidesc; - - msidesc = list_entry(dev->msi_list.next, struct msi_desc, list); - if (msidesc->msi_attrib.is_msix) - xen_pci_frontend_disable_msix(dev); - else - xen_pci_frontend_disable_msi(dev); - } + struct msi_desc *msidesc; + msidesc = list_entry(dev->msi_list.next, struct msi_desc, list); + if (msidesc->msi_attrib.is_msix) + xen_pci_frontend_disable_msix(dev); + else + xen_pci_frontend_disable_msi(dev); } static void xen_teardown_msi_irq(unsigned int irq) { xen_destroy_irq(irq); } + +static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) +{ + int irq, ret; + struct msi_desc *msidesc; + + list_for_each_entry(msidesc, &dev->msi_list, list) { + irq = xen_create_msi_irq(dev, msidesc, type); + if (irq < 0) + return -1; + + ret = set_irq_msi(irq, msidesc); + if (ret) + goto error; + } + return 0; + +error: + xen_destroy_irq(irq); + return ret; +} #endif static int 
xen_pcifront_enable_irq(struct pci_dev *dev) @@ -362,6 +377,10 @@ static int acpi_register_gsi_xen(struct device *dev, u32 gsi, static int __init pci_xen_initial_domain(void) { +#ifdef CONFIG_PCI_MSI + x86_msi.setup_msi_irqs = xen_initdom_setup_msi_irqs; + x86_msi.teardown_msi_irq = xen_teardown_msi_irq; +#endif xen_setup_acpi_sci(); __acpi_register_gsi = acpi_register_gsi_xen; -- cgit v1.2.3-18-g5258 From 98511f3532eb7fce274f37d94f29790922799e15 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 3 Sep 2010 14:55:16 +0100 Subject: xen: map a dummy page for local apic and ioapic in xen_set_fixmap Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Stefano Stabellini Reviewed-by: Konrad Rzeszutek Wilk --- arch/x86/xen/mmu.c | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 42086ac406a..ffc5e24a53b 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1861,6 +1861,8 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, } #endif /* CONFIG_X86_64 */ +static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss; + static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) { pte_t pte; @@ -1880,9 +1882,6 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) # endif #else case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE: -#endif -#ifdef CONFIG_X86_LOCAL_APIC - case FIX_APIC_BASE: /* maps dummy local APIC */ #endif case FIX_TEXT_POKE0: case FIX_TEXT_POKE1: @@ -1890,6 +1889,22 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) pte = pfn_pte(phys, prot); break; +#ifdef CONFIG_X86_LOCAL_APIC + case FIX_APIC_BASE: /* maps dummy local APIC */ + pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL); + break; +#endif + +#ifdef CONFIG_X86_IO_APIC + case FIX_IO_APIC_BASE_0 ... FIX_IO_APIC_BASE_END: + /* + * We just don't map the IO APIC - all access is via + * hypercalls. Keep the address in the pte for reference. + */ + pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL); + break; +#endif + case FIX_PARAVIRT_BOOTMAP: /* This is an MFN, but it isn't an IO mapping from the IO domain */ @@ -2027,6 +2042,8 @@ void __init xen_init_mmu_ops(void) pv_mmu_ops = xen_mmu_ops; vmap_lazy_unmap = false; + + memset(dummy_mapping, 0xff, PAGE_SIZE); } /* Protected by xen_reservation_lock. 
*/ -- cgit v1.2.3-18-g5258 From 801fd14a725ef7757d33f07b83415cdd2165e50a Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Thu, 23 Sep 2010 12:06:25 +0100 Subject: xen: use vcpu_ops to setup cpu masks Signed-off-by: Stefano Stabellini Reviewed-by: Konrad Rzeszutek Wilk --- arch/x86/xen/smp.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 25f232b18a8..138676781dd 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -156,11 +156,16 @@ static void __init xen_fill_possible_map(void) { int i, rc; + num_processors = 0; + disabled_cpus = 0; for (i = 0; i < nr_cpu_ids; i++) { rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); if (rc >= 0) { num_processors++; set_cpu_possible(i, true); + } else { + set_cpu_possible(i, false); + set_cpu_present(i, false); } } } @@ -190,6 +195,8 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus) if (xen_smp_intr_init(0)) BUG(); + xen_fill_possible_map(); + if (!alloc_cpumask_var(&xen_cpu_initialized_map, GFP_KERNEL)) panic("could not allocate xen_cpu_initialized_map\n"); @@ -480,6 +487,5 @@ static const struct smp_ops xen_smp_ops __initdata = { void __init xen_smp_init(void) { smp_ops = xen_smp_ops; - xen_fill_possible_map(); xen_init_spinlocks(); } -- cgit v1.2.3-18-g5258 From 4ec5387cc36c6472a2ff2c82e9865abe8cab96c2 Mon Sep 17 00:00:00 2001 From: Juan Quintela Date: Thu, 2 Sep 2010 15:45:43 +0100 Subject: xen: add the direct mapping area for ISA bus access add the direct mapping area for ISA bus access when running as initial domain Signed-off-by: Juan Quintela Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Stefano Stabellini Reviewed-by: Konrad Rzeszutek Wilk --- arch/x86/xen/enlighten.c | 1 + arch/x86/xen/mmu.c | 24 ++++++++++++++++++++++++ arch/x86/xen/setup.c | 3 +++ 3 files changed, 28 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 1ccfa1bf0f8..9efb0044625 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1186,6 +1186,7 @@ asmlinkage void __init xen_start_kernel(void) xen_raw_console_write("mapping kernel into physical memory\n"); pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages); + xen_ident_map_ISA(); init_mm.pgd = pgd; diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index ffc5e24a53b..eed9c7cee4b 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1682,6 +1682,7 @@ static void *m2v(phys_addr_t maddr) return __ka(m2p(maddr)); } +/* Set the page permissions on an identity-mapped pages */ static void set_page_prot(void *addr, pgprot_t prot) { unsigned long pfn = __pa(addr) >> PAGE_SHIFT; @@ -1929,6 +1930,29 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) #endif } +__init void xen_ident_map_ISA(void) +{ + unsigned long pa; + + /* + * If we're dom0, then linear map the ISA machine addresses into + * the kernel's address space. 
+ */ + if (!xen_initial_domain()) + return; + + xen_raw_printk("Xen: setup ISA identity maps\n"); + + for (pa = ISA_START_ADDRESS; pa < ISA_END_ADDRESS; pa += PAGE_SIZE) { + pte_t pte = mfn_pte(PFN_DOWN(pa), PAGE_KERNEL_IO); + + if (HYPERVISOR_update_va_mapping(PAGE_OFFSET + pa, pte, 0)) + BUG(); + } + + xen_flush_tlb(); +} + static __init void xen_post_allocator_init(void) { pv_mmu_ops.set_pte = xen_set_pte; diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index c413132702f..62ceb786401 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -119,6 +119,9 @@ char * __init xen_memory_setup(void) * Even though this is normal, usable memory under Xen, reserve * ISA memory anyway because too many things think they can poke * about in there. + * + * In a dom0 kernel, this region is identity mapped with the + * hardware ISA area, so it really is out of bounds. */ e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS, E820_RESERVED); -- cgit v1.2.3-18-g5258 From ff12849a7a187e17fcbd888b39850d22103395c6 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Tue, 28 Sep 2010 16:45:51 +0100 Subject: xen: mask the MTRR feature from the cpuid We don't want Linux to think that the cpu supports MTRRs when running under Xen because MTRR operations could only be performed through hypercalls. Signed-off-by: Stefano Stabellini Reviewed-by: Konrad Rzeszutek Wilk --- arch/x86/xen/enlighten.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 9efb0044625..d48a32b10a3 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -243,6 +243,7 @@ static __init void xen_init_cpuid_mask(void) cpuid_leaf1_edx_mask = ~((1 << X86_FEATURE_MCE) | /* disable MCE */ (1 << X86_FEATURE_MCA) | /* disable MCA */ + (1 << X86_FEATURE_MTRR) | /* disable MTRR */ (1 << X86_FEATURE_ACC)); /* thermal monitoring */ if (!xen_initial_domain()) -- cgit v1.2.3-18-g5258 From 0e058e527784a9a23f7ed7a73ffafebb53a889da Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Thu, 21 Oct 2010 17:40:08 +0100 Subject: xen: add a missing #include to arch/x86/pci/xen.c Add missing #include to arch/x86/pci/xen.c. Signed-off-by: Stefano Stabellini --- arch/x86/pci/xen.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index b3f4b30222f..117f5b8daf7 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -10,6 +10,7 @@ #include #include +#include #include #include -- cgit v1.2.3-18-g5258 From fad99fac2627e2cc0ebfe07fcb5046c0b4e103f9 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Wed, 20 Oct 2010 08:20:00 -0500 Subject: x86,kgdb: fix debugger hw breakpoint test regression in 2.6.35 HW breakpoints events stopped working correctly with kgdb as a result of commit: 018cbffe6819f6f8db20a0a3acd9bab9bfd667e4 (Merge commit 'v2.6.33' into perf/core), later commit: ba773f7c510c0b252145933926c636c439889207 (x86,kgdb: Fix hw breakpoint regression) allowed breakpoints to propagate to the debugger core but did not completely address the original regression in functionality found in 2.6.35. 
When the DR_STEP flag is set in dr6 along with any of the DR_TRAP bits, the kgdb exception handler will enter once from the hw_breakpoint API call back and again from the die notifier for do_debug(), which causes the debugger to stop twice and also for the kgdb regression tests to fail running under kvm with: echo V2I1 > /sys/module/kgdbts/parameters/kgdbts To address the problem, the kgdb overflow handler needs to implement the same logic as the ptrace overflow handler call back with respect to updating the virtual copy of dr6. This will allow the kgdb do_debug() die notifier to properly handle the exception and the attached debugger, or kgdb test suite, will only receive a single notification. Signed-off-by: Jason Wessel CC: Frederic Weisbecker CC: x86@kernel.org --- arch/x86/kernel/kgdb.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 852b81967a3..497f9738641 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -621,7 +621,12 @@ int kgdb_arch_init(void) static void kgdb_hw_overflow_handler(struct perf_event *event, int nmi, struct perf_sample_data *data, struct pt_regs *regs) { - kgdb_ll_trap(DIE_DEBUG, "debug", regs, 0, 0, SIGTRAP); + struct task_struct *tsk = current; + int i; + + for (i = 0; i < 4; i++) + if (breakinfo[i].enabled) + tsk->thread.debugreg6 |= (DR_TRAP0 << i); } void kgdb_arch_late(void) -- cgit v1.2.3-18-g5258 From 91b152aa85bbcf076e269565394c31964f940371 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Mon, 23 Aug 2010 09:20:14 -0500 Subject: kdb,kgdb: fix sparse fixups Fix the following sparse warnings: kdb_main.c:328:5: warning: symbol 'kdbgetu64arg' was not declared. Should it be static? kgdboc.c:246:12: warning: symbol 'kgdboc_early_init' was not declared. Should it be static? kgdb.c:652:26: warning: incorrect type in argument 1 (different address spaces) kgdb.c:652:26: expected void const *ptr kgdb.c:652:26: got struct perf_event *[noderef] *pev The one in kgdb.c required the (void * __force) because of the return code from register_wide_hw_breakpoint looking like: return (void __percpu __force *)ERR_PTR(err); Signed-off-by: Jason Wessel --- arch/x86/kernel/kgdb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 497f9738641..101bf22cf16 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -649,7 +649,7 @@ void kgdb_arch_late(void) if (breakinfo[i].pev) continue; breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL); - if (IS_ERR(breakinfo[i].pev)) { + if (IS_ERR((void * __force)breakinfo[i].pev)) { printk(KERN_ERR "kgdb: Could not allocate hw" "breakpoints\nDisabling the kernel debugger\n"); breakinfo[i].pev = NULL; -- cgit v1.2.3-18-g5258 From 39a0715f5ace92268190c89e246fd1cf741dbaea Mon Sep 17 00:00:00 2001 From: Dongdong Deng Date: Mon, 13 Sep 2010 06:58:00 -0500 Subject: x86,kgdb: remove unnecessary call to kgdb_correct_hw_break() The kernel debug_core invokes hw breakpoint install and removal via call backs. The architecture specific kgdb stubs only need to implement the call backs and not actually call the functions. Signed-off-by: Dongdong Deng Signed-off-by: Jason Wessel CC: x86@kernel.org CC: Thomas Gleixner CC: Ingo Molnar CC: H. 
Peter Anvin --- arch/x86/kernel/kgdb.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 101bf22cf16..d81cfebb848 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -477,8 +477,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code, raw_smp_processor_id()); } - kgdb_correct_hw_break(); - return 0; } -- cgit v1.2.3-18-g5258 From e4072a9a9d186fe86293effe8828faa4be75b4a4 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Wed, 20 Oct 2010 16:48:51 +0200 Subject: x86, printk: Get rid of <0> from stack output The stack output currently looks like this: 7fffffffffffffff 0000000a00000000 ffffffff81093341 0000000000000046 <0> ffff88003a545fd8 0000000000000000 0000000000000000 00007fffa39769c0 <0> ffff88003e403f58 ffffffff8102fc4c ffff88003e403f58 ffff88003e403f78 The superfluous <0> are caused by recent printk KERN_CONT change. <*> is now ignored in printk unless some text follows the level and even then it still has to be the first in the format message. Note that the log_lvl parameter is now completely ignored in show_stack_log_lvl and the stack is dumped with the default level (like for quite some time already). It behaves the same as the rest of the dump, function traces are dumped in the very same manner. Only Code and maybe some lines are printed with EMERG level. Unfortunately I see no way how to fix this conceptually to have the whole oops/BUG/panic output with the same level, so this removed only the superfluous characters for the time being. Just for illustration: <4>Process kworker/0:0 (pid: 0, threadinfo ffff88003c8a6000, task ffff88003c85c100) <0>Stack: <4> ffffffff818022c0 0000000a00000001 0000000000000001 0000000000000046 <4> ffff88003c8a7fd8 0000000000000001 ffff88003c8a7e58 0000000000000000 <4> ffff88003e503f48 ffffffff8102fc4c ffff88003e503f48 ffff88003e503f68 <0>Call Trace: <0> <4> [] ? call_softirq+0x1c/0x30 ... <0>Code: 00 01 00 00 65 8b 04 25 80 c5 00 00 c7 45 ... 
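The fix below leans on KERN_CONT semantics: only the printk that starts a line carries a log level, and continuations are appended without emitting a new "<*>" marker. A minimal illustration of the idiom (not taken from the patch):

	printk(KERN_INFO "Stack:");                 /* starts the line, carries the level  */
	printk(KERN_CONT " %08lx", 0xdeadbeefUL);   /* appended to the same line, no "<*>" */
	printk(KERN_CONT "\n");                     /* terminates the line                 */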
Signed-off-by: Jiri Slaby Cc: jirislaby@gmail.com Cc: Linus Torvalds Cc: Andrew Morton LKML-Reference: <1287586131-16222-1-git-send-email-jslaby@suse.cz> Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack_32.c | 6 +++--- arch/x86/kernel/dumpstack_64.c | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 0f6376ffa2d..1bc7f75a5bd 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -82,11 +82,11 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, if (kstack_end(stack)) break; if (i && ((i % STACKSLOTS_PER_LINE) == 0)) - printk("\n%s", log_lvl); - printk(" %08lx", *stack++); + printk(KERN_CONT "\n"); + printk(KERN_CONT " %08lx", *stack++); touch_nmi_watchdog(); } - printk("\n"); + printk(KERN_CONT "\n"); show_trace_log_lvl(task, regs, sp, bp, log_lvl); } diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 57a21f11c79..6a340485249 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -265,20 +265,20 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, if (stack >= irq_stack && stack <= irq_stack_end) { if (stack == irq_stack_end) { stack = (unsigned long *) (irq_stack_end[-1]); - printk(" "); + printk(KERN_CONT " "); } } else { if (((long) stack & (THREAD_SIZE-1)) == 0) break; } if (i && ((i % STACKSLOTS_PER_LINE) == 0)) - printk("\n%s", log_lvl); - printk(" %016lx", *stack++); + printk(KERN_CONT "\n"); + printk(KERN_CONT " %016lx", *stack++); touch_nmi_watchdog(); } preempt_enable(); - printk("\n"); + printk(KERN_CONT "\n"); show_trace_log_lvl(task, regs, sp, bp, log_lvl); } -- cgit v1.2.3-18-g5258 From 91269b8f94eedce1767b2f208d656e5a5683326a Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 25 Jul 2010 14:51:16 +0300 Subject: KVM: x86 emulator: fix handling for unemulated instructions If an instruction is present in the decode tables but not in the execution switch, it will be emulated as a NOP. An example is IRET (0xcf). Fix by adding default: labels to the execution switches. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 66ca98aafdd..70e47d3593d 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3028,6 +3028,8 @@ special_insn: if (c->modrm_reg == 5) goto jump_far; goto grp45; + default: + goto cannot_emulate; } writeback: @@ -3353,6 +3355,8 @@ twobyte_insn: if (rc != X86EMUL_CONTINUE) goto done; break; + default: + goto cannot_emulate; } goto writeback; -- cgit v1.2.3-18-g5258 From 83babbca4617ab086621fe65a71a2168420f1d88 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 26 Jul 2010 14:37:39 +0300 Subject: KVM: x86 emulator: add macros for repetitive instructions Some instructions are repetitive in the opcode space, add macros for consolidating them. 
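The macros are simple token repeaters; assuming the X2..X16 definitions added in the hunk below, a row of sixteen identical table entries collapses to a single expression. A small illustration (example_row is purely hypothetical):

	/* X16(DstReg) expands to (DstReg), (DstReg), ... sixteen times, so */
	static u32 example_row[16] = { X16(DstReg) };
	/* is equivalent to spelling out DstReg sixteen times by hand.      */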
Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 70e47d3593d..c5c42e041e4 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -94,6 +94,15 @@ #define Src2One (3<<29) #define Src2Mask (7<<29) +#define X2(x) (x), (x) +#define X3(x) X2(x), (x) +#define X4(x) X2(x), X2(x) +#define X5(x) X4(x), (x) +#define X6(x) X4(x), X2(x) +#define X7(x) X4(x), X3(x) +#define X8(x) X4(x), X4(x) +#define X16(x) X8(x), X8(x) + enum { Group1_80, Group1_81, Group1_82, Group1_83, Group1A, Group3_Byte, Group3, Group4, Group5, Group7, -- cgit v1.2.3-18-g5258 From 749358a6b4691bfd2abfa9e4be2142af4697de3a Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 26 Jul 2010 14:37:40 +0300 Subject: KVM: x86 emulator: consolidate inc/dec reg decoding Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index c5c42e041e4..65d89601545 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -147,10 +147,8 @@ static u32 opcode_table[256] = { ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0, - /* 0x40 - 0x47 */ - DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, - /* 0x48 - 0x4F */ - DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, + /* 0x40 - 0x4F */ + X16(DstReg), /* 0x50 - 0x57 */ SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, -- cgit v1.2.3-18-g5258 From 3849186c381e2e6291828579c382662520b44696 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 26 Jul 2010 14:37:41 +0300 Subject: KVM: x86 emulator: consolidate push/pop reg decoding Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 65d89601545..68e5b73d22a 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -150,11 +150,9 @@ static u32 opcode_table[256] = { /* 0x40 - 0x4F */ X16(DstReg), /* 0x50 - 0x57 */ - SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, - SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, + X8(SrcReg | Stack), /* 0x58 - 0x5F */ - DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack, - DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack, + X8(DstReg | Stack), /* 0x60 - 0x67 */ ImplicitOps | Stack | No64, ImplicitOps | Stack | No64, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ , -- cgit v1.2.3-18-g5258 From b3ab3405fe3d40ae9c5350ee014c7c086fcf3d97 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 26 Jul 2010 14:37:42 +0300 Subject: KVM: x86 emulator: consolidate Jcc rel8 decoding Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 68e5b73d22a..78708211f18 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -161,12 +161,8 @@ static u32 opcode_table[256] = { SrcImm | Mov | Stack, 0, SrcImmByte | Mov | Stack, 0, DstDI | ByteOp | Mov | String, DstDI | Mov | String, /* insb, 
insw/insd */ SrcSI | ByteOp | ImplicitOps | String, SrcSI | ImplicitOps | String, /* outsb, outsw/outsd */ - /* 0x70 - 0x77 */ - SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, - SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, - /* 0x78 - 0x7F */ - SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, - SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, + /* 0x70 - 0x7F */ + X16(SrcImmByte), /* 0x80 - 0x87 */ Group | Group1_80, Group | Group1_81, Group | Group1_82, Group | Group1_83, -- cgit v1.2.3-18-g5258 From b6e6153885d6463896d9b465e59b361eac60efa0 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 26 Jul 2010 14:37:43 +0300 Subject: KVM: x86 emulator: consolidate MOV reg, imm decoding Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 78708211f18..a6ce7f1cf8f 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -188,15 +188,9 @@ static u32 opcode_table[256] = { ByteOp | SrcSI | DstAcc | Mov | String, SrcSI | DstAcc | Mov | String, ByteOp | DstDI | String, DstDI | String, /* 0xB0 - 0xB7 */ - ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, - ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, - ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, - ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, + X8(ByteOp | DstReg | SrcImm | Mov), /* 0xB8 - 0xBF */ - DstReg | SrcImm | Mov, DstReg | SrcImm | Mov, - DstReg | SrcImm | Mov, DstReg | SrcImm | Mov, - DstReg | SrcImm | Mov, DstReg | SrcImm | Mov, - DstReg | SrcImm | Mov, DstReg | SrcImm | Mov, + X8(DstReg | SrcImm | Mov), /* 0xC0 - 0xC7 */ ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, 0, ImplicitOps | Stack, 0, 0, -- cgit v1.2.3-18-g5258 From be8eacddbd8ee60506a6f940b3efb93cb61d7861 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 26 Jul 2010 14:37:44 +0300 Subject: KVM: x86 emulator: consolidate CMOVcc decoding Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index a6ce7f1cf8f..0526be18719 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -238,16 +238,8 @@ static u32 twobyte_table[256] = { ImplicitOps | Priv, 0, ImplicitOps | Priv, 0, ImplicitOps, ImplicitOps | Priv, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 0x40 - 0x47 */ - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - /* 0x48 - 0x4F */ - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + /* 0x40 - 0x4F */ + X16(DstReg | SrcMem | ModRM | Mov), /* 0x50 - 0x5F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x60 - 0x6F */ -- cgit v1.2.3-18-g5258 From 880a1883785d37287e13e4faf3fe92b294404de0 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 26 Jul 2010 14:37:45 +0300 Subject: KVM: x86 emulator: consolidate Jcc rel32 decoding Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 3 +-- 
1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 0526be18719..fd4073546ca 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -247,8 +247,7 @@ static u32 twobyte_table[256] = { /* 0x70 - 0x7F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x80 - 0x8F */ - SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, - SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, + X16(SrcImm), /* 0x90 - 0x9F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xA0 - 0xA7 */ -- cgit v1.2.3-18-g5258 From 2ce495365f6cdd5792c4db0ddb8ac8544950b671 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 26 Jul 2010 14:37:46 +0300 Subject: KVM: x86 emulator: Make group storage bits separate from operand bits Currently group bits are stored in bits 0:7, where operand bits are stored. Make group bits be 0:3, and move the existing bits 0:3 to 16:19, so we can mix group and operand bits. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index fd4073546ca..61139e20b89 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -46,15 +46,15 @@ */ /* Operand sizes: 8-bit operands or specified/overridden size. */ -#define ByteOp (1<<0) /* 8-bit operands. */ +#define ByteOp (1<<16) /* 8-bit operands. */ /* Destination operand type. */ -#define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */ -#define DstReg (2<<1) /* Register operand. */ -#define DstMem (3<<1) /* Memory operand. */ -#define DstAcc (4<<1) /* Destination Accumulator */ -#define DstDI (5<<1) /* Destination is in ES:(E)DI */ -#define DstMem64 (6<<1) /* 64bit memory operand */ -#define DstMask (7<<1) +#define ImplicitOps (1<<17) /* Implicit in opcode. No generic decode. */ +#define DstReg (2<<17) /* Register operand. */ +#define DstMem (3<<17) /* Memory operand. */ +#define DstAcc (4<<17) /* Destination Accumulator */ +#define DstDI (5<<17) /* Destination is in ES:(E)DI */ +#define DstMem64 (6<<17) /* 64bit memory operand */ +#define DstMask (7<<17) /* Source operand type. */ #define SrcNone (0<<4) /* No source operand. */ #define SrcImplicit (0<<4) /* Source operand is implicit in the opcode. */ @@ -82,7 +82,7 @@ #define Stack (1<<13) /* Stack instruction (push/pop) */ #define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ #define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ -#define GroupMask 0xff /* Group number stored in bits 0:7 */ +#define GroupMask 0x0f /* Group number stored in bits 0:3 */ /* Misc flags */ #define Lock (1<<26) /* lock prefix is allowed for the instruction */ #define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */ -- cgit v1.2.3-18-g5258 From 047a4818094217a1323d8f31f9318ea2e142f745 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 26 Jul 2010 14:37:47 +0300 Subject: KVM: x86 emulator: add Undefined decode flag Add a decode flag to indicate the instruction is invalid. Will come in useful later, when we mix decode bits from the opcode and group table. 
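Concretely, with the group number kept in bits 0:3 and the operand/size bits moved up to 16:19, a single table entry can now carry both kinds of information, which the following patches rely on. The kind of entry this enables (this exact combination shows up later in the series; shown as an assignment only for illustration):

	/* opcode 0xF6: a ModRM-extended group that also has byte-sized operands */
	opcode_table[0xf6] = ByteOp | Group | Group3;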
Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 61139e20b89..b1e3e8c2aff 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -84,6 +84,7 @@ #define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ #define GroupMask 0x0f /* Group number stored in bits 0:3 */ /* Misc flags */ +#define Undefined (1<<25) /* No Such Instruction */ #define Lock (1<<26) /* lock prefix is allowed for the instruction */ #define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */ #define No64 (1<<28) @@ -1065,7 +1066,7 @@ done_prefixes: } /* Unrecognised? */ - if (c->d == 0) { + if (c->d == 0 || (c->d & Undefined)) { DPRINTF("Cannot emulate %02x\n", c->b); return -1; } -- cgit v1.2.3-18-g5258 From 52811d7de565b2db988257591fbf2a6be31c1459 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 26 Jul 2010 14:37:48 +0300 Subject: KVM: x86 emulator: mix decode bits from opcode and group decode tables Allow bits that are common to all members of a group to be specified in the opcode table instead of the group table. This allows some simplification of the decode tables. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index b1e3e8c2aff..ef2b5af33a3 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -955,7 +955,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) struct decode_cache *c = &ctxt->decode; int rc = X86EMUL_CONTINUE; int mode = ctxt->mode; - int def_op_bytes, def_ad_bytes, group; + int def_op_bytes, def_ad_bytes, group, dual; /* we cannot decode insn before we complete previous rep insn */ @@ -1055,14 +1055,16 @@ done_prefixes: if (c->d & Group) { group = c->d & GroupMask; + dual = c->d & GroupDual; c->modrm = insn_fetch(u8, 1, c->eip); --c->eip; group = (group << 3) + ((c->modrm >> 3) & 7); - if ((c->d & GroupDual) && (c->modrm >> 6) == 3) - c->d = group2_table[group]; + c->d &= ~(Group | GroupDual | GroupMask); + if (dual && (c->modrm >> 6) == 3) + c->d |= group2_table[group]; else - c->d = group_table[group]; + c->d |= group_table[group]; } /* Unrecognised? */ -- cgit v1.2.3-18-g5258 From 4968ec4e26007770d8759fbface4d4712a27b5d4 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 26 Jul 2010 14:37:49 +0300 Subject: KVM: x86 emulator: simplify Group 1 decoding Move operand decoding to the opcode table, keep lock decoding in the group table. This allows us to get consolidate the four variants of Group 1 into one group. 
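The effect is that operand decoding comes from the opcode table while lock decoding comes from the group table, and the two are OR'd together at decode time. A sketch of the resulting flags for the 0x81 group, with values taken from the hunks below:

	/* opcode table, 0x81:          DstMem | SrcImm | ModRM | Group | Group1 */
	/* group table, Group1 /0../6:  Lock                                     */
	/* group table, Group1 /7:      0   (CMP never takes LOCK)               */
	/* effective flags for 81 /0:   DstMem | SrcImm | ModRM | Lock           */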
Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 47 +++++++---------------------------------------- 1 file changed, 7 insertions(+), 40 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index ef2b5af33a3..1ce9c6de0ae 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -105,8 +105,7 @@ #define X16(x) X8(x), X8(x) enum { - Group1_80, Group1_81, Group1_82, Group1_83, - Group1A, Group3_Byte, Group3, Group4, Group5, Group7, + Group1, Group1A, Group3_Byte, Group3, Group4, Group5, Group7, Group8, Group9, }; @@ -165,8 +164,10 @@ static u32 opcode_table[256] = { /* 0x70 - 0x7F */ X16(SrcImmByte), /* 0x80 - 0x87 */ - Group | Group1_80, Group | Group1_81, - Group | Group1_82, Group | Group1_83, + ByteOp | DstMem | SrcImm | ModRM | Group | Group1, + DstMem | SrcImm | ModRM | Group | Group1, + ByteOp | DstMem | SrcImm | ModRM | No64 | Group | Group1, + DstMem | SrcImmByte | ModRM | Group | Group1, ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, /* 0x88 - 0x8F */ @@ -285,42 +286,8 @@ static u32 twobyte_table[256] = { }; static u32 group_table[] = { - [Group1_80*8] = - ByteOp | DstMem | SrcImm | ModRM | Lock, - ByteOp | DstMem | SrcImm | ModRM | Lock, - ByteOp | DstMem | SrcImm | ModRM | Lock, - ByteOp | DstMem | SrcImm | ModRM | Lock, - ByteOp | DstMem | SrcImm | ModRM | Lock, - ByteOp | DstMem | SrcImm | ModRM | Lock, - ByteOp | DstMem | SrcImm | ModRM | Lock, - ByteOp | DstMem | SrcImm | ModRM, - [Group1_81*8] = - DstMem | SrcImm | ModRM | Lock, - DstMem | SrcImm | ModRM | Lock, - DstMem | SrcImm | ModRM | Lock, - DstMem | SrcImm | ModRM | Lock, - DstMem | SrcImm | ModRM | Lock, - DstMem | SrcImm | ModRM | Lock, - DstMem | SrcImm | ModRM | Lock, - DstMem | SrcImm | ModRM, - [Group1_82*8] = - ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, - ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, - ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, - ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, - ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, - ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, - ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, - ByteOp | DstMem | SrcImm | ModRM | No64, - [Group1_83*8] = - DstMem | SrcImmByte | ModRM | Lock, - DstMem | SrcImmByte | ModRM | Lock, - DstMem | SrcImmByte | ModRM | Lock, - DstMem | SrcImmByte | ModRM | Lock, - DstMem | SrcImmByte | ModRM | Lock, - DstMem | SrcImmByte | ModRM | Lock, - DstMem | SrcImmByte | ModRM | Lock, - DstMem | SrcImmByte | ModRM, + [Group1*8] = + X7(Lock), 0, [Group1A*8] = DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0, [Group3_Byte*8] = -- cgit v1.2.3-18-g5258 From dfe11481d8f1b6a7354c34cb252ff1a8af233cfe Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 26 Jul 2010 14:37:50 +0300 Subject: KVM: x86 emulator: Allow LOCK prefix for NEG and NOT Opcodes F6/2, F6/3, F7/2, F7/3. 
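In guest code these are the locked read-modify-write forms of NOT and NEG, for example (illustrative assembly, AT&T syntax):

	lock notb (%rdi)	/* F0 F6 /2 */
	lock negl (%rsi)	/* F0 F7 /3 */

Without the Lock flag on these group entries the emulator rejects the F0 prefix and takes the undefined-opcode path.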
Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 1ce9c6de0ae..bbe2d097c4a 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -292,11 +292,11 @@ static u32 group_table[] = { DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0, [Group3_Byte*8] = ByteOp | SrcImm | DstMem | ModRM, ByteOp | SrcImm | DstMem | ModRM, - ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, + ByteOp | DstMem | SrcNone | ModRM | Lock, ByteOp | DstMem | SrcNone | ModRM | Lock, 0, 0, 0, 0, [Group3*8] = DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, - DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, + DstMem | SrcNone | ModRM | Lock, DstMem | SrcNone | ModRM | Lock, 0, 0, 0, 0, [Group4*8] = ByteOp | DstMem | SrcNone | ModRM | Lock, ByteOp | DstMem | SrcNone | ModRM | Lock, -- cgit v1.2.3-18-g5258 From e071edd5ba8dd7a493eef229d495cf6232b09534 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 26 Jul 2010 14:37:51 +0300 Subject: KVM: x86 emulator: unify the two Group 3 variants Use just one group table for byte (F6) and word (F7) opcodes. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index bbe2d097c4a..7f615c57cba 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -105,8 +105,7 @@ #define X16(x) X8(x), X8(x) enum { - Group1, Group1A, Group3_Byte, Group3, Group4, Group5, Group7, - Group8, Group9, + Group1, Group1A, Group3, Group4, Group5, Group7, Group8, Group9, }; static u32 opcode_table[256] = { @@ -217,7 +216,7 @@ static u32 opcode_table[256] = { SrcNone | ByteOp | DstAcc, SrcNone | DstAcc, /* 0xF0 - 0xF7 */ 0, 0, 0, 0, - ImplicitOps | Priv, ImplicitOps, Group | Group3_Byte, Group | Group3, + ImplicitOps | Priv, ImplicitOps, ByteOp | Group | Group3, Group | Group3, /* 0xF8 - 0xFF */ ImplicitOps, 0, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, Group | Group4, Group | Group5, @@ -290,14 +289,10 @@ static u32 group_table[] = { X7(Lock), 0, [Group1A*8] = DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0, - [Group3_Byte*8] = - ByteOp | SrcImm | DstMem | ModRM, ByteOp | SrcImm | DstMem | ModRM, - ByteOp | DstMem | SrcNone | ModRM | Lock, ByteOp | DstMem | SrcNone | ModRM | Lock, - 0, 0, 0, 0, [Group3*8] = DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, DstMem | SrcNone | ModRM | Lock, DstMem | SrcNone | ModRM | Lock, - 0, 0, 0, 0, + X4(Undefined), [Group4*8] = ByteOp | DstMem | SrcNone | ModRM | Lock, ByteOp | DstMem | SrcNone | ModRM | Lock, 0, 0, 0, 0, 0, 0, -- cgit v1.2.3-18-g5258 From d359192feaf02861327339a9dda6b2b2d765c2bc Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 26 Jul 2010 18:32:39 +0300 Subject: KVM: VMX: Use host_gdt variable wherever we need the host gdt Now that we have the host gdt conveniently stored in a variable, make use of it instead of querying the cpu. 
Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 7bddfab1201..751a2d29f4c 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -706,11 +706,10 @@ static void reload_tss(void) /* * VT restores TR but not its size. Useless. */ - struct desc_ptr gdt; + struct desc_ptr *gdt = &__get_cpu_var(host_gdt); struct desc_struct *descs; - native_store_gdt(&gdt); - descs = (void *)gdt.address; + descs = (void *)gdt->address; descs[GDT_ENTRY_TSS].type = 9; /* available TSS */ load_TR_desc(); } @@ -753,7 +752,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) static unsigned long segment_base(u16 selector) { - struct desc_ptr gdt; + struct desc_ptr *gdt = &__get_cpu_var(host_gdt); struct desc_struct *d; unsigned long table_base; unsigned long v; @@ -761,8 +760,7 @@ static unsigned long segment_base(u16 selector) if (!(selector & ~3)) return 0; - native_store_gdt(&gdt); - table_base = gdt.address; + table_base = gdt->address; if (selector & 4) { /* from ldt */ u16 ldt_selector = kvm_read_ldt(); @@ -897,7 +895,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) } if (vcpu->cpu != cpu) { - struct desc_ptr dt; + struct desc_ptr *gdt = &__get_cpu_var(host_gdt); unsigned long sysenter_esp; kvm_migrate_timers(vcpu); @@ -913,8 +911,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) * processors. */ vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */ - native_store_gdt(&dt); - vmcs_writel(HOST_GDTR_BASE, dt.address); /* 22.2.4 */ + vmcs_writel(HOST_GDTR_BASE, gdt->address); /* 22.2.4 */ rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ -- cgit v1.2.3-18-g5258 From 19ada5c4b6170bbc7ac4f2f38dba0068fdc7755a Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 27 Jul 2010 11:21:18 +0800 Subject: KVM: MMU: remove valueless output message After commit 53383eaad08d, the '*spte' has updated before call rmap_remove()(in most case it's 'shadow_trap_nonpresent_pte'), so remove this information from error message Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 311f6dad895..82f7622c17d 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -645,18 +645,17 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); rmapp = gfn_to_rmap(kvm, gfn, sp->role.level); if (!*rmapp) { - printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte); + printk(KERN_ERR "rmap_remove: %p 0->BUG\n", spte); BUG(); } else if (!(*rmapp & 1)) { - rmap_printk("rmap_remove: %p %llx 1->0\n", spte, *spte); + rmap_printk("rmap_remove: %p 1->0\n", spte); if ((u64 *)*rmapp != spte) { - printk(KERN_ERR "rmap_remove: %p %llx 1->BUG\n", - spte, *spte); + printk(KERN_ERR "rmap_remove: %p 1->BUG\n", spte); BUG(); } *rmapp = 0; } else { - rmap_printk("rmap_remove: %p %llx many->many\n", spte, *spte); + rmap_printk("rmap_remove: %p many->many\n", spte); desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); prev_desc = NULL; while (desc) { @@ -670,7 +669,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) prev_desc = desc; desc = desc->more; } - pr_err("rmap_remove: %p %llx many->many\n", spte, *spte); + pr_err("rmap_remove: %p 
many->many\n", spte); BUG(); } } -- cgit v1.2.3-18-g5258 From 3f6a9d1693deaeef28d98109bc92c98dd94a8523 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 27 Jul 2010 18:14:20 +0200 Subject: KVM: SVM: Sync efer back into nested vmcb This patch fixes a bug in a nested hypervisor that heavily switches between real-mode and long-mode. The problem is fixed by syncing back efer into the guest vmcb on emulated vmexit. Signed-off-by: Joerg Roedel Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/svm.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 8a3f9f64f86..09704a0501d 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1896,6 +1896,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) nested_vmcb->save.ds = vmcb->save.ds; nested_vmcb->save.gdtr = vmcb->save.gdtr; nested_vmcb->save.idtr = vmcb->save.idtr; + nested_vmcb->save.efer = svm->vcpu.arch.efer; nested_vmcb->save.cr0 = kvm_read_cr0(&svm->vcpu); nested_vmcb->save.cr3 = svm->vcpu.arch.cr3; nested_vmcb->save.cr2 = vmcb->save.cr2; -- cgit v1.2.3-18-g5258 From 7a190667bb316653cbb782fff95cfdfcf51ded45 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 27 Jul 2010 18:14:21 +0200 Subject: KVM: SVM: Emulate next_rip svm feature This patch implements the emulations of the svm next_rip feature in the nested svm implementation in kvm. Signed-off-by: Joerg Roedel Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/svm.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 09704a0501d..116e0341bf4 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1918,6 +1918,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2; nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info; nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err; + nested_vmcb->control.next_rip = vmcb->control.next_rip; /* * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have @@ -3360,7 +3361,12 @@ static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) entry->ebx = 8; /* Lets support 8 ASIDs in case we add proper ASID emulation to nested SVM */ entry->ecx = 0; /* Reserved */ - entry->edx = 0; /* Do not support any additional features */ + entry->edx = 0; /* Per default do not support any + additional features */ + + /* Support next_rip if host supports it */ + if (svm_has(SVM_FEATURE_NRIP)) + entry->edx |= SVM_FEATURE_NRIP; break; } -- cgit v1.2.3-18-g5258 From 62bd430e6d41ac84ff2fb719f5783c3692718f47 Mon Sep 17 00:00:00 2001 From: Mohammed Gamal Date: Wed, 28 Jul 2010 12:38:40 +0300 Subject: KVM: x86 emulator: Add IRET instruction Ths patch adds IRET instruction (opcode 0xcf). Currently, only IRET in real mode is emulated. Protected mode support is to be added later if needed. 
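Real-mode IRET is essentially three pops followed by a flags fixup; conceptually (a simplified sketch of the semantics, where pop16() is a hypothetical helper standing in for emulate_pop()):

	ip    = pop16();	/* new instruction pointer                     */
	cs    = pop16();	/* new code segment                            */
	flags = pop16();	/* restored FLAGS: reserved zero bits cleared, */
				/* reserved bit 1 forced to 1                  */

The emulation below follows exactly this order and applies the reserved-bit masks afterwards.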
Signed-off-by: Mohammed Gamal Reviewed-by: Avi Kivity Reviewed-by: Paolo Bonzini Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 81 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 7f615c57cba..b0f45bc63e1 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -341,6 +341,9 @@ static u32 group2_table[] = { #define EFLG_PF (1<<2) #define EFLG_CF (1<<0) +#define EFLG_RESERVED_ZEROS_MASK 0xffc0802a +#define EFLG_RESERVED_ONE_MASK 2 + /* * Instruction emulation: * Most instructions are emulated directly via a fragment of inline assembly @@ -1729,6 +1732,78 @@ static int emulate_popa(struct x86_emulate_ctxt *ctxt, return rc; } +static int emulate_iret_real(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) +{ + struct decode_cache *c = &ctxt->decode; + int rc = X86EMUL_CONTINUE; + unsigned long temp_eip = 0; + unsigned long temp_eflags = 0; + unsigned long cs = 0; + unsigned long mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_TF | + EFLG_IF | EFLG_DF | EFLG_OF | EFLG_IOPL | EFLG_NT | EFLG_RF | + EFLG_AC | EFLG_ID | (1 << 1); /* Last one is the reserved bit */ + unsigned long vm86_mask = EFLG_VM | EFLG_VIF | EFLG_VIP; + + /* TODO: Add stack limit check */ + + rc = emulate_pop(ctxt, ops, &temp_eip, c->op_bytes); + + if (rc != X86EMUL_CONTINUE) + return rc; + + if (temp_eip & ~0xffff) { + emulate_gp(ctxt, 0); + return X86EMUL_PROPAGATE_FAULT; + } + + rc = emulate_pop(ctxt, ops, &cs, c->op_bytes); + + if (rc != X86EMUL_CONTINUE) + return rc; + + rc = emulate_pop(ctxt, ops, &temp_eflags, c->op_bytes); + + if (rc != X86EMUL_CONTINUE) + return rc; + + rc = load_segment_descriptor(ctxt, ops, (u16)cs, VCPU_SREG_CS); + + if (rc != X86EMUL_CONTINUE) + return rc; + + c->eip = temp_eip; + + + if (c->op_bytes == 4) + ctxt->eflags = ((temp_eflags & mask) | (ctxt->eflags & vm86_mask)); + else if (c->op_bytes == 2) { + ctxt->eflags &= ~0xffff; + ctxt->eflags |= temp_eflags; + } + + ctxt->eflags &= ~EFLG_RESERVED_ZEROS_MASK; /* Clear reserved zeros */ + ctxt->eflags |= EFLG_RESERVED_ONE_MASK; + + return rc; +} + +static inline int emulate_iret(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops* ops) +{ + switch(ctxt->mode) { + case X86EMUL_MODE_REAL: + return emulate_iret_real(ctxt, ops); + case X86EMUL_MODE_VM86: + case X86EMUL_MODE_PROT16: + case X86EMUL_MODE_PROT32: + case X86EMUL_MODE_PROT64: + default: + /* iret from protected mode unimplemented yet */ + return X86EMUL_UNHANDLEABLE; + } +} + static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { @@ -2857,6 +2932,12 @@ special_insn: break; case 0xcb: /* ret far */ rc = emulate_ret_far(ctxt, ops); + if (rc != X86EMUL_CONTINUE) + goto done; + break; + case 0xcf: /* iret */ + rc = emulate_iret(ctxt, ops); + if (rc != X86EMUL_CONTINUE) goto done; break; -- cgit v1.2.3-18-g5258 From ea9ef04e19c7c441b1ce9fe28ff6d9522c848baa Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 29 Jul 2010 15:11:34 +0300 Subject: KVM: x86 emulator: drop parentheses in repreat macros The parenthese make is impossible to use the macros with initializers that require braces. 
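The problem being fixed, in miniature: once table entries become braced initializers (the D() wrapper introduced two commits later in this series), the old parenthesised expansion is no longer valid C.

	/* old:  #define X2(x) (x), (x)                                          */
	/*       X2(D(DstReg)) -> ({ .flags = (DstReg) }), ({ .flags = ... })    */
	/*       -- a parenthesised braced initializer is not a valid list entry */
	/* new:  #define X2(x) x, x                                              */
	/*       X2(D(DstReg)) -> { .flags = (DstReg) }, { .flags = (DstReg) }   */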
Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index b0f45bc63e1..3bfba948097 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -95,10 +95,10 @@ #define Src2One (3<<29) #define Src2Mask (7<<29) -#define X2(x) (x), (x) -#define X3(x) X2(x), (x) +#define X2(x) x, x +#define X3(x) X2(x), x #define X4(x) X2(x), X2(x) -#define X5(x) X4(x), (x) +#define X5(x) X4(x), x #define X6(x) X4(x), X2(x) #define X7(x) X4(x), X3(x) #define X8(x) X4(x), X4(x) -- cgit v1.2.3-18-g5258 From d65b1dee408243daa45110ee494d204508d31657 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 29 Jul 2010 15:11:35 +0300 Subject: KVM: x86 emulator: introduce 'struct opcode' This will hold all the information known about the opcode. Currently, this is just the decode flags. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 3bfba948097..da7df34036c 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -108,7 +108,11 @@ enum { Group1, Group1A, Group3, Group4, Group5, Group7, Group8, Group9, }; -static u32 opcode_table[256] = { +struct opcode { + u32 flags; +}; + +static struct opcode opcode_table[256] = { /* 0x00 - 0x07 */ ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, @@ -222,7 +226,7 @@ static u32 opcode_table[256] = { ImplicitOps, ImplicitOps, Group | Group4, Group | Group5, }; -static u32 twobyte_table[256] = { +static struct opcode twobyte_table[256] = { /* 0x00 - 0x0F */ 0, Group | GroupDual | Group7, 0, 0, 0, ImplicitOps, ImplicitOps | Priv, 0, @@ -284,7 +288,7 @@ static u32 twobyte_table[256] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; -static u32 group_table[] = { +static struct opcode group_table[] = { [Group1*8] = X7(Lock), 0, [Group1A*8] = @@ -313,7 +317,7 @@ static u32 group_table[] = { 0, DstMem64 | ModRM | Lock, 0, 0, 0, 0, 0, 0, }; -static u32 group2_table[] = { +static struct opcode group2_table[] = { [Group7*8] = SrcNone | ModRM | Priv, 0, 0, SrcNone | ModRM | Priv, SrcNone | ModRM | DstMem | Mov, 0, @@ -1008,13 +1012,13 @@ done_prefixes: c->op_bytes = 8; /* REX.W */ /* Opcode byte(s). */ - c->d = opcode_table[c->b]; + c->d = opcode_table[c->b].flags; if (c->d == 0) { /* Two-byte opcode? */ if (c->b == 0x0f) { c->twobyte = 1; c->b = insn_fetch(u8, 1, c->eip); - c->d = twobyte_table[c->b]; + c->d = twobyte_table[c->b].flags; } } @@ -1027,9 +1031,9 @@ done_prefixes: group = (group << 3) + ((c->modrm >> 3) & 7); c->d &= ~(Group | GroupDual | GroupMask); if (dual && (c->modrm >> 6) == 3) - c->d |= group2_table[group]; + c->d |= group2_table[group].flags; else - c->d |= group_table[group]; + c->d |= group_table[group].flags; } /* Unrecognised? */ -- cgit v1.2.3-18-g5258 From fd853310a1ebaef257956208165873494bb805dc Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 29 Jul 2010 15:11:36 +0300 Subject: KVM: x86 emulator: Add wrappers for easily defining opcodes Once 'struct opcode' grows, its initializer will become more complicated. Wrap the simple initializers in a D() macro, and replace the empty initializers with an even simpler N macro. 
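For readability of the converted tables below: D() simply wraps its argument in the designated initializer for struct opcode, and N marks an empty (undefined) slot. Expansion, per the definitions in the hunk that follows:

	D(ByteOp | DstAcc | SrcImm)	/* -> { .flags = (ByteOp | DstAcc | SrcImm) } */
	N				/* -> D(0) -> { .flags = 0 }                  */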
Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 294 +++++++++++++++++++++++++------------------------ 1 file changed, 150 insertions(+), 144 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index da7df34036c..7059b161197 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -112,220 +112,226 @@ struct opcode { u32 flags; }; +#define D(_y) { .flags = (_y) } +#define N D(0) + static struct opcode opcode_table[256] = { /* 0x00 - 0x07 */ - ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, - ImplicitOps | Stack | No64, ImplicitOps | Stack | No64, + D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), + D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), + D(ByteOp | DstAcc | SrcImm), D(DstAcc | SrcImm), + D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), /* 0x08 - 0x0F */ - ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, - ImplicitOps | Stack | No64, 0, + D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), + D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), + D(ByteOp | DstAcc | SrcImm), D(DstAcc | SrcImm), + D(ImplicitOps | Stack | No64), N, /* 0x10 - 0x17 */ - ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, - ImplicitOps | Stack | No64, ImplicitOps | Stack | No64, + D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), + D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), + D(ByteOp | DstAcc | SrcImm), D(DstAcc | SrcImm), + D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), /* 0x18 - 0x1F */ - ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, - ImplicitOps | Stack | No64, ImplicitOps | Stack | No64, + D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), + D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), + D(ByteOp | DstAcc | SrcImm), D(DstAcc | SrcImm), + D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), /* 0x20 - 0x27 */ - ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0, + D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), + D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), + D(ByteOp | DstAcc | SrcImmByte), D(DstAcc | SrcImm), N, N, /* 0x28 - 0x2F */ - ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0, + D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), + D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), + D(ByteOp | DstAcc | SrcImmByte), D(DstAcc | SrcImm), N, N, /* 0x30 - 0x37 */ - ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - ByteOp | 
DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0, + D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), + D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), + D(ByteOp | DstAcc | SrcImmByte), D(DstAcc | SrcImm), N, N, /* 0x38 - 0x3F */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, - 0, 0, + D(ByteOp | DstMem | SrcReg | ModRM), D(DstMem | SrcReg | ModRM), + D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), + D(ByteOp | DstAcc | SrcImm), D(DstAcc | SrcImm), + N, N, /* 0x40 - 0x4F */ - X16(DstReg), + X16(D(DstReg)), /* 0x50 - 0x57 */ - X8(SrcReg | Stack), + X8(D(SrcReg | Stack)), /* 0x58 - 0x5F */ - X8(DstReg | Stack), + X8(D(DstReg | Stack)), /* 0x60 - 0x67 */ - ImplicitOps | Stack | No64, ImplicitOps | Stack | No64, - 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ , - 0, 0, 0, 0, + D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), + N, D(DstReg | SrcMem32 | ModRM | Mov) /* movsxd (x86/64) */ , + N, N, N, N, /* 0x68 - 0x6F */ - SrcImm | Mov | Stack, 0, SrcImmByte | Mov | Stack, 0, - DstDI | ByteOp | Mov | String, DstDI | Mov | String, /* insb, insw/insd */ - SrcSI | ByteOp | ImplicitOps | String, SrcSI | ImplicitOps | String, /* outsb, outsw/outsd */ + D(SrcImm | Mov | Stack), N, D(SrcImmByte | Mov | Stack), N, + D(DstDI | ByteOp | Mov | String), D(DstDI | Mov | String), /* insb, insw/insd */ + D(SrcSI | ByteOp | ImplicitOps | String), D(SrcSI | ImplicitOps | String), /* outsb, outsw/outsd */ /* 0x70 - 0x7F */ - X16(SrcImmByte), + X16(D(SrcImmByte)), /* 0x80 - 0x87 */ - ByteOp | DstMem | SrcImm | ModRM | Group | Group1, - DstMem | SrcImm | ModRM | Group | Group1, - ByteOp | DstMem | SrcImm | ModRM | No64 | Group | Group1, - DstMem | SrcImmByte | ModRM | Group | Group1, - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, - ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, + D(ByteOp | DstMem | SrcImm | ModRM | Group | Group1), + D(DstMem | SrcImm | ModRM | Group | Group1), + D(ByteOp | DstMem | SrcImm | ModRM | No64 | Group | Group1), + D(DstMem | SrcImmByte | ModRM | Group | Group1), + D(ByteOp | DstMem | SrcReg | ModRM), D(DstMem | SrcReg | ModRM), + D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), /* 0x88 - 0x8F */ - ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, - ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - DstMem | SrcNone | ModRM | Mov, ModRM | DstReg, - ImplicitOps | SrcMem16 | ModRM, Group | Group1A, + D(ByteOp | DstMem | SrcReg | ModRM | Mov), D(DstMem | SrcReg | ModRM | Mov), + D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem | ModRM | Mov), + D(DstMem | SrcNone | ModRM | Mov), D(ModRM | DstReg), + D(ImplicitOps | SrcMem16 | ModRM), D(Group | Group1A), /* 0x90 - 0x97 */ - DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, + D(DstReg), D(DstReg), D(DstReg), D(DstReg), D(DstReg), D(DstReg), D(DstReg), D(DstReg), /* 0x98 - 0x9F */ - 0, 0, SrcImmFAddr | No64, 0, - ImplicitOps | Stack, ImplicitOps | Stack, 0, 0, + N, N, D(SrcImmFAddr | No64), N, + D(ImplicitOps | Stack), D(ImplicitOps | Stack), N, N, /* 0xA0 - 0xA7 */ - ByteOp | DstAcc | SrcMem | Mov | MemAbs, DstAcc | SrcMem | Mov | MemAbs, - ByteOp | DstMem | SrcAcc | Mov | MemAbs, DstMem | SrcAcc | Mov | MemAbs, - ByteOp | SrcSI | DstDI | Mov | String, SrcSI | DstDI | Mov | String, - ByteOp | SrcSI | DstDI | 
String, SrcSI | DstDI | String, + D(ByteOp | DstAcc | SrcMem | Mov | MemAbs), D(DstAcc | SrcMem | Mov | MemAbs), + D(ByteOp | DstMem | SrcAcc | Mov | MemAbs), D(DstMem | SrcAcc | Mov | MemAbs), + D(ByteOp | SrcSI | DstDI | Mov | String), D(SrcSI | DstDI | Mov | String), + D(ByteOp | SrcSI | DstDI | String), D(SrcSI | DstDI | String), /* 0xA8 - 0xAF */ - DstAcc | SrcImmByte | ByteOp, DstAcc | SrcImm, ByteOp | DstDI | Mov | String, DstDI | Mov | String, - ByteOp | SrcSI | DstAcc | Mov | String, SrcSI | DstAcc | Mov | String, - ByteOp | DstDI | String, DstDI | String, + D(DstAcc | SrcImmByte | ByteOp), D(DstAcc | SrcImm), D(ByteOp | DstDI | Mov | String), D(DstDI | Mov | String), + D(ByteOp | SrcSI | DstAcc | Mov | String), D(SrcSI | DstAcc | Mov | String), + D(ByteOp | DstDI | String), D(DstDI | String), /* 0xB0 - 0xB7 */ - X8(ByteOp | DstReg | SrcImm | Mov), + X8(D(ByteOp | DstReg | SrcImm | Mov)), /* 0xB8 - 0xBF */ - X8(DstReg | SrcImm | Mov), + X8(D(DstReg | SrcImm | Mov)), /* 0xC0 - 0xC7 */ - ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, - 0, ImplicitOps | Stack, 0, 0, - ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov, + D(ByteOp | DstMem | SrcImm | ModRM), D(DstMem | SrcImmByte | ModRM), + N, D(ImplicitOps | Stack), N, N, + D(ByteOp | DstMem | SrcImm | ModRM | Mov), D(DstMem | SrcImm | ModRM | Mov), /* 0xC8 - 0xCF */ - 0, 0, 0, ImplicitOps | Stack, - ImplicitOps, SrcImmByte, ImplicitOps | No64, ImplicitOps, + N, N, N, D(ImplicitOps | Stack), + D(ImplicitOps), D(SrcImmByte), D(ImplicitOps | No64), D(ImplicitOps), /* 0xD0 - 0xD7 */ - ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, - ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, - 0, 0, 0, 0, + D(ByteOp | DstMem | SrcImplicit | ModRM), D(DstMem | SrcImplicit | ModRM), + D(ByteOp | DstMem | SrcImplicit | ModRM), D(DstMem | SrcImplicit | ModRM), + N, N, N, N, /* 0xD8 - 0xDF */ - 0, 0, 0, 0, 0, 0, 0, 0, + N, N, N, N, N, N, N, N, /* 0xE0 - 0xE7 */ - 0, 0, 0, 0, - ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc, - ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc, + N, N, N, N, + D(ByteOp | SrcImmUByte | DstAcc), D(SrcImmUByte | DstAcc), + D(ByteOp | SrcImmUByte | DstAcc), D(SrcImmUByte | DstAcc), /* 0xE8 - 0xEF */ - SrcImm | Stack, SrcImm | ImplicitOps, - SrcImmFAddr | No64, SrcImmByte | ImplicitOps, - SrcNone | ByteOp | DstAcc, SrcNone | DstAcc, - SrcNone | ByteOp | DstAcc, SrcNone | DstAcc, + D(SrcImm | Stack), D(SrcImm | ImplicitOps), + D(SrcImmFAddr | No64), D(SrcImmByte | ImplicitOps), + D(SrcNone | ByteOp | DstAcc), D(SrcNone | DstAcc), + D(SrcNone | ByteOp | DstAcc), D(SrcNone | DstAcc), /* 0xF0 - 0xF7 */ - 0, 0, 0, 0, - ImplicitOps | Priv, ImplicitOps, ByteOp | Group | Group3, Group | Group3, + N, N, N, N, + D(ImplicitOps | Priv), D(ImplicitOps), D(ByteOp | Group | Group3), D(Group | Group3), /* 0xF8 - 0xFF */ - ImplicitOps, 0, ImplicitOps, ImplicitOps, - ImplicitOps, ImplicitOps, Group | Group4, Group | Group5, + D(ImplicitOps), N, D(ImplicitOps), D(ImplicitOps), + D(ImplicitOps), D(ImplicitOps), D(Group | Group4), D(Group | Group5), }; static struct opcode twobyte_table[256] = { /* 0x00 - 0x0F */ - 0, Group | GroupDual | Group7, 0, 0, - 0, ImplicitOps, ImplicitOps | Priv, 0, - ImplicitOps | Priv, ImplicitOps | Priv, 0, 0, - 0, ImplicitOps | ModRM, 0, 0, + N, D(Group | GroupDual | Group7), N, N, + N, D(ImplicitOps), D(ImplicitOps | Priv), N, + D(ImplicitOps | Priv), D(ImplicitOps | Priv), N, N, + N, D(ImplicitOps | ModRM), N, N, /* 0x10 - 0x1F */ 
- 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0, + N, N, N, N, N, N, N, N, D(ImplicitOps | ModRM), N, N, N, N, N, N, N, /* 0x20 - 0x2F */ - ModRM | ImplicitOps | Priv, ModRM | Priv, - ModRM | ImplicitOps | Priv, ModRM | Priv, - 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, + D(ModRM | ImplicitOps | Priv), D(ModRM | Priv), + D(ModRM | ImplicitOps | Priv), D(ModRM | Priv), + N, N, N, N, + N, N, N, N, N, N, N, N, /* 0x30 - 0x3F */ - ImplicitOps | Priv, 0, ImplicitOps | Priv, 0, - ImplicitOps, ImplicitOps | Priv, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, + D(ImplicitOps | Priv), N, D(ImplicitOps | Priv), N, + D(ImplicitOps), D(ImplicitOps | Priv), N, N, + N, N, N, N, N, N, N, N, /* 0x40 - 0x4F */ - X16(DstReg | SrcMem | ModRM | Mov), + X16(D(DstReg | SrcMem | ModRM | Mov)), /* 0x50 - 0x5F */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, /* 0x60 - 0x6F */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, /* 0x70 - 0x7F */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, /* 0x80 - 0x8F */ - X16(SrcImm), + X16(D(SrcImm)), /* 0x90 - 0x9F */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, /* 0xA0 - 0xA7 */ - ImplicitOps | Stack, ImplicitOps | Stack, - 0, DstMem | SrcReg | ModRM | BitOp, - DstMem | SrcReg | Src2ImmByte | ModRM, - DstMem | SrcReg | Src2CL | ModRM, 0, 0, + D(ImplicitOps | Stack), D(ImplicitOps | Stack), + N, D(DstMem | SrcReg | ModRM | BitOp), + D(DstMem | SrcReg | Src2ImmByte | ModRM), + D(DstMem | SrcReg | Src2CL | ModRM), N, N, /* 0xA8 - 0xAF */ - ImplicitOps | Stack, ImplicitOps | Stack, - 0, DstMem | SrcReg | ModRM | BitOp | Lock, - DstMem | SrcReg | Src2ImmByte | ModRM, - DstMem | SrcReg | Src2CL | ModRM, - ModRM, 0, + D(ImplicitOps | Stack), D(ImplicitOps | Stack), + N, D(DstMem | SrcReg | ModRM | BitOp | Lock), + D(DstMem | SrcReg | Src2ImmByte | ModRM), + D(DstMem | SrcReg | Src2CL | ModRM), + D(ModRM), N, /* 0xB0 - 0xB7 */ - ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, - 0, DstMem | SrcReg | ModRM | BitOp | Lock, - 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem16 | ModRM | Mov, + D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), + N, D(DstMem | SrcReg | ModRM | BitOp | Lock), + N, N, D(ByteOp | DstReg | SrcMem | ModRM | Mov), + D(DstReg | SrcMem16 | ModRM | Mov), /* 0xB8 - 0xBF */ - 0, 0, - Group | Group8, DstMem | SrcReg | ModRM | BitOp | Lock, - 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem16 | ModRM | Mov, + N, N, + D(Group | Group8), D(DstMem | SrcReg | ModRM | BitOp | Lock), + N, N, D(ByteOp | DstReg | SrcMem | ModRM | Mov), + D(DstReg | SrcMem16 | ModRM | Mov), /* 0xC0 - 0xCF */ - 0, 0, 0, DstMem | SrcReg | ModRM | Mov, - 0, 0, 0, Group | GroupDual | Group9, - 0, 0, 0, 0, 0, 0, 0, 0, + N, N, N, D(DstMem | SrcReg | ModRM | Mov), + N, N, N, D(Group | GroupDual | Group9), + N, N, N, N, N, N, N, N, /* 0xD0 - 0xDF */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, /* 0xE0 - 0xEF */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, /* 0xF0 - 0xFF */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N }; static struct opcode group_table[] = { [Group1*8] = - X7(Lock), 0, + X7(D(Lock)), N, [Group1A*8] = - DstMem | SrcNone | ModRM | Mov | 
Stack, 0, 0, 0, 0, 0, 0, 0, + D(DstMem | SrcNone | ModRM | Mov | Stack), N, N, N, N, N, N, N, [Group3*8] = - DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, - DstMem | SrcNone | ModRM | Lock, DstMem | SrcNone | ModRM | Lock, - X4(Undefined), + D(DstMem | SrcImm | ModRM), D(DstMem | SrcImm | ModRM), + D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock), + X4(D(Undefined)), [Group4*8] = - ByteOp | DstMem | SrcNone | ModRM | Lock, ByteOp | DstMem | SrcNone | ModRM | Lock, - 0, 0, 0, 0, 0, 0, + D(ByteOp | DstMem | SrcNone | ModRM | Lock), D(ByteOp | DstMem | SrcNone | ModRM | Lock), + N, N, N, N, N, N, [Group5*8] = - DstMem | SrcNone | ModRM | Lock, DstMem | SrcNone | ModRM | Lock, - SrcMem | ModRM | Stack, 0, - SrcMem | ModRM | Stack, SrcMemFAddr | ModRM | ImplicitOps, - SrcMem | ModRM | Stack, 0, + D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock), + D(SrcMem | ModRM | Stack), N, + D(SrcMem | ModRM | Stack), D(SrcMemFAddr | ModRM | ImplicitOps), + D(SrcMem | ModRM | Stack), N, [Group7*8] = - 0, 0, ModRM | SrcMem | Priv, ModRM | SrcMem | Priv, - SrcNone | ModRM | DstMem | Mov, 0, - SrcMem16 | ModRM | Mov | Priv, SrcMem | ModRM | ByteOp | Priv, + N, N, D(ModRM | SrcMem | Priv), D(ModRM | SrcMem | Priv), + D(SrcNone | ModRM | DstMem | Mov), N, + D(SrcMem16 | ModRM | Mov | Priv), D(SrcMem | ModRM | ByteOp | Priv), [Group8*8] = - 0, 0, 0, 0, - DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM | Lock, - DstMem | SrcImmByte | ModRM | Lock, DstMem | SrcImmByte | ModRM | Lock, + N, N, N, N, + D(DstMem | SrcImmByte | ModRM), D(DstMem | SrcImmByte | ModRM | Lock), + D(DstMem | SrcImmByte | ModRM | Lock), D(DstMem | SrcImmByte | ModRM | Lock), [Group9*8] = - 0, DstMem64 | ModRM | Lock, 0, 0, 0, 0, 0, 0, + N, D(DstMem64 | ModRM | Lock), N, N, N, N, N, N, }; static struct opcode group2_table[] = { [Group7*8] = - SrcNone | ModRM | Priv, 0, 0, SrcNone | ModRM | Priv, - SrcNone | ModRM | DstMem | Mov, 0, - SrcMem16 | ModRM | Mov | Priv, 0, + D(SrcNone | ModRM | Priv), N, N, D(SrcNone | ModRM | Priv), + D(SrcNone | ModRM | DstMem | Mov), N, + D(SrcMem16 | ModRM | Mov | Priv), N, [Group9*8] = - 0, 0, 0, 0, 0, 0, 0, 0, + N, N, N, N, N, N, N, N, }; +#undef D +#undef N + /* EFLAGS bit definitions. */ #define EFLG_ID (1<<21) #define EFLG_VIP (1<<20) -- cgit v1.2.3-18-g5258 From 42a1c5209570ead6d89abecd99ab12947a41d20a Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 29 Jul 2010 15:11:37 +0300 Subject: KVM: x86 emulator: move group tables to top No code changes. 
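A quick refresher before the group tables are taken apart in the following patches: in the old scheme each group owns eight consecutive slots of group_table[], and the decoder selects one entry using group*8 plus the ModRM reg field (bits 3:5). A minimal stand-alone sketch of that indexing, with made-up flag bits and group numbers and only two groups for illustration:

#include <stdio.h>

struct opcode {
	unsigned int flags;
};

#define D(_y) { .flags = (_y) }
#define N     D(0)

/* illustrative flag bits and group numbers, not the real ones */
enum { Lock = 1 << 0, Stack = 1 << 1 };
enum { Group1 = 0, Group1A = 1 };

static struct opcode demo_group_table[] = {
	[Group1 * 8] =
	D(Lock), D(Lock), D(Lock), D(Lock), D(Lock), D(Lock), D(Lock), N,
	[Group1A * 8] =
	D(Stack), N, N, N, N, N, N, N,
};

#undef D
#undef N

/* same arithmetic as the decoder: (group << 3) + ModRM.reg */
static struct opcode group_lookup(int group, unsigned char modrm)
{
	return demo_group_table[(group << 3) + ((modrm >> 3) & 7)];
}

int main(void)
{
	struct opcode op = group_lookup(Group1A, 0x00);	/* reg field == 0 */
	printf("flags %#x\n", op.flags);
	return 0;
}
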
Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 76 +++++++++++++++++++++++++------------------------- 1 file changed, 38 insertions(+), 38 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 7059b161197..edf09386110 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -115,6 +115,44 @@ struct opcode { #define D(_y) { .flags = (_y) } #define N D(0) +static struct opcode group_table[] = { + [Group1*8] = + X7(D(Lock)), N, + [Group1A*8] = + D(DstMem | SrcNone | ModRM | Mov | Stack), N, N, N, N, N, N, N, + [Group3*8] = + D(DstMem | SrcImm | ModRM), D(DstMem | SrcImm | ModRM), + D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock), + X4(D(Undefined)), + [Group4*8] = + D(ByteOp | DstMem | SrcNone | ModRM | Lock), D(ByteOp | DstMem | SrcNone | ModRM | Lock), + N, N, N, N, N, N, + [Group5*8] = + D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock), + D(SrcMem | ModRM | Stack), N, + D(SrcMem | ModRM | Stack), D(SrcMemFAddr | ModRM | ImplicitOps), + D(SrcMem | ModRM | Stack), N, + [Group7*8] = + N, N, D(ModRM | SrcMem | Priv), D(ModRM | SrcMem | Priv), + D(SrcNone | ModRM | DstMem | Mov), N, + D(SrcMem16 | ModRM | Mov | Priv), D(SrcMem | ModRM | ByteOp | Priv), + [Group8*8] = + N, N, N, N, + D(DstMem | SrcImmByte | ModRM), D(DstMem | SrcImmByte | ModRM | Lock), + D(DstMem | SrcImmByte | ModRM | Lock), D(DstMem | SrcImmByte | ModRM | Lock), + [Group9*8] = + N, D(DstMem64 | ModRM | Lock), N, N, N, N, N, N, +}; + +static struct opcode group2_table[] = { + [Group7*8] = + D(SrcNone | ModRM | Priv), N, N, D(SrcNone | ModRM | Priv), + D(SrcNone | ModRM | DstMem | Mov), N, + D(SrcMem16 | ModRM | Mov | Priv), N, + [Group9*8] = + N, N, N, N, N, N, N, N, +}; + static struct opcode opcode_table[256] = { /* 0x00 - 0x07 */ D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), @@ -291,44 +329,6 @@ static struct opcode twobyte_table[256] = { N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N }; -static struct opcode group_table[] = { - [Group1*8] = - X7(D(Lock)), N, - [Group1A*8] = - D(DstMem | SrcNone | ModRM | Mov | Stack), N, N, N, N, N, N, N, - [Group3*8] = - D(DstMem | SrcImm | ModRM), D(DstMem | SrcImm | ModRM), - D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock), - X4(D(Undefined)), - [Group4*8] = - D(ByteOp | DstMem | SrcNone | ModRM | Lock), D(ByteOp | DstMem | SrcNone | ModRM | Lock), - N, N, N, N, N, N, - [Group5*8] = - D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock), - D(SrcMem | ModRM | Stack), N, - D(SrcMem | ModRM | Stack), D(SrcMemFAddr | ModRM | ImplicitOps), - D(SrcMem | ModRM | Stack), N, - [Group7*8] = - N, N, D(ModRM | SrcMem | Priv), D(ModRM | SrcMem | Priv), - D(SrcNone | ModRM | DstMem | Mov), N, - D(SrcMem16 | ModRM | Mov | Priv), D(SrcMem | ModRM | ByteOp | Priv), - [Group8*8] = - N, N, N, N, - D(DstMem | SrcImmByte | ModRM), D(DstMem | SrcImmByte | ModRM | Lock), - D(DstMem | SrcImmByte | ModRM | Lock), D(DstMem | SrcImmByte | ModRM | Lock), - [Group9*8] = - N, D(DstMem64 | ModRM | Lock), N, N, N, N, N, N, -}; - -static struct opcode group2_table[] = { - [Group7*8] = - D(SrcNone | ModRM | Priv), N, N, D(SrcNone | ModRM | Priv), - D(SrcNone | ModRM | DstMem | Mov), N, - D(SrcMem16 | ModRM | Mov | Priv), N, - [Group9*8] = - N, N, N, N, N, N, N, N, -}; - #undef D #undef N -- cgit v1.2.3-18-g5258 From 793d5a8d6baad9062b0a03e034944b31e50dfe5c Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: 
Thu, 29 Jul 2010 15:11:38 +0300 Subject: KVM: x86 emulator: reserve group code 0 We'll be using that to distinguish between new-style and old-style groups. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index edf09386110..5e496127a01 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -105,7 +105,7 @@ #define X16(x) X8(x), X8(x) enum { - Group1, Group1A, Group3, Group4, Group5, Group7, Group8, Group9, + NoGrp, Group1, Group1A, Group3, Group4, Group5, Group7, Group8, Group9, }; struct opcode { -- cgit v1.2.3-18-g5258 From 120df8902dbe91cc1b3b7886481e350fae7334fe Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 29 Jul 2010 15:11:39 +0300 Subject: KVM: x86 emulator: allow specifying group directly in opcode Instead of having a group number, store the group table pointer directly in the opcode. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 47 ++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 38 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 5e496127a01..f3b984427d1 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -110,10 +110,21 @@ enum { struct opcode { u32 flags; + union { + struct opcode *group; + struct group_dual *gdual; + } u; +}; + +struct group_dual { + struct opcode mod012[8]; + struct opcode mod3[8]; }; #define D(_y) { .flags = (_y) } #define N D(0) +#define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) } +#define GD(_f, _g) { .flags = ((_f) | Group | GroupDual), .u.gdual = (_g) } static struct opcode group_table[] = { [Group1*8] = @@ -331,6 +342,8 @@ static struct opcode twobyte_table[256] = { #undef D #undef N +#undef G +#undef GD /* EFLAGS bit definitions. */ #define EFLG_ID (1<<21) @@ -930,8 +943,8 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) struct decode_cache *c = &ctxt->decode; int rc = X86EMUL_CONTINUE; int mode = ctxt->mode; - int def_op_bytes, def_ad_bytes, group, dual; - + int def_op_bytes, def_ad_bytes, group, dual, goffset; + struct opcode opcode, *g_mod012, *g_mod3; /* we cannot decode insn before we complete previous rep insn */ WARN_ON(ctxt->restart); @@ -1018,15 +1031,16 @@ done_prefixes: c->op_bytes = 8; /* REX.W */ /* Opcode byte(s). */ - c->d = opcode_table[c->b].flags; - if (c->d == 0) { + opcode = opcode_table[c->b]; + if (opcode.flags == 0) { /* Two-byte opcode? 
*/ if (c->b == 0x0f) { c->twobyte = 1; c->b = insn_fetch(u8, 1, c->eip); - c->d = twobyte_table[c->b].flags; + opcode = twobyte_table[c->b]; } } + c->d = opcode.flags; if (c->d & Group) { group = c->d & GroupMask; @@ -1034,12 +1048,27 @@ done_prefixes: c->modrm = insn_fetch(u8, 1, c->eip); --c->eip; - group = (group << 3) + ((c->modrm >> 3) & 7); + if (group) { + g_mod012 = g_mod3 = &group_table[group * 8]; + if (c->d & GroupDual) + g_mod3 = &group2_table[group * 8]; + } else { + if (c->d & GroupDual) { + g_mod012 = opcode.u.gdual->mod012; + g_mod3 = opcode.u.gdual->mod3; + } else + g_mod012 = g_mod3 = opcode.u.group; + } + c->d &= ~(Group | GroupDual | GroupMask); - if (dual && (c->modrm >> 6) == 3) - c->d |= group2_table[group].flags; + + goffset = (c->modrm >> 3) & 7; + + if ((c->modrm >> 6) == 3) + opcode = g_mod3[goffset]; else - c->d |= group_table[group].flags; + opcode = g_mod012[goffset]; + c->d |= opcode.flags; } /* Unrecognised? */ -- cgit v1.2.3-18-g5258 From 5b92b5faff8ec66c75f3716ae7c4bf1e2b99d7e6 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 29 Jul 2010 15:11:40 +0300 Subject: KVM: x86 emulator: convert group 1 to new style Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index f3b984427d1..6cc4af1b59f 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -105,7 +105,7 @@ #define X16(x) X8(x), X8(x) enum { - NoGrp, Group1, Group1A, Group3, Group4, Group5, Group7, Group8, Group9, + NoGrp, Group1A, Group3, Group4, Group5, Group7, Group8, Group9, }; struct opcode { @@ -126,9 +126,11 @@ struct group_dual { #define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) } #define GD(_f, _g) { .flags = ((_f) | Group | GroupDual), .u.gdual = (_g) } +static struct opcode group1[] = { + X7(D(Lock)), N +}; + static struct opcode group_table[] = { - [Group1*8] = - X7(D(Lock)), N, [Group1A*8] = D(DstMem | SrcNone | ModRM | Mov | Stack), N, N, N, N, N, N, N, [Group3*8] = @@ -219,10 +221,10 @@ static struct opcode opcode_table[256] = { /* 0x70 - 0x7F */ X16(D(SrcImmByte)), /* 0x80 - 0x87 */ - D(ByteOp | DstMem | SrcImm | ModRM | Group | Group1), - D(DstMem | SrcImm | ModRM | Group | Group1), - D(ByteOp | DstMem | SrcImm | ModRM | No64 | Group | Group1), - D(DstMem | SrcImmByte | ModRM | Group | Group1), + G(ByteOp | DstMem | SrcImm | ModRM | Group, group1), + G(DstMem | SrcImm | ModRM | Group, group1), + G(ByteOp | DstMem | SrcImm | ModRM | No64 | Group, group1), + G(DstMem | SrcImmByte | ModRM | Group, group1), D(ByteOp | DstMem | SrcReg | ModRM), D(DstMem | SrcReg | ModRM), D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), /* 0x88 - 0x8F */ -- cgit v1.2.3-18-g5258 From 99880c5cd54b28a26fd6ed949f545cc0075e4393 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 29 Jul 2010 15:11:41 +0300 Subject: KVM: x86 emulator: convert group 1A to new style Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 6cc4af1b59f..618fdc8c8d0 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -105,7 +105,7 @@ #define X16(x) X8(x), X8(x) enum { - NoGrp, Group1A, Group3, Group4, Group5, Group7, Group8, Group9, + NoGrp, Group3, Group4, Group5, Group7, Group8, Group9, }; struct 
opcode { @@ -130,9 +130,11 @@ static struct opcode group1[] = { X7(D(Lock)), N }; -static struct opcode group_table[] = { - [Group1A*8] = +static struct opcode group1A[] = { D(DstMem | SrcNone | ModRM | Mov | Stack), N, N, N, N, N, N, N, +}; + +static struct opcode group_table[] = { [Group3*8] = D(DstMem | SrcImm | ModRM), D(DstMem | SrcImm | ModRM), D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock), @@ -231,7 +233,7 @@ static struct opcode opcode_table[256] = { D(ByteOp | DstMem | SrcReg | ModRM | Mov), D(DstMem | SrcReg | ModRM | Mov), D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem | ModRM | Mov), D(DstMem | SrcNone | ModRM | Mov), D(ModRM | DstReg), - D(ImplicitOps | SrcMem16 | ModRM), D(Group | Group1A), + D(ImplicitOps | SrcMem16 | ModRM), G(0, group1A), /* 0x90 - 0x97 */ D(DstReg), D(DstReg), D(DstReg), D(DstReg), D(DstReg), D(DstReg), D(DstReg), D(DstReg), /* 0x98 - 0x9F */ -- cgit v1.2.3-18-g5258 From ee70ea30ee81dda2cf5fbc2e143ce3cb303187ce Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 29 Jul 2010 15:11:42 +0300 Subject: KVM: x86 emulator: convert group 3 to new style Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 618fdc8c8d0..a0606a408ad 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -105,7 +105,7 @@ #define X16(x) X8(x), X8(x) enum { - NoGrp, Group3, Group4, Group5, Group7, Group8, Group9, + NoGrp, Group4, Group5, Group7, Group8, Group9, }; struct opcode { @@ -134,11 +134,13 @@ static struct opcode group1A[] = { D(DstMem | SrcNone | ModRM | Mov | Stack), N, N, N, N, N, N, N, }; -static struct opcode group_table[] = { - [Group3*8] = +static struct opcode group3[] = { D(DstMem | SrcImm | ModRM), D(DstMem | SrcImm | ModRM), D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock), X4(D(Undefined)), +}; + +static struct opcode group_table[] = { [Group4*8] = D(ByteOp | DstMem | SrcNone | ModRM | Lock), D(ByteOp | DstMem | SrcNone | ModRM | Lock), N, N, N, N, N, N, @@ -276,7 +278,7 @@ static struct opcode opcode_table[256] = { D(SrcNone | ByteOp | DstAcc), D(SrcNone | DstAcc), /* 0xF0 - 0xF7 */ N, N, N, N, - D(ImplicitOps | Priv), D(ImplicitOps), D(ByteOp | Group | Group3), D(Group | Group3), + D(ImplicitOps | Priv), D(ImplicitOps), G(ByteOp, group3), G(0, group3), /* 0xF8 - 0xFF */ D(ImplicitOps), N, D(ImplicitOps), D(ImplicitOps), D(ImplicitOps), D(ImplicitOps), D(Group | Group4), D(Group | Group5), -- cgit v1.2.3-18-g5258 From 591c9d20a37db54c7234742bff925cb2e6fdca4b Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 29 Jul 2010 15:11:43 +0300 Subject: KVM: x86 emulator: convert group 4 to new style Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index a0606a408ad..8bb74ea2b27 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -105,7 +105,7 @@ #define X16(x) X8(x), X8(x) enum { - NoGrp, Group4, Group5, Group7, Group8, Group9, + NoGrp, Group5, Group7, Group8, Group9, }; struct opcode { @@ -140,10 +140,12 @@ static struct opcode group3[] = { X4(D(Undefined)), }; -static struct opcode group_table[] = { - [Group4*8] = +static struct opcode group4[] = { D(ByteOp | DstMem | SrcNone | ModRM | Lock), D(ByteOp | DstMem | 
SrcNone | ModRM | Lock), N, N, N, N, N, N, +}; + +static struct opcode group_table[] = { [Group5*8] = D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock), D(SrcMem | ModRM | Stack), N, @@ -281,7 +283,7 @@ static struct opcode opcode_table[256] = { D(ImplicitOps | Priv), D(ImplicitOps), G(ByteOp, group3), G(0, group3), /* 0xF8 - 0xFF */ D(ImplicitOps), N, D(ImplicitOps), D(ImplicitOps), - D(ImplicitOps), D(ImplicitOps), D(Group | Group4), D(Group | Group5), + D(ImplicitOps), D(ImplicitOps), G(0, group4), D(Group | Group5), }; static struct opcode twobyte_table[256] = { -- cgit v1.2.3-18-g5258 From b67f9f0741e288c97f73cdc9e39e2c4943004332 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 29 Jul 2010 15:11:44 +0300 Subject: KVM: x86 emulator: convert group 5 to new style Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 8bb74ea2b27..9674d973b99 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -105,7 +105,7 @@ #define X16(x) X8(x), X8(x) enum { - NoGrp, Group5, Group7, Group8, Group9, + NoGrp, Group7, Group8, Group9, }; struct opcode { @@ -145,12 +145,14 @@ static struct opcode group4[] = { N, N, N, N, N, N, }; -static struct opcode group_table[] = { - [Group5*8] = +static struct opcode group5[] = { D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock), D(SrcMem | ModRM | Stack), N, D(SrcMem | ModRM | Stack), D(SrcMemFAddr | ModRM | ImplicitOps), D(SrcMem | ModRM | Stack), N, +}; + +static struct opcode group_table[] = { [Group7*8] = N, N, D(ModRM | SrcMem | Priv), D(ModRM | SrcMem | Priv), D(SrcNone | ModRM | DstMem | Mov), N, @@ -283,7 +285,7 @@ static struct opcode opcode_table[256] = { D(ImplicitOps | Priv), D(ImplicitOps), G(ByteOp, group3), G(0, group3), /* 0xF8 - 0xFF */ D(ImplicitOps), N, D(ImplicitOps), D(ImplicitOps), - D(ImplicitOps), D(ImplicitOps), G(0, group4), D(Group | Group5), + D(ImplicitOps), D(ImplicitOps), G(0, group4), G(0, group5), }; static struct opcode twobyte_table[256] = { -- cgit v1.2.3-18-g5258 From 2f3a9bc9ebd42e00929f370e1a56e40028a8d651 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 29 Jul 2010 15:11:45 +0300 Subject: KVM: x86 emulator: convert group 7 to new style Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 9674d973b99..5e7a02df18b 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -105,7 +105,7 @@ #define X16(x) X8(x), X8(x) enum { - NoGrp, Group7, Group8, Group9, + NoGrp, Group8, Group9, }; struct opcode { @@ -152,11 +152,17 @@ static struct opcode group5[] = { D(SrcMem | ModRM | Stack), N, }; -static struct opcode group_table[] = { - [Group7*8] = +static struct group_dual group7 = { { N, N, D(ModRM | SrcMem | Priv), D(ModRM | SrcMem | Priv), D(SrcNone | ModRM | DstMem | Mov), N, D(SrcMem16 | ModRM | Mov | Priv), D(SrcMem | ModRM | ByteOp | Priv), +}, { + D(SrcNone | ModRM | Priv), N, N, D(SrcNone | ModRM | Priv), + D(SrcNone | ModRM | DstMem | Mov), N, + D(SrcMem16 | ModRM | Mov | Priv), N, +} }; + +static struct opcode group_table[] = { [Group8*8] = N, N, N, N, D(DstMem | SrcImmByte | ModRM), D(DstMem | SrcImmByte | ModRM | Lock), @@ -166,10 +172,6 @@ static struct opcode group_table[] = { 
}; static struct opcode group2_table[] = { - [Group7*8] = - D(SrcNone | ModRM | Priv), N, N, D(SrcNone | ModRM | Priv), - D(SrcNone | ModRM | DstMem | Mov), N, - D(SrcMem16 | ModRM | Mov | Priv), N, [Group9*8] = N, N, N, N, N, N, N, N, }; @@ -290,7 +292,7 @@ static struct opcode opcode_table[256] = { static struct opcode twobyte_table[256] = { /* 0x00 - 0x0F */ - N, D(Group | GroupDual | Group7), N, N, + N, GD(0, &group7), N, N, N, D(ImplicitOps), D(ImplicitOps | Priv), N, D(ImplicitOps | Priv), D(ImplicitOps | Priv), N, N, N, D(ImplicitOps | ModRM), N, N, -- cgit v1.2.3-18-g5258 From 2cb20bc8af313b400e5c2c94886e0d87e2ec4e4d Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 29 Jul 2010 15:11:46 +0300 Subject: KVM: x86 emulator: convert group 8 to new style Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 5e7a02df18b..b5599b5cac9 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -105,7 +105,7 @@ #define X16(x) X8(x), X8(x) enum { - NoGrp, Group8, Group9, + NoGrp, Group9, }; struct opcode { @@ -162,11 +162,13 @@ static struct group_dual group7 = { { D(SrcMem16 | ModRM | Mov | Priv), N, } }; -static struct opcode group_table[] = { - [Group8*8] = +static struct opcode group8[] = { N, N, N, N, D(DstMem | SrcImmByte | ModRM), D(DstMem | SrcImmByte | ModRM | Lock), D(DstMem | SrcImmByte | ModRM | Lock), D(DstMem | SrcImmByte | ModRM | Lock), +}; + +static struct opcode group_table[] = { [Group9*8] = N, D(DstMem64 | ModRM | Lock), N, N, N, N, N, N, }; @@ -337,7 +339,7 @@ static struct opcode twobyte_table[256] = { D(DstReg | SrcMem16 | ModRM | Mov), /* 0xB8 - 0xBF */ N, N, - D(Group | Group8), D(DstMem | SrcReg | ModRM | BitOp | Lock), + G(0, group8), D(DstMem | SrcReg | ModRM | BitOp | Lock), N, N, D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), /* 0xC0 - 0xCF */ -- cgit v1.2.3-18-g5258 From 9f5d3220e3047536f702ed67309f6a581c0bed8b Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 29 Jul 2010 15:11:47 +0300 Subject: KVM: x86 emulator: convert group 9 to new style Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index b5599b5cac9..2fe731c8229 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -105,7 +105,7 @@ #define X16(x) X8(x), X8(x) enum { - NoGrp, Group9, + NoGrp, }; struct opcode { @@ -168,14 +168,16 @@ static struct opcode group8[] = { D(DstMem | SrcImmByte | ModRM | Lock), D(DstMem | SrcImmByte | ModRM | Lock), }; -static struct opcode group_table[] = { - [Group9*8] = +static struct group_dual group9 = { { N, D(DstMem64 | ModRM | Lock), N, N, N, N, N, N, +}, { + N, N, N, N, N, N, N, N, +} }; + +static struct opcode group_table[] = { }; static struct opcode group2_table[] = { - [Group9*8] = - N, N, N, N, N, N, N, N, }; static struct opcode opcode_table[256] = { @@ -344,7 +346,7 @@ static struct opcode twobyte_table[256] = { D(DstReg | SrcMem16 | ModRM | Mov), /* 0xC0 - 0xCF */ N, N, N, D(DstMem | SrcReg | ModRM | Mov), - N, N, N, D(Group | GroupDual | Group9), + N, N, N, GD(0, &group9), N, N, N, N, N, N, N, N, /* 0xD0 - 0xDF */ N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, -- cgit v1.2.3-18-g5258 From 3885d530b0eb26c82b6f085c181442b0aa6f8fed Mon 
Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 29 Jul 2010 15:11:48 +0300 Subject: KVM: x86 emulator: drop support for old-style groups Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 32 +++++++------------------------- 1 file changed, 7 insertions(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 2fe731c8229..20a7a167df1 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -82,7 +82,6 @@ #define Stack (1<<13) /* Stack instruction (push/pop) */ #define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ #define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ -#define GroupMask 0x0f /* Group number stored in bits 0:3 */ /* Misc flags */ #define Undefined (1<<25) /* No Such Instruction */ #define Lock (1<<26) /* lock prefix is allowed for the instruction */ @@ -104,10 +103,6 @@ #define X8(x) X4(x), X4(x) #define X16(x) X8(x), X8(x) -enum { - NoGrp, -}; - struct opcode { u32 flags; union { @@ -174,12 +169,6 @@ static struct group_dual group9 = { { N, N, N, N, N, N, N, N, } }; -static struct opcode group_table[] = { -}; - -static struct opcode group2_table[] = { -}; - static struct opcode opcode_table[256] = { /* 0x00 - 0x07 */ D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), @@ -959,7 +948,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) struct decode_cache *c = &ctxt->decode; int rc = X86EMUL_CONTINUE; int mode = ctxt->mode; - int def_op_bytes, def_ad_bytes, group, dual, goffset; + int def_op_bytes, def_ad_bytes, dual, goffset; struct opcode opcode, *g_mod012, *g_mod3; /* we cannot decode insn before we complete previous rep insn */ @@ -1059,24 +1048,17 @@ done_prefixes: c->d = opcode.flags; if (c->d & Group) { - group = c->d & GroupMask; dual = c->d & GroupDual; c->modrm = insn_fetch(u8, 1, c->eip); --c->eip; - if (group) { - g_mod012 = g_mod3 = &group_table[group * 8]; - if (c->d & GroupDual) - g_mod3 = &group2_table[group * 8]; - } else { - if (c->d & GroupDual) { - g_mod012 = opcode.u.gdual->mod012; - g_mod3 = opcode.u.gdual->mod3; - } else - g_mod012 = g_mod3 = opcode.u.group; - } + if (c->d & GroupDual) { + g_mod012 = opcode.u.gdual->mod012; + g_mod3 = opcode.u.gdual->mod3; + } else + g_mod012 = g_mod3 = opcode.u.group; - c->d &= ~(Group | GroupDual | GroupMask); + c->d &= ~(Group | GroupDual); goffset = (c->modrm >> 3) & 7; -- cgit v1.2.3-18-g5258 From ab85b12b1a7fd125588f9447653a71ec8e1b5024 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 29 Jul 2010 15:11:49 +0300 Subject: KVM: x86 emulator: move ByteOp and Dst back to bits 0:3 Now that the group index no longer exists, the space is free. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 20a7a167df1..d7e3ea4797f 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -46,15 +46,15 @@ */ /* Operand sizes: 8-bit operands or specified/overridden size. */ -#define ByteOp (1<<16) /* 8-bit operands. */ +#define ByteOp (1<<0) /* 8-bit operands. */ /* Destination operand type. */ -#define ImplicitOps (1<<17) /* Implicit in opcode. No generic decode. */ -#define DstReg (2<<17) /* Register operand. */ -#define DstMem (3<<17) /* Memory operand. 
*/ -#define DstAcc (4<<17) /* Destination Accumulator */ -#define DstDI (5<<17) /* Destination is in ES:(E)DI */ -#define DstMem64 (6<<17) /* 64bit memory operand */ -#define DstMask (7<<17) +#define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */ +#define DstReg (2<<1) /* Register operand. */ +#define DstMem (3<<1) /* Memory operand. */ +#define DstAcc (4<<1) /* Destination Accumulator */ +#define DstDI (5<<1) /* Destination is in ES:(E)DI */ +#define DstMem64 (6<<1) /* 64bit memory operand */ +#define DstMask (7<<1) /* Source operand type. */ #define SrcNone (0<<4) /* No source operand. */ #define SrcImplicit (0<<4) /* Source operand is implicit in the opcode. */ -- cgit v1.2.3-18-g5258 From 9aabc88fc8687ba3a520e2ec459821d05f72474e Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 29 Jul 2010 15:11:50 +0300 Subject: KVM: x86 emulator: store x86_emulate_ops in emulation context It doesn't ever change, so we don't need to pass it around everywhere. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_emulate.h | 9 ++++----- arch/x86/kvm/emulate.c | 8 +++++--- arch/x86/kvm/x86.c | 7 ++++--- 3 files changed, 13 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 1f99ecfc48e..9ddfa5ed228 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -208,6 +208,8 @@ struct decode_cache { }; struct x86_emulate_ctxt { + struct x86_emulate_ops *ops; + /* Register state before/after emulation. */ struct kvm_vcpu *vcpu; @@ -249,12 +251,9 @@ struct x86_emulate_ctxt { #define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64 #endif -int x86_decode_insn(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops); -int x86_emulate_insn(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops); +int x86_decode_insn(struct x86_emulate_ctxt *ctxt); +int x86_emulate_insn(struct x86_emulate_ctxt *ctxt); int emulator_task_switch(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, u16 tss_selector, int reason, bool has_error_code, u32 error_code); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index d7e3ea4797f..3689f34a303 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -943,8 +943,9 @@ done: } int -x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) +x86_decode_insn(struct x86_emulate_ctxt *ctxt) { + struct x86_emulate_ops *ops = ctxt->ops; struct decode_cache *c = &ctxt->decode; int rc = X86EMUL_CONTINUE; int mode = ctxt->mode; @@ -2586,10 +2587,10 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, } int emulator_task_switch(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, u16 tss_selector, int reason, bool has_error_code, u32 error_code) { + struct x86_emulate_ops *ops = ctxt->ops; struct decode_cache *c = &ctxt->decode; int rc; @@ -2619,8 +2620,9 @@ static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned long base, } int -x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) +x86_emulate_insn(struct x86_emulate_ctxt *ctxt) { + struct x86_emulate_ops *ops = ctxt->ops; u64 msr_data; struct decode_cache *c = &ctxt->decode; int rc = X86EMUL_CONTINUE; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3a09c625d52..33deb75f16e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3998,7 +3998,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu, vcpu->arch.emulate_ctxt.interruptibility = 0; 
vcpu->arch.emulate_ctxt.exception = -1; - r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); + r = x86_decode_insn(&vcpu->arch.emulate_ctxt); trace_kvm_emulate_insn_start(vcpu); /* Only allow emulation of specific instructions on #UD @@ -4048,7 +4048,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu, memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); restart: - r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); + r = x86_emulate_insn(&vcpu->arch.emulate_ctxt); if (r) { /* emulation failed */ if (reexecute_instruction(vcpu, cr2)) @@ -5067,7 +5067,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, memset(c, 0, sizeof(struct decode_cache)); memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); - ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, &emulate_ops, + ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, tss_selector, reason, has_error_code, error_code); @@ -5424,6 +5424,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) BUG_ON(vcpu->kvm == NULL); kvm = vcpu->kvm; + vcpu->arch.emulate_ctxt.ops = &emulate_ops; vcpu->arch.mmu.root_hpa = INVALID_PAGE; if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; -- cgit v1.2.3-18-g5258 From ef65c88912cafe56de2737c440aefc764fd8f202 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 29 Jul 2010 15:11:51 +0300 Subject: KVM: x86 emulator: allow storing emulator execution function in decode tables Instead of looking up the opcode twice (once for decode flags, once for the big execution switch) look up both flags and function in the decode tables. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_emulate.h | 1 + arch/x86/kvm/emulate.c | 12 ++++++++++++ 2 files changed, 13 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 9ddfa5ed228..0f901c16cf1 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -190,6 +190,7 @@ struct decode_cache { bool has_seg_override; u8 seg_override; unsigned int d; + int (*execute)(struct x86_emulate_ctxt *ctxt); unsigned long regs[NR_VCPU_REGS]; unsigned long eip; /* modrm */ diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 3689f34a303..799e895fb08 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -106,6 +106,7 @@ struct opcode { u32 flags; union { + int (*execute)(struct x86_emulate_ctxt *ctxt); struct opcode *group; struct group_dual *gdual; } u; @@ -120,6 +121,7 @@ struct group_dual { #define N D(0) #define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) } #define GD(_f, _g) { .flags = ((_f) | Group | GroupDual), .u.gdual = (_g) } +#define I(_f, _e) { .flags = (_f), .u.execute = (_e) } static struct opcode group1[] = { X7(D(Lock)), N @@ -349,6 +351,7 @@ static struct opcode twobyte_table[256] = { #undef N #undef G #undef GD +#undef I /* EFLAGS bit definitions. */ #define EFLG_ID (1<<21) @@ -1070,6 +1073,8 @@ done_prefixes: c->d |= opcode.flags; } + c->execute = opcode.u.execute; + /* Unrecognised? 
*/ if (c->d == 0 || (c->d & Undefined)) { DPRINTF("Cannot emulate %02x\n", c->b); @@ -2705,6 +2710,13 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt) special_insn: + if (c->execute) { + rc = c->execute(ctxt); + if (rc != X86EMUL_CONTINUE) + goto done; + goto writeback; + } + if (c->twobyte) goto twobyte_insn; -- cgit v1.2.3-18-g5258 From dde7e6d12a9ef9f727d05ce824f4fe75ca2a5b3a Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 29 Jul 2010 15:11:52 +0300 Subject: KVM: x86 emulator: move x86_decode_insn() downwards No code changes. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 1602 ++++++++++++++++++++++++------------------------ 1 file changed, 801 insertions(+), 801 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 799e895fb08..c6f43591753 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -945,917 +945,545 @@ done: return rc; } -int -x86_decode_insn(struct x86_emulate_ctxt *ctxt) +static int read_emulated(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + unsigned long addr, void *dest, unsigned size) { - struct x86_emulate_ops *ops = ctxt->ops; - struct decode_cache *c = &ctxt->decode; - int rc = X86EMUL_CONTINUE; - int mode = ctxt->mode; - int def_op_bytes, def_ad_bytes, dual, goffset; - struct opcode opcode, *g_mod012, *g_mod3; + int rc; + struct read_cache *mc = &ctxt->decode.mem_read; + u32 err; - /* we cannot decode insn before we complete previous rep insn */ - WARN_ON(ctxt->restart); + while (size) { + int n = min(size, 8u); + size -= n; + if (mc->pos < mc->end) + goto read_cached; - c->eip = ctxt->eip; - c->fetch.start = c->fetch.end = c->eip; - ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS); + rc = ops->read_emulated(addr, mc->data + mc->end, n, &err, + ctxt->vcpu); + if (rc == X86EMUL_PROPAGATE_FAULT) + emulate_pf(ctxt, addr, err); + if (rc != X86EMUL_CONTINUE) + return rc; + mc->end += n; - switch (mode) { - case X86EMUL_MODE_REAL: - case X86EMUL_MODE_VM86: - case X86EMUL_MODE_PROT16: - def_op_bytes = def_ad_bytes = 2; - break; - case X86EMUL_MODE_PROT32: - def_op_bytes = def_ad_bytes = 4; - break; -#ifdef CONFIG_X86_64 - case X86EMUL_MODE_PROT64: - def_op_bytes = 4; - def_ad_bytes = 8; - break; -#endif - default: - return -1; + read_cached: + memcpy(dest, mc->data + mc->pos, n); + mc->pos += n; + dest += n; + addr += n; } + return X86EMUL_CONTINUE; +} - c->op_bytes = def_op_bytes; - c->ad_bytes = def_ad_bytes; - - /* Legacy prefixes. */ - for (;;) { - switch (c->b = insn_fetch(u8, 1, c->eip)) { - case 0x66: /* operand-size override */ - /* switch between 2/4 bytes */ - c->op_bytes = def_op_bytes ^ 6; - break; - case 0x67: /* address-size override */ - if (mode == X86EMUL_MODE_PROT64) - /* switch between 4/8 bytes */ - c->ad_bytes = def_ad_bytes ^ 12; - else - /* switch between 2/4 bytes */ - c->ad_bytes = def_ad_bytes ^ 6; - break; - case 0x26: /* ES override */ - case 0x2e: /* CS override */ - case 0x36: /* SS override */ - case 0x3e: /* DS override */ - set_seg_override(c, (c->b >> 3) & 3); - break; - case 0x64: /* FS override */ - case 0x65: /* GS override */ - set_seg_override(c, c->b & 7); - break; - case 0x40 ... 
0x4f: /* REX */ - if (mode != X86EMUL_MODE_PROT64) - goto done_prefixes; - c->rex_prefix = c->b; - continue; - case 0xf0: /* LOCK */ - c->lock_prefix = 1; - break; - case 0xf2: /* REPNE/REPNZ */ - c->rep_prefix = REPNE_PREFIX; - break; - case 0xf3: /* REP/REPE/REPZ */ - c->rep_prefix = REPE_PREFIX; - break; - default: - goto done_prefixes; - } - - /* Any legacy prefix after a REX prefix nullifies its effect. */ +static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + unsigned int size, unsigned short port, + void *dest) +{ + struct read_cache *rc = &ctxt->decode.io_read; - c->rex_prefix = 0; + if (rc->pos == rc->end) { /* refill pio read ahead */ + struct decode_cache *c = &ctxt->decode; + unsigned int in_page, n; + unsigned int count = c->rep_prefix ? + address_mask(c, c->regs[VCPU_REGS_RCX]) : 1; + in_page = (ctxt->eflags & EFLG_DF) ? + offset_in_page(c->regs[VCPU_REGS_RDI]) : + PAGE_SIZE - offset_in_page(c->regs[VCPU_REGS_RDI]); + n = min(min(in_page, (unsigned int)sizeof(rc->data)) / size, + count); + if (n == 0) + n = 1; + rc->pos = rc->end = 0; + if (!ops->pio_in_emulated(size, port, rc->data, n, ctxt->vcpu)) + return 0; + rc->end = n * size; } -done_prefixes: + memcpy(dest, rc->data + rc->pos, size); + rc->pos += size; + return 1; +} - /* REX prefix. */ - if (c->rex_prefix) - if (c->rex_prefix & 8) - c->op_bytes = 8; /* REX.W */ +static u32 desc_limit_scaled(struct desc_struct *desc) +{ + u32 limit = get_desc_limit(desc); - /* Opcode byte(s). */ - opcode = opcode_table[c->b]; - if (opcode.flags == 0) { - /* Two-byte opcode? */ - if (c->b == 0x0f) { - c->twobyte = 1; - c->b = insn_fetch(u8, 1, c->eip); - opcode = twobyte_table[c->b]; - } - } - c->d = opcode.flags; + return desc->g ? (limit << 12) | 0xfff : limit; +} - if (c->d & Group) { - dual = c->d & GroupDual; - c->modrm = insn_fetch(u8, 1, c->eip); - --c->eip; +static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + u16 selector, struct desc_ptr *dt) +{ + if (selector & 1 << 2) { + struct desc_struct desc; + memset (dt, 0, sizeof *dt); + if (!ops->get_cached_descriptor(&desc, VCPU_SREG_LDTR, ctxt->vcpu)) + return; - if (c->d & GroupDual) { - g_mod012 = opcode.u.gdual->mod012; - g_mod3 = opcode.u.gdual->mod3; - } else - g_mod012 = g_mod3 = opcode.u.group; + dt->size = desc_limit_scaled(&desc); /* what if limit > 65535? */ + dt->address = get_desc_base(&desc); + } else + ops->get_gdt(dt, ctxt->vcpu); +} - c->d &= ~(Group | GroupDual); +/* allowed just for 8 bytes segments */ +static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + u16 selector, struct desc_struct *desc) +{ + struct desc_ptr dt; + u16 index = selector >> 3; + int ret; + u32 err; + ulong addr; - goffset = (c->modrm >> 3) & 7; + get_descriptor_table_ptr(ctxt, ops, selector, &dt); - if ((c->modrm >> 6) == 3) - opcode = g_mod3[goffset]; - else - opcode = g_mod012[goffset]; - c->d |= opcode.flags; + if (dt.size < index * 8 + 7) { + emulate_gp(ctxt, selector & 0xfffc); + return X86EMUL_PROPAGATE_FAULT; } + addr = dt.address + index * 8; + ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu, &err); + if (ret == X86EMUL_PROPAGATE_FAULT) + emulate_pf(ctxt, addr, err); - c->execute = opcode.u.execute; + return ret; +} - /* Unrecognised? 
*/ - if (c->d == 0 || (c->d & Undefined)) { - DPRINTF("Cannot emulate %02x\n", c->b); - return -1; - } +/* allowed just for 8 bytes segments */ +static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + u16 selector, struct desc_struct *desc) +{ + struct desc_ptr dt; + u16 index = selector >> 3; + u32 err; + ulong addr; + int ret; - if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack)) - c->op_bytes = 8; + get_descriptor_table_ptr(ctxt, ops, selector, &dt); - /* ModRM and SIB bytes. */ - if (c->d & ModRM) - rc = decode_modrm(ctxt, ops); - else if (c->d & MemAbs) - rc = decode_abs(ctxt, ops); - if (rc != X86EMUL_CONTINUE) - goto done; + if (dt.size < index * 8 + 7) { + emulate_gp(ctxt, selector & 0xfffc); + return X86EMUL_PROPAGATE_FAULT; + } - if (!c->has_seg_override) - set_seg_override(c, VCPU_SREG_DS); + addr = dt.address + index * 8; + ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, &err); + if (ret == X86EMUL_PROPAGATE_FAULT) + emulate_pf(ctxt, addr, err); - if (!(!c->twobyte && c->b == 0x8d)) - c->modrm_ea += seg_override_base(ctxt, ops, c); + return ret; +} - if (c->ad_bytes != 8) - c->modrm_ea = (u32)c->modrm_ea; +static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + u16 selector, int seg) +{ + struct desc_struct seg_desc; + u8 dpl, rpl, cpl; + unsigned err_vec = GP_VECTOR; + u32 err_code = 0; + bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */ + int ret; - if (c->rip_relative) - c->modrm_ea += c->eip; + memset(&seg_desc, 0, sizeof seg_desc); - /* - * Decode and fetch the source operand: register, memory - * or immediate. - */ - switch (c->d & SrcMask) { - case SrcNone: + if ((seg <= VCPU_SREG_GS && ctxt->mode == X86EMUL_MODE_VM86) + || ctxt->mode == X86EMUL_MODE_REAL) { + /* set real mode segment descriptor */ + set_desc_base(&seg_desc, selector << 4); + set_desc_limit(&seg_desc, 0xffff); + seg_desc.type = 3; + seg_desc.p = 1; + seg_desc.s = 1; + goto load; + } + + /* NULL selector is not valid for TR, CS and SS */ + if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || seg == VCPU_SREG_TR) + && null_selector) + goto exception; + + /* TR should be in GDT only */ + if (seg == VCPU_SREG_TR && (selector & (1 << 2))) + goto exception; + + if (null_selector) /* for NULL selector skip all following checks */ + goto load; + + ret = read_segment_descriptor(ctxt, ops, selector, &seg_desc); + if (ret != X86EMUL_CONTINUE) + return ret; + + err_code = selector & 0xfffc; + err_vec = GP_VECTOR; + + /* can't load system descriptor into segment selecor */ + if (seg <= VCPU_SREG_GS && !seg_desc.s) + goto exception; + + if (!seg_desc.p) { + err_vec = (seg == VCPU_SREG_SS) ? 
SS_VECTOR : NP_VECTOR; + goto exception; + } + + rpl = selector & 3; + dpl = seg_desc.dpl; + cpl = ops->cpl(ctxt->vcpu); + + switch (seg) { + case VCPU_SREG_SS: + /* + * segment is not a writable data segment or segment + * selector's RPL != CPL or segment selector's RPL != CPL + */ + if (rpl != cpl || (seg_desc.type & 0xa) != 0x2 || dpl != cpl) + goto exception; break; - case SrcReg: - decode_register_operand(&c->src, c, 0); + case VCPU_SREG_CS: + if (!(seg_desc.type & 8)) + goto exception; + + if (seg_desc.type & 4) { + /* conforming */ + if (dpl > cpl) + goto exception; + } else { + /* nonconforming */ + if (rpl > cpl || dpl != cpl) + goto exception; + } + /* CS(RPL) <- CPL */ + selector = (selector & 0xfffc) | cpl; break; - case SrcMem16: - c->src.bytes = 2; - goto srcmem_common; - case SrcMem32: - c->src.bytes = 4; - goto srcmem_common; - case SrcMem: - c->src.bytes = (c->d & ByteOp) ? 1 : - c->op_bytes; - /* Don't fetch the address for invlpg: it could be unmapped. */ - if (c->twobyte && c->b == 0x01 && c->modrm_reg == 7) - break; - srcmem_common: + case VCPU_SREG_TR: + if (seg_desc.s || (seg_desc.type != 1 && seg_desc.type != 9)) + goto exception; + break; + case VCPU_SREG_LDTR: + if (seg_desc.s || seg_desc.type != 2) + goto exception; + break; + default: /* DS, ES, FS, or GS */ /* - * For instructions with a ModR/M byte, switch to register - * access if Mod = 3. + * segment is not a data or readable code segment or + * ((segment is a data or nonconforming code segment) + * and (both RPL and CPL > DPL)) */ - if ((c->d & ModRM) && c->modrm_mod == 3) { - c->src.type = OP_REG; - c->src.val = c->modrm_val; - c->src.ptr = c->modrm_ptr; - break; - } - c->src.type = OP_MEM; - c->src.ptr = (unsigned long *)c->modrm_ea; - c->src.val = 0; + if ((seg_desc.type & 0xa) == 0x8 || + (((seg_desc.type & 0xc) != 0xc) && + (rpl > dpl && cpl > dpl))) + goto exception; break; - case SrcImm: - case SrcImmU: - c->src.type = OP_IMM; - c->src.ptr = (unsigned long *)c->eip; - c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - if (c->src.bytes == 8) - c->src.bytes = 4; - /* NB. Immediates are sign-extended as necessary. */ - switch (c->src.bytes) { + } + + if (seg_desc.s) { + /* mark segment as accessed */ + seg_desc.type |= 1; + ret = write_segment_descriptor(ctxt, ops, selector, &seg_desc); + if (ret != X86EMUL_CONTINUE) + return ret; + } +load: + ops->set_segment_selector(selector, seg, ctxt->vcpu); + ops->set_cached_descriptor(&seg_desc, seg, ctxt->vcpu); + return X86EMUL_CONTINUE; +exception: + emulate_exception(ctxt, err_vec, err_code, true); + return X86EMUL_PROPAGATE_FAULT; +} + +static inline int writeback(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) +{ + int rc; + struct decode_cache *c = &ctxt->decode; + u32 err; + + switch (c->dst.type) { + case OP_REG: + /* The 4-byte case *is* correct: + * in 64-bit mode we zero-extend. 
+ */ + switch (c->dst.bytes) { case 1: - c->src.val = insn_fetch(s8, 1, c->eip); + *(u8 *)c->dst.ptr = (u8)c->dst.val; break; case 2: - c->src.val = insn_fetch(s16, 2, c->eip); + *(u16 *)c->dst.ptr = (u16)c->dst.val; break; case 4: - c->src.val = insn_fetch(s32, 4, c->eip); + *c->dst.ptr = (u32)c->dst.val; + break; /* 64b: zero-ext */ + case 8: + *c->dst.ptr = c->dst.val; break; } - if ((c->d & SrcMask) == SrcImmU) { - switch (c->src.bytes) { - case 1: - c->src.val &= 0xff; - break; - case 2: - c->src.val &= 0xffff; - break; - case 4: - c->src.val &= 0xffffffff; - break; - } - } break; - case SrcImmByte: - case SrcImmUByte: - c->src.type = OP_IMM; - c->src.ptr = (unsigned long *)c->eip; - c->src.bytes = 1; - if ((c->d & SrcMask) == SrcImmByte) - c->src.val = insn_fetch(s8, 1, c->eip); + case OP_MEM: + if (c->lock_prefix) + rc = ops->cmpxchg_emulated( + (unsigned long)c->dst.ptr, + &c->dst.orig_val, + &c->dst.val, + c->dst.bytes, + &err, + ctxt->vcpu); else - c->src.val = insn_fetch(u8, 1, c->eip); - break; - case SrcAcc: - c->src.type = OP_REG; - c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->src.ptr = &c->regs[VCPU_REGS_RAX]; - switch (c->src.bytes) { - case 1: - c->src.val = *(u8 *)c->src.ptr; - break; - case 2: - c->src.val = *(u16 *)c->src.ptr; - break; - case 4: - c->src.val = *(u32 *)c->src.ptr; - break; - case 8: - c->src.val = *(u64 *)c->src.ptr; - break; - } - break; - case SrcOne: - c->src.bytes = 1; - c->src.val = 1; - break; - case SrcSI: - c->src.type = OP_MEM; - c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->src.ptr = (unsigned long *) - register_address(c, seg_override_base(ctxt, ops, c), - c->regs[VCPU_REGS_RSI]); - c->src.val = 0; + rc = ops->write_emulated( + (unsigned long)c->dst.ptr, + &c->dst.val, + c->dst.bytes, + &err, + ctxt->vcpu); + if (rc == X86EMUL_PROPAGATE_FAULT) + emulate_pf(ctxt, + (unsigned long)c->dst.ptr, err); + if (rc != X86EMUL_CONTINUE) + return rc; break; - case SrcImmFAddr: - c->src.type = OP_IMM; - c->src.ptr = (unsigned long *)c->eip; - c->src.bytes = c->op_bytes + 2; - insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip); + case OP_NONE: + /* no writeback */ break; - case SrcMemFAddr: - c->src.type = OP_MEM; - c->src.ptr = (unsigned long *)c->modrm_ea; - c->src.bytes = c->op_bytes + 2; + default: break; } + return X86EMUL_CONTINUE; +} - /* - * Decode and fetch the second source operand: register, memory - * or immediate. - */ - switch (c->d & Src2Mask) { - case Src2None: - break; - case Src2CL: - c->src2.bytes = 1; - c->src2.val = c->regs[VCPU_REGS_RCX] & 0x8; - break; - case Src2ImmByte: - c->src2.type = OP_IMM; - c->src2.ptr = (unsigned long *)c->eip; - c->src2.bytes = 1; - c->src2.val = insn_fetch(u8, 1, c->eip); - break; - case Src2One: - c->src2.bytes = 1; - c->src2.val = 1; - break; - } +static inline void emulate_push(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) +{ + struct decode_cache *c = &ctxt->decode; - /* Decode and fetch the destination operand: register or memory. */ - switch (c->d & DstMask) { - case ImplicitOps: - /* Special instructions do their own operand decoding. */ - return 0; - case DstReg: - decode_register_operand(&c->dst, c, - c->twobyte && (c->b == 0xb6 || c->b == 0xb7)); - break; - case DstMem: - case DstMem64: - if ((c->d & ModRM) && c->modrm_mod == 3) { - c->dst.bytes = (c->d & ByteOp) ? 
1 : c->op_bytes; - c->dst.type = OP_REG; - c->dst.val = c->dst.orig_val = c->modrm_val; - c->dst.ptr = c->modrm_ptr; - break; - } - c->dst.type = OP_MEM; - c->dst.ptr = (unsigned long *)c->modrm_ea; - if ((c->d & DstMask) == DstMem64) - c->dst.bytes = 8; - else - c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->dst.val = 0; - if (c->d & BitOp) { - unsigned long mask = ~(c->dst.bytes * 8 - 1); + c->dst.type = OP_MEM; + c->dst.bytes = c->op_bytes; + c->dst.val = c->src.val; + register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes); + c->dst.ptr = (void *) register_address(c, ss_base(ctxt, ops), + c->regs[VCPU_REGS_RSP]); +} - c->dst.ptr = (void *)c->dst.ptr + - (c->src.val & mask) / 8; - } - break; - case DstAcc: - c->dst.type = OP_REG; - c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->dst.ptr = &c->regs[VCPU_REGS_RAX]; - switch (c->dst.bytes) { - case 1: - c->dst.val = *(u8 *)c->dst.ptr; - break; - case 2: - c->dst.val = *(u16 *)c->dst.ptr; - break; - case 4: - c->dst.val = *(u32 *)c->dst.ptr; - break; - case 8: - c->dst.val = *(u64 *)c->dst.ptr; - break; - } - c->dst.orig_val = c->dst.val; - break; - case DstDI: - c->dst.type = OP_MEM; - c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->dst.ptr = (unsigned long *) - register_address(c, es_base(ctxt, ops), - c->regs[VCPU_REGS_RDI]); - c->dst.val = 0; - break; - } +static int emulate_pop(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + void *dest, int len) +{ + struct decode_cache *c = &ctxt->decode; + int rc; -done: - return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; + rc = read_emulated(ctxt, ops, register_address(c, ss_base(ctxt, ops), + c->regs[VCPU_REGS_RSP]), + dest, len); + if (rc != X86EMUL_CONTINUE) + return rc; + + register_address_increment(c, &c->regs[VCPU_REGS_RSP], len); + return rc; } -static int read_emulated(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, - unsigned long addr, void *dest, unsigned size) +static int emulate_popf(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + void *dest, int len) { int rc; - struct read_cache *mc = &ctxt->decode.mem_read; - u32 err; + unsigned long val, change_mask; + int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; + int cpl = ops->cpl(ctxt->vcpu); - while (size) { - int n = min(size, 8u); - size -= n; - if (mc->pos < mc->end) - goto read_cached; + rc = emulate_pop(ctxt, ops, &val, len); + if (rc != X86EMUL_CONTINUE) + return rc; - rc = ops->read_emulated(addr, mc->data + mc->end, n, &err, - ctxt->vcpu); - if (rc == X86EMUL_PROPAGATE_FAULT) - emulate_pf(ctxt, addr, err); - if (rc != X86EMUL_CONTINUE) - return rc; - mc->end += n; + change_mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_OF + | EFLG_TF | EFLG_DF | EFLG_NT | EFLG_RF | EFLG_AC | EFLG_ID; - read_cached: - memcpy(dest, mc->data + mc->pos, n); - mc->pos += n; - dest += n; - addr += n; + switch(ctxt->mode) { + case X86EMUL_MODE_PROT64: + case X86EMUL_MODE_PROT32: + case X86EMUL_MODE_PROT16: + if (cpl == 0) + change_mask |= EFLG_IOPL; + if (cpl <= iopl) + change_mask |= EFLG_IF; + break; + case X86EMUL_MODE_VM86: + if (iopl < 3) { + emulate_gp(ctxt, 0); + return X86EMUL_PROPAGATE_FAULT; + } + change_mask |= EFLG_IF; + break; + default: /* real mode */ + change_mask |= (EFLG_IOPL | EFLG_IF); + break; } - return X86EMUL_CONTINUE; + + *(unsigned long *)dest = + (ctxt->eflags & ~change_mask) | (val & change_mask); + + return rc; } -static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, - unsigned int size, 
unsigned short port, - void *dest) +static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, int seg) { - struct read_cache *rc = &ctxt->decode.io_read; + struct decode_cache *c = &ctxt->decode; - if (rc->pos == rc->end) { /* refill pio read ahead */ - struct decode_cache *c = &ctxt->decode; - unsigned int in_page, n; - unsigned int count = c->rep_prefix ? - address_mask(c, c->regs[VCPU_REGS_RCX]) : 1; - in_page = (ctxt->eflags & EFLG_DF) ? - offset_in_page(c->regs[VCPU_REGS_RDI]) : - PAGE_SIZE - offset_in_page(c->regs[VCPU_REGS_RDI]); - n = min(min(in_page, (unsigned int)sizeof(rc->data)) / size, - count); - if (n == 0) - n = 1; - rc->pos = rc->end = 0; - if (!ops->pio_in_emulated(size, port, rc->data, n, ctxt->vcpu)) - return 0; - rc->end = n * size; - } + c->src.val = ops->get_segment_selector(seg, ctxt->vcpu); - memcpy(dest, rc->data + rc->pos, size); - rc->pos += size; - return 1; + emulate_push(ctxt, ops); } -static u32 desc_limit_scaled(struct desc_struct *desc) +static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, int seg) { - u32 limit = get_desc_limit(desc); + struct decode_cache *c = &ctxt->decode; + unsigned long selector; + int rc; - return desc->g ? (limit << 12) | 0xfff : limit; + rc = emulate_pop(ctxt, ops, &selector, c->op_bytes); + if (rc != X86EMUL_CONTINUE) + return rc; + + rc = load_segment_descriptor(ctxt, ops, (u16)selector, seg); + return rc; } -static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, - u16 selector, struct desc_ptr *dt) +static int emulate_pusha(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) { - if (selector & 1 << 2) { - struct desc_struct desc; - memset (dt, 0, sizeof *dt); - if (!ops->get_cached_descriptor(&desc, VCPU_SREG_LDTR, ctxt->vcpu)) - return; + struct decode_cache *c = &ctxt->decode; + unsigned long old_esp = c->regs[VCPU_REGS_RSP]; + int rc = X86EMUL_CONTINUE; + int reg = VCPU_REGS_RAX; - dt->size = desc_limit_scaled(&desc); /* what if limit > 65535? */ - dt->address = get_desc_base(&desc); - } else - ops->get_gdt(dt, ctxt->vcpu); -} + while (reg <= VCPU_REGS_RDI) { + (reg == VCPU_REGS_RSP) ? + (c->src.val = old_esp) : (c->src.val = c->regs[reg]); -/* allowed just for 8 bytes segments */ -static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, - u16 selector, struct desc_struct *desc) -{ - struct desc_ptr dt; - u16 index = selector >> 3; - int ret; - u32 err; - ulong addr; + emulate_push(ctxt, ops); - get_descriptor_table_ptr(ctxt, ops, selector, &dt); + rc = writeback(ctxt, ops); + if (rc != X86EMUL_CONTINUE) + return rc; - if (dt.size < index * 8 + 7) { - emulate_gp(ctxt, selector & 0xfffc); - return X86EMUL_PROPAGATE_FAULT; + ++reg; } - addr = dt.address + index * 8; - ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu, &err); - if (ret == X86EMUL_PROPAGATE_FAULT) - emulate_pf(ctxt, addr, err); - return ret; + /* Disable writeback. 
*/ + c->dst.type = OP_NONE; + + return rc; } -/* allowed just for 8 bytes segments */ -static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, - u16 selector, struct desc_struct *desc) +static int emulate_popa(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) { - struct desc_ptr dt; - u16 index = selector >> 3; - u32 err; - ulong addr; - int ret; + struct decode_cache *c = &ctxt->decode; + int rc = X86EMUL_CONTINUE; + int reg = VCPU_REGS_RDI; - get_descriptor_table_ptr(ctxt, ops, selector, &dt); + while (reg >= VCPU_REGS_RAX) { + if (reg == VCPU_REGS_RSP) { + register_address_increment(c, &c->regs[VCPU_REGS_RSP], + c->op_bytes); + --reg; + } - if (dt.size < index * 8 + 7) { - emulate_gp(ctxt, selector & 0xfffc); - return X86EMUL_PROPAGATE_FAULT; + rc = emulate_pop(ctxt, ops, &c->regs[reg], c->op_bytes); + if (rc != X86EMUL_CONTINUE) + break; + --reg; } - - addr = dt.address + index * 8; - ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, &err); - if (ret == X86EMUL_PROPAGATE_FAULT) - emulate_pf(ctxt, addr, err); - - return ret; + return rc; } -static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, - u16 selector, int seg) +static int emulate_iret_real(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) { - struct desc_struct seg_desc; - u8 dpl, rpl, cpl; - unsigned err_vec = GP_VECTOR; - u32 err_code = 0; - bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */ - int ret; + struct decode_cache *c = &ctxt->decode; + int rc = X86EMUL_CONTINUE; + unsigned long temp_eip = 0; + unsigned long temp_eflags = 0; + unsigned long cs = 0; + unsigned long mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_TF | + EFLG_IF | EFLG_DF | EFLG_OF | EFLG_IOPL | EFLG_NT | EFLG_RF | + EFLG_AC | EFLG_ID | (1 << 1); /* Last one is the reserved bit */ + unsigned long vm86_mask = EFLG_VM | EFLG_VIF | EFLG_VIP; - memset(&seg_desc, 0, sizeof seg_desc); + /* TODO: Add stack limit check */ - if ((seg <= VCPU_SREG_GS && ctxt->mode == X86EMUL_MODE_VM86) - || ctxt->mode == X86EMUL_MODE_REAL) { - /* set real mode segment descriptor */ - set_desc_base(&seg_desc, selector << 4); - set_desc_limit(&seg_desc, 0xffff); - seg_desc.type = 3; - seg_desc.p = 1; - seg_desc.s = 1; - goto load; - } + rc = emulate_pop(ctxt, ops, &temp_eip, c->op_bytes); - /* NULL selector is not valid for TR, CS and SS */ - if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || seg == VCPU_SREG_TR) - && null_selector) - goto exception; + if (rc != X86EMUL_CONTINUE) + return rc; - /* TR should be in GDT only */ - if (seg == VCPU_SREG_TR && (selector & (1 << 2))) - goto exception; + if (temp_eip & ~0xffff) { + emulate_gp(ctxt, 0); + return X86EMUL_PROPAGATE_FAULT; + } - if (null_selector) /* for NULL selector skip all following checks */ - goto load; + rc = emulate_pop(ctxt, ops, &cs, c->op_bytes); - ret = read_segment_descriptor(ctxt, ops, selector, &seg_desc); - if (ret != X86EMUL_CONTINUE) - return ret; + if (rc != X86EMUL_CONTINUE) + return rc; - err_code = selector & 0xfffc; - err_vec = GP_VECTOR; + rc = emulate_pop(ctxt, ops, &temp_eflags, c->op_bytes); - /* can't load system descriptor into segment selecor */ - if (seg <= VCPU_SREG_GS && !seg_desc.s) - goto exception; + if (rc != X86EMUL_CONTINUE) + return rc; - if (!seg_desc.p) { - err_vec = (seg == VCPU_SREG_SS) ? 
SS_VECTOR : NP_VECTOR; - goto exception; - } + rc = load_segment_descriptor(ctxt, ops, (u16)cs, VCPU_SREG_CS); - rpl = selector & 3; - dpl = seg_desc.dpl; - cpl = ops->cpl(ctxt->vcpu); + if (rc != X86EMUL_CONTINUE) + return rc; - switch (seg) { - case VCPU_SREG_SS: - /* - * segment is not a writable data segment or segment - * selector's RPL != CPL or segment selector's RPL != CPL - */ - if (rpl != cpl || (seg_desc.type & 0xa) != 0x2 || dpl != cpl) - goto exception; - break; - case VCPU_SREG_CS: - if (!(seg_desc.type & 8)) - goto exception; + c->eip = temp_eip; - if (seg_desc.type & 4) { - /* conforming */ - if (dpl > cpl) - goto exception; - } else { - /* nonconforming */ - if (rpl > cpl || dpl != cpl) - goto exception; - } - /* CS(RPL) <- CPL */ - selector = (selector & 0xfffc) | cpl; - break; - case VCPU_SREG_TR: - if (seg_desc.s || (seg_desc.type != 1 && seg_desc.type != 9)) - goto exception; - break; - case VCPU_SREG_LDTR: - if (seg_desc.s || seg_desc.type != 2) - goto exception; - break; - default: /* DS, ES, FS, or GS */ - /* - * segment is not a data or readable code segment or - * ((segment is a data or nonconforming code segment) - * and (both RPL and CPL > DPL)) - */ - if ((seg_desc.type & 0xa) == 0x8 || - (((seg_desc.type & 0xc) != 0xc) && - (rpl > dpl && cpl > dpl))) - goto exception; - break; - } - if (seg_desc.s) { - /* mark segment as accessed */ - seg_desc.type |= 1; - ret = write_segment_descriptor(ctxt, ops, selector, &seg_desc); - if (ret != X86EMUL_CONTINUE) - return ret; + if (c->op_bytes == 4) + ctxt->eflags = ((temp_eflags & mask) | (ctxt->eflags & vm86_mask)); + else if (c->op_bytes == 2) { + ctxt->eflags &= ~0xffff; + ctxt->eflags |= temp_eflags; } -load: - ops->set_segment_selector(selector, seg, ctxt->vcpu); - ops->set_cached_descriptor(&seg_desc, seg, ctxt->vcpu); - return X86EMUL_CONTINUE; -exception: - emulate_exception(ctxt, err_vec, err_code, true); - return X86EMUL_PROPAGATE_FAULT; + + ctxt->eflags &= ~EFLG_RESERVED_ZEROS_MASK; /* Clear reserved zeros */ + ctxt->eflags |= EFLG_RESERVED_ONE_MASK; + + return rc; } -static inline int writeback(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) +static inline int emulate_iret(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops* ops) { - int rc; - struct decode_cache *c = &ctxt->decode; - u32 err; - - switch (c->dst.type) { - case OP_REG: - /* The 4-byte case *is* correct: - * in 64-bit mode we zero-extend. 
- */ - switch (c->dst.bytes) { - case 1: - *(u8 *)c->dst.ptr = (u8)c->dst.val; - break; - case 2: - *(u16 *)c->dst.ptr = (u16)c->dst.val; - break; - case 4: - *c->dst.ptr = (u32)c->dst.val; - break; /* 64b: zero-ext */ - case 8: - *c->dst.ptr = c->dst.val; - break; - } - break; - case OP_MEM: - if (c->lock_prefix) - rc = ops->cmpxchg_emulated( - (unsigned long)c->dst.ptr, - &c->dst.orig_val, - &c->dst.val, - c->dst.bytes, - &err, - ctxt->vcpu); - else - rc = ops->write_emulated( - (unsigned long)c->dst.ptr, - &c->dst.val, - c->dst.bytes, - &err, - ctxt->vcpu); - if (rc == X86EMUL_PROPAGATE_FAULT) - emulate_pf(ctxt, - (unsigned long)c->dst.ptr, err); - if (rc != X86EMUL_CONTINUE) - return rc; - break; - case OP_NONE: - /* no writeback */ - break; + switch(ctxt->mode) { + case X86EMUL_MODE_REAL: + return emulate_iret_real(ctxt, ops); + case X86EMUL_MODE_VM86: + case X86EMUL_MODE_PROT16: + case X86EMUL_MODE_PROT32: + case X86EMUL_MODE_PROT64: default: - break; + /* iret from protected mode unimplemented yet */ + return X86EMUL_UNHANDLEABLE; } - return X86EMUL_CONTINUE; } -static inline void emulate_push(struct x86_emulate_ctxt *ctxt, +static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { struct decode_cache *c = &ctxt->decode; - c->dst.type = OP_MEM; - c->dst.bytes = c->op_bytes; - c->dst.val = c->src.val; - register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes); - c->dst.ptr = (void *) register_address(c, ss_base(ctxt, ops), - c->regs[VCPU_REGS_RSP]); + return emulate_pop(ctxt, ops, &c->dst.val, c->dst.bytes); } -static int emulate_pop(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, - void *dest, int len) -{ - struct decode_cache *c = &ctxt->decode; - int rc; - - rc = read_emulated(ctxt, ops, register_address(c, ss_base(ctxt, ops), - c->regs[VCPU_REGS_RSP]), - dest, len); - if (rc != X86EMUL_CONTINUE) - return rc; - - register_address_increment(c, &c->regs[VCPU_REGS_RSP], len); - return rc; -} - -static int emulate_popf(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, - void *dest, int len) -{ - int rc; - unsigned long val, change_mask; - int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; - int cpl = ops->cpl(ctxt->vcpu); - - rc = emulate_pop(ctxt, ops, &val, len); - if (rc != X86EMUL_CONTINUE) - return rc; - - change_mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_OF - | EFLG_TF | EFLG_DF | EFLG_NT | EFLG_RF | EFLG_AC | EFLG_ID; - - switch(ctxt->mode) { - case X86EMUL_MODE_PROT64: - case X86EMUL_MODE_PROT32: - case X86EMUL_MODE_PROT16: - if (cpl == 0) - change_mask |= EFLG_IOPL; - if (cpl <= iopl) - change_mask |= EFLG_IF; - break; - case X86EMUL_MODE_VM86: - if (iopl < 3) { - emulate_gp(ctxt, 0); - return X86EMUL_PROPAGATE_FAULT; - } - change_mask |= EFLG_IF; - break; - default: /* real mode */ - change_mask |= (EFLG_IOPL | EFLG_IF); - break; - } - - *(unsigned long *)dest = - (ctxt->eflags & ~change_mask) | (val & change_mask); - - return rc; -} - -static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, int seg) -{ - struct decode_cache *c = &ctxt->decode; - - c->src.val = ops->get_segment_selector(seg, ctxt->vcpu); - - emulate_push(ctxt, ops); -} - -static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, int seg) -{ - struct decode_cache *c = &ctxt->decode; - unsigned long selector; - int rc; - - rc = emulate_pop(ctxt, ops, &selector, c->op_bytes); - if (rc != X86EMUL_CONTINUE) - return rc; - - rc = 
load_segment_descriptor(ctxt, ops, (u16)selector, seg); - return rc; -} - -static int emulate_pusha(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) -{ - struct decode_cache *c = &ctxt->decode; - unsigned long old_esp = c->regs[VCPU_REGS_RSP]; - int rc = X86EMUL_CONTINUE; - int reg = VCPU_REGS_RAX; - - while (reg <= VCPU_REGS_RDI) { - (reg == VCPU_REGS_RSP) ? - (c->src.val = old_esp) : (c->src.val = c->regs[reg]); - - emulate_push(ctxt, ops); - - rc = writeback(ctxt, ops); - if (rc != X86EMUL_CONTINUE) - return rc; - - ++reg; - } - - /* Disable writeback. */ - c->dst.type = OP_NONE; - - return rc; -} - -static int emulate_popa(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) -{ - struct decode_cache *c = &ctxt->decode; - int rc = X86EMUL_CONTINUE; - int reg = VCPU_REGS_RDI; - - while (reg >= VCPU_REGS_RAX) { - if (reg == VCPU_REGS_RSP) { - register_address_increment(c, &c->regs[VCPU_REGS_RSP], - c->op_bytes); - --reg; - } - - rc = emulate_pop(ctxt, ops, &c->regs[reg], c->op_bytes); - if (rc != X86EMUL_CONTINUE) - break; - --reg; - } - return rc; -} - -static int emulate_iret_real(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) -{ - struct decode_cache *c = &ctxt->decode; - int rc = X86EMUL_CONTINUE; - unsigned long temp_eip = 0; - unsigned long temp_eflags = 0; - unsigned long cs = 0; - unsigned long mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_TF | - EFLG_IF | EFLG_DF | EFLG_OF | EFLG_IOPL | EFLG_NT | EFLG_RF | - EFLG_AC | EFLG_ID | (1 << 1); /* Last one is the reserved bit */ - unsigned long vm86_mask = EFLG_VM | EFLG_VIF | EFLG_VIP; - - /* TODO: Add stack limit check */ - - rc = emulate_pop(ctxt, ops, &temp_eip, c->op_bytes); - - if (rc != X86EMUL_CONTINUE) - return rc; - - if (temp_eip & ~0xffff) { - emulate_gp(ctxt, 0); - return X86EMUL_PROPAGATE_FAULT; - } - - rc = emulate_pop(ctxt, ops, &cs, c->op_bytes); - - if (rc != X86EMUL_CONTINUE) - return rc; - - rc = emulate_pop(ctxt, ops, &temp_eflags, c->op_bytes); - - if (rc != X86EMUL_CONTINUE) - return rc; - - rc = load_segment_descriptor(ctxt, ops, (u16)cs, VCPU_SREG_CS); - - if (rc != X86EMUL_CONTINUE) - return rc; - - c->eip = temp_eip; - - - if (c->op_bytes == 4) - ctxt->eflags = ((temp_eflags & mask) | (ctxt->eflags & vm86_mask)); - else if (c->op_bytes == 2) { - ctxt->eflags &= ~0xffff; - ctxt->eflags |= temp_eflags; - } - - ctxt->eflags &= ~EFLG_RESERVED_ZEROS_MASK; /* Clear reserved zeros */ - ctxt->eflags |= EFLG_RESERVED_ONE_MASK; - - return rc; -} - -static inline int emulate_iret(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops* ops) -{ - switch(ctxt->mode) { - case X86EMUL_MODE_REAL: - return emulate_iret_real(ctxt, ops); - case X86EMUL_MODE_VM86: - case X86EMUL_MODE_PROT16: - case X86EMUL_MODE_PROT32: - case X86EMUL_MODE_PROT64: - default: - /* iret from protected mode unimplemented yet */ - return X86EMUL_UNHANDLEABLE; - } -} - -static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) -{ - struct decode_cache *c = &ctxt->decode; - - return emulate_pop(ctxt, ops, &c->dst.val, c->dst.bytes); -} - -static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt) +static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt) { struct decode_cache *c = &ctxt->decode; switch (c->modrm_reg) { @@ -2624,6 +2252,378 @@ static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned long base, op->ptr = (unsigned long *)register_address(c, base, c->regs[reg]); } +int +x86_decode_insn(struct x86_emulate_ctxt *ctxt) +{ + struct 
x86_emulate_ops *ops = ctxt->ops; + struct decode_cache *c = &ctxt->decode; + int rc = X86EMUL_CONTINUE; + int mode = ctxt->mode; + int def_op_bytes, def_ad_bytes, dual, goffset; + struct opcode opcode, *g_mod012, *g_mod3; + + /* we cannot decode insn before we complete previous rep insn */ + WARN_ON(ctxt->restart); + + c->eip = ctxt->eip; + c->fetch.start = c->fetch.end = c->eip; + ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS); + + switch (mode) { + case X86EMUL_MODE_REAL: + case X86EMUL_MODE_VM86: + case X86EMUL_MODE_PROT16: + def_op_bytes = def_ad_bytes = 2; + break; + case X86EMUL_MODE_PROT32: + def_op_bytes = def_ad_bytes = 4; + break; +#ifdef CONFIG_X86_64 + case X86EMUL_MODE_PROT64: + def_op_bytes = 4; + def_ad_bytes = 8; + break; +#endif + default: + return -1; + } + + c->op_bytes = def_op_bytes; + c->ad_bytes = def_ad_bytes; + + /* Legacy prefixes. */ + for (;;) { + switch (c->b = insn_fetch(u8, 1, c->eip)) { + case 0x66: /* operand-size override */ + /* switch between 2/4 bytes */ + c->op_bytes = def_op_bytes ^ 6; + break; + case 0x67: /* address-size override */ + if (mode == X86EMUL_MODE_PROT64) + /* switch between 4/8 bytes */ + c->ad_bytes = def_ad_bytes ^ 12; + else + /* switch between 2/4 bytes */ + c->ad_bytes = def_ad_bytes ^ 6; + break; + case 0x26: /* ES override */ + case 0x2e: /* CS override */ + case 0x36: /* SS override */ + case 0x3e: /* DS override */ + set_seg_override(c, (c->b >> 3) & 3); + break; + case 0x64: /* FS override */ + case 0x65: /* GS override */ + set_seg_override(c, c->b & 7); + break; + case 0x40 ... 0x4f: /* REX */ + if (mode != X86EMUL_MODE_PROT64) + goto done_prefixes; + c->rex_prefix = c->b; + continue; + case 0xf0: /* LOCK */ + c->lock_prefix = 1; + break; + case 0xf2: /* REPNE/REPNZ */ + c->rep_prefix = REPNE_PREFIX; + break; + case 0xf3: /* REP/REPE/REPZ */ + c->rep_prefix = REPE_PREFIX; + break; + default: + goto done_prefixes; + } + + /* Any legacy prefix after a REX prefix nullifies its effect. */ + + c->rex_prefix = 0; + } + +done_prefixes: + + /* REX prefix. */ + if (c->rex_prefix) + if (c->rex_prefix & 8) + c->op_bytes = 8; /* REX.W */ + + /* Opcode byte(s). */ + opcode = opcode_table[c->b]; + if (opcode.flags == 0) { + /* Two-byte opcode? */ + if (c->b == 0x0f) { + c->twobyte = 1; + c->b = insn_fetch(u8, 1, c->eip); + opcode = twobyte_table[c->b]; + } + } + c->d = opcode.flags; + + if (c->d & Group) { + dual = c->d & GroupDual; + c->modrm = insn_fetch(u8, 1, c->eip); + --c->eip; + + if (c->d & GroupDual) { + g_mod012 = opcode.u.gdual->mod012; + g_mod3 = opcode.u.gdual->mod3; + } else + g_mod012 = g_mod3 = opcode.u.group; + + c->d &= ~(Group | GroupDual); + + goffset = (c->modrm >> 3) & 7; + + if ((c->modrm >> 6) == 3) + opcode = g_mod3[goffset]; + else + opcode = g_mod012[goffset]; + c->d |= opcode.flags; + } + + c->execute = opcode.u.execute; + + /* Unrecognised? */ + if (c->d == 0 || (c->d & Undefined)) { + DPRINTF("Cannot emulate %02x\n", c->b); + return -1; + } + + if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack)) + c->op_bytes = 8; + + /* ModRM and SIB bytes. 
*/ + if (c->d & ModRM) + rc = decode_modrm(ctxt, ops); + else if (c->d & MemAbs) + rc = decode_abs(ctxt, ops); + if (rc != X86EMUL_CONTINUE) + goto done; + + if (!c->has_seg_override) + set_seg_override(c, VCPU_SREG_DS); + + if (!(!c->twobyte && c->b == 0x8d)) + c->modrm_ea += seg_override_base(ctxt, ops, c); + + if (c->ad_bytes != 8) + c->modrm_ea = (u32)c->modrm_ea; + + if (c->rip_relative) + c->modrm_ea += c->eip; + + /* + * Decode and fetch the source operand: register, memory + * or immediate. + */ + switch (c->d & SrcMask) { + case SrcNone: + break; + case SrcReg: + decode_register_operand(&c->src, c, 0); + break; + case SrcMem16: + c->src.bytes = 2; + goto srcmem_common; + case SrcMem32: + c->src.bytes = 4; + goto srcmem_common; + case SrcMem: + c->src.bytes = (c->d & ByteOp) ? 1 : + c->op_bytes; + /* Don't fetch the address for invlpg: it could be unmapped. */ + if (c->twobyte && c->b == 0x01 && c->modrm_reg == 7) + break; + srcmem_common: + /* + * For instructions with a ModR/M byte, switch to register + * access if Mod = 3. + */ + if ((c->d & ModRM) && c->modrm_mod == 3) { + c->src.type = OP_REG; + c->src.val = c->modrm_val; + c->src.ptr = c->modrm_ptr; + break; + } + c->src.type = OP_MEM; + c->src.ptr = (unsigned long *)c->modrm_ea; + c->src.val = 0; + break; + case SrcImm: + case SrcImmU: + c->src.type = OP_IMM; + c->src.ptr = (unsigned long *)c->eip; + c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + if (c->src.bytes == 8) + c->src.bytes = 4; + /* NB. Immediates are sign-extended as necessary. */ + switch (c->src.bytes) { + case 1: + c->src.val = insn_fetch(s8, 1, c->eip); + break; + case 2: + c->src.val = insn_fetch(s16, 2, c->eip); + break; + case 4: + c->src.val = insn_fetch(s32, 4, c->eip); + break; + } + if ((c->d & SrcMask) == SrcImmU) { + switch (c->src.bytes) { + case 1: + c->src.val &= 0xff; + break; + case 2: + c->src.val &= 0xffff; + break; + case 4: + c->src.val &= 0xffffffff; + break; + } + } + break; + case SrcImmByte: + case SrcImmUByte: + c->src.type = OP_IMM; + c->src.ptr = (unsigned long *)c->eip; + c->src.bytes = 1; + if ((c->d & SrcMask) == SrcImmByte) + c->src.val = insn_fetch(s8, 1, c->eip); + else + c->src.val = insn_fetch(u8, 1, c->eip); + break; + case SrcAcc: + c->src.type = OP_REG; + c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + c->src.ptr = &c->regs[VCPU_REGS_RAX]; + switch (c->src.bytes) { + case 1: + c->src.val = *(u8 *)c->src.ptr; + break; + case 2: + c->src.val = *(u16 *)c->src.ptr; + break; + case 4: + c->src.val = *(u32 *)c->src.ptr; + break; + case 8: + c->src.val = *(u64 *)c->src.ptr; + break; + } + break; + case SrcOne: + c->src.bytes = 1; + c->src.val = 1; + break; + case SrcSI: + c->src.type = OP_MEM; + c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + c->src.ptr = (unsigned long *) + register_address(c, seg_override_base(ctxt, ops, c), + c->regs[VCPU_REGS_RSI]); + c->src.val = 0; + break; + case SrcImmFAddr: + c->src.type = OP_IMM; + c->src.ptr = (unsigned long *)c->eip; + c->src.bytes = c->op_bytes + 2; + insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip); + break; + case SrcMemFAddr: + c->src.type = OP_MEM; + c->src.ptr = (unsigned long *)c->modrm_ea; + c->src.bytes = c->op_bytes + 2; + break; + } + + /* + * Decode and fetch the second source operand: register, memory + * or immediate. 
+ */ + switch (c->d & Src2Mask) { + case Src2None: + break; + case Src2CL: + c->src2.bytes = 1; + c->src2.val = c->regs[VCPU_REGS_RCX] & 0x8; + break; + case Src2ImmByte: + c->src2.type = OP_IMM; + c->src2.ptr = (unsigned long *)c->eip; + c->src2.bytes = 1; + c->src2.val = insn_fetch(u8, 1, c->eip); + break; + case Src2One: + c->src2.bytes = 1; + c->src2.val = 1; + break; + } + + /* Decode and fetch the destination operand: register or memory. */ + switch (c->d & DstMask) { + case ImplicitOps: + /* Special instructions do their own operand decoding. */ + return 0; + case DstReg: + decode_register_operand(&c->dst, c, + c->twobyte && (c->b == 0xb6 || c->b == 0xb7)); + break; + case DstMem: + case DstMem64: + if ((c->d & ModRM) && c->modrm_mod == 3) { + c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + c->dst.type = OP_REG; + c->dst.val = c->dst.orig_val = c->modrm_val; + c->dst.ptr = c->modrm_ptr; + break; + } + c->dst.type = OP_MEM; + c->dst.ptr = (unsigned long *)c->modrm_ea; + if ((c->d & DstMask) == DstMem64) + c->dst.bytes = 8; + else + c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + c->dst.val = 0; + if (c->d & BitOp) { + unsigned long mask = ~(c->dst.bytes * 8 - 1); + + c->dst.ptr = (void *)c->dst.ptr + + (c->src.val & mask) / 8; + } + break; + case DstAcc: + c->dst.type = OP_REG; + c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + c->dst.ptr = &c->regs[VCPU_REGS_RAX]; + switch (c->dst.bytes) { + case 1: + c->dst.val = *(u8 *)c->dst.ptr; + break; + case 2: + c->dst.val = *(u16 *)c->dst.ptr; + break; + case 4: + c->dst.val = *(u32 *)c->dst.ptr; + break; + case 8: + c->dst.val = *(u64 *)c->dst.ptr; + break; + } + c->dst.orig_val = c->dst.val; + break; + case DstDI: + c->dst.type = OP_MEM; + c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + c->dst.ptr = (unsigned long *) + register_address(c, es_base(ctxt, ops), + c->regs[VCPU_REGS_RDI]); + c->dst.val = 0; + break; + } + +done: + return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; +} + int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) { -- cgit v1.2.3-18-g5258 From 73fba5f4fe3e08bd7acb18a65b53643445c8f028 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 29 Jul 2010 15:11:53 +0300 Subject: KVM: x86 emulator: move decode tables downwards So they can reference execution functions. 
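The motivation is a plain C constraint: a static initializer can only reference a function that has already been declared, so once the opcode tables start carrying execute callbacks (see the em_push conversion two patches below) they must sit below the execution helpers. A minimal stand-alone illustration, with assumed simplified types rather than the emulator's real ones:

#include <stdio.h>

struct opcode {
	int (*execute)(void);
};

static int em_push(void)		/* execution helper defined first ... */
{
	return 0;
}

static struct opcode opcode_table[] = {
	{ .execute = em_push },		/* ... so the table can reference it */
};

int main(void)
{
	printf("%d\n", opcode_table[0].execute());
	return 0;
}

The alternative would be forward declarations for every helper; moving the tables keeps the file free of that duplication.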
Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 472 ++++++++++++++++++++++++------------------------- 1 file changed, 236 insertions(+), 236 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index c6f43591753..70a7cb49ff8 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -117,242 +117,6 @@ struct group_dual { struct opcode mod3[8]; }; -#define D(_y) { .flags = (_y) } -#define N D(0) -#define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) } -#define GD(_f, _g) { .flags = ((_f) | Group | GroupDual), .u.gdual = (_g) } -#define I(_f, _e) { .flags = (_f), .u.execute = (_e) } - -static struct opcode group1[] = { - X7(D(Lock)), N -}; - -static struct opcode group1A[] = { - D(DstMem | SrcNone | ModRM | Mov | Stack), N, N, N, N, N, N, N, -}; - -static struct opcode group3[] = { - D(DstMem | SrcImm | ModRM), D(DstMem | SrcImm | ModRM), - D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock), - X4(D(Undefined)), -}; - -static struct opcode group4[] = { - D(ByteOp | DstMem | SrcNone | ModRM | Lock), D(ByteOp | DstMem | SrcNone | ModRM | Lock), - N, N, N, N, N, N, -}; - -static struct opcode group5[] = { - D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock), - D(SrcMem | ModRM | Stack), N, - D(SrcMem | ModRM | Stack), D(SrcMemFAddr | ModRM | ImplicitOps), - D(SrcMem | ModRM | Stack), N, -}; - -static struct group_dual group7 = { { - N, N, D(ModRM | SrcMem | Priv), D(ModRM | SrcMem | Priv), - D(SrcNone | ModRM | DstMem | Mov), N, - D(SrcMem16 | ModRM | Mov | Priv), D(SrcMem | ModRM | ByteOp | Priv), -}, { - D(SrcNone | ModRM | Priv), N, N, D(SrcNone | ModRM | Priv), - D(SrcNone | ModRM | DstMem | Mov), N, - D(SrcMem16 | ModRM | Mov | Priv), N, -} }; - -static struct opcode group8[] = { - N, N, N, N, - D(DstMem | SrcImmByte | ModRM), D(DstMem | SrcImmByte | ModRM | Lock), - D(DstMem | SrcImmByte | ModRM | Lock), D(DstMem | SrcImmByte | ModRM | Lock), -}; - -static struct group_dual group9 = { { - N, D(DstMem64 | ModRM | Lock), N, N, N, N, N, N, -}, { - N, N, N, N, N, N, N, N, -} }; - -static struct opcode opcode_table[256] = { - /* 0x00 - 0x07 */ - D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), - D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), - D(ByteOp | DstAcc | SrcImm), D(DstAcc | SrcImm), - D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), - /* 0x08 - 0x0F */ - D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), - D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), - D(ByteOp | DstAcc | SrcImm), D(DstAcc | SrcImm), - D(ImplicitOps | Stack | No64), N, - /* 0x10 - 0x17 */ - D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), - D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), - D(ByteOp | DstAcc | SrcImm), D(DstAcc | SrcImm), - D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), - /* 0x18 - 0x1F */ - D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), - D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), - D(ByteOp | DstAcc | SrcImm), D(DstAcc | SrcImm), - D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), - /* 0x20 - 0x27 */ - D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), - D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), - D(ByteOp | DstAcc | SrcImmByte), D(DstAcc | SrcImm), N, N, - /* 0x28 - 0x2F */ - 
D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), - D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), - D(ByteOp | DstAcc | SrcImmByte), D(DstAcc | SrcImm), N, N, - /* 0x30 - 0x37 */ - D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), - D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), - D(ByteOp | DstAcc | SrcImmByte), D(DstAcc | SrcImm), N, N, - /* 0x38 - 0x3F */ - D(ByteOp | DstMem | SrcReg | ModRM), D(DstMem | SrcReg | ModRM), - D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), - D(ByteOp | DstAcc | SrcImm), D(DstAcc | SrcImm), - N, N, - /* 0x40 - 0x4F */ - X16(D(DstReg)), - /* 0x50 - 0x57 */ - X8(D(SrcReg | Stack)), - /* 0x58 - 0x5F */ - X8(D(DstReg | Stack)), - /* 0x60 - 0x67 */ - D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), - N, D(DstReg | SrcMem32 | ModRM | Mov) /* movsxd (x86/64) */ , - N, N, N, N, - /* 0x68 - 0x6F */ - D(SrcImm | Mov | Stack), N, D(SrcImmByte | Mov | Stack), N, - D(DstDI | ByteOp | Mov | String), D(DstDI | Mov | String), /* insb, insw/insd */ - D(SrcSI | ByteOp | ImplicitOps | String), D(SrcSI | ImplicitOps | String), /* outsb, outsw/outsd */ - /* 0x70 - 0x7F */ - X16(D(SrcImmByte)), - /* 0x80 - 0x87 */ - G(ByteOp | DstMem | SrcImm | ModRM | Group, group1), - G(DstMem | SrcImm | ModRM | Group, group1), - G(ByteOp | DstMem | SrcImm | ModRM | No64 | Group, group1), - G(DstMem | SrcImmByte | ModRM | Group, group1), - D(ByteOp | DstMem | SrcReg | ModRM), D(DstMem | SrcReg | ModRM), - D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), - /* 0x88 - 0x8F */ - D(ByteOp | DstMem | SrcReg | ModRM | Mov), D(DstMem | SrcReg | ModRM | Mov), - D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem | ModRM | Mov), - D(DstMem | SrcNone | ModRM | Mov), D(ModRM | DstReg), - D(ImplicitOps | SrcMem16 | ModRM), G(0, group1A), - /* 0x90 - 0x97 */ - D(DstReg), D(DstReg), D(DstReg), D(DstReg), D(DstReg), D(DstReg), D(DstReg), D(DstReg), - /* 0x98 - 0x9F */ - N, N, D(SrcImmFAddr | No64), N, - D(ImplicitOps | Stack), D(ImplicitOps | Stack), N, N, - /* 0xA0 - 0xA7 */ - D(ByteOp | DstAcc | SrcMem | Mov | MemAbs), D(DstAcc | SrcMem | Mov | MemAbs), - D(ByteOp | DstMem | SrcAcc | Mov | MemAbs), D(DstMem | SrcAcc | Mov | MemAbs), - D(ByteOp | SrcSI | DstDI | Mov | String), D(SrcSI | DstDI | Mov | String), - D(ByteOp | SrcSI | DstDI | String), D(SrcSI | DstDI | String), - /* 0xA8 - 0xAF */ - D(DstAcc | SrcImmByte | ByteOp), D(DstAcc | SrcImm), D(ByteOp | DstDI | Mov | String), D(DstDI | Mov | String), - D(ByteOp | SrcSI | DstAcc | Mov | String), D(SrcSI | DstAcc | Mov | String), - D(ByteOp | DstDI | String), D(DstDI | String), - /* 0xB0 - 0xB7 */ - X8(D(ByteOp | DstReg | SrcImm | Mov)), - /* 0xB8 - 0xBF */ - X8(D(DstReg | SrcImm | Mov)), - /* 0xC0 - 0xC7 */ - D(ByteOp | DstMem | SrcImm | ModRM), D(DstMem | SrcImmByte | ModRM), - N, D(ImplicitOps | Stack), N, N, - D(ByteOp | DstMem | SrcImm | ModRM | Mov), D(DstMem | SrcImm | ModRM | Mov), - /* 0xC8 - 0xCF */ - N, N, N, D(ImplicitOps | Stack), - D(ImplicitOps), D(SrcImmByte), D(ImplicitOps | No64), D(ImplicitOps), - /* 0xD0 - 0xD7 */ - D(ByteOp | DstMem | SrcImplicit | ModRM), D(DstMem | SrcImplicit | ModRM), - D(ByteOp | DstMem | SrcImplicit | ModRM), D(DstMem | SrcImplicit | ModRM), - N, N, N, N, - /* 0xD8 - 0xDF */ - N, N, N, N, N, N, N, N, - /* 0xE0 - 0xE7 */ - N, N, N, N, - D(ByteOp | SrcImmUByte | DstAcc), D(SrcImmUByte | DstAcc), - D(ByteOp | SrcImmUByte | DstAcc), D(SrcImmUByte | 
DstAcc), - /* 0xE8 - 0xEF */ - D(SrcImm | Stack), D(SrcImm | ImplicitOps), - D(SrcImmFAddr | No64), D(SrcImmByte | ImplicitOps), - D(SrcNone | ByteOp | DstAcc), D(SrcNone | DstAcc), - D(SrcNone | ByteOp | DstAcc), D(SrcNone | DstAcc), - /* 0xF0 - 0xF7 */ - N, N, N, N, - D(ImplicitOps | Priv), D(ImplicitOps), G(ByteOp, group3), G(0, group3), - /* 0xF8 - 0xFF */ - D(ImplicitOps), N, D(ImplicitOps), D(ImplicitOps), - D(ImplicitOps), D(ImplicitOps), G(0, group4), G(0, group5), -}; - -static struct opcode twobyte_table[256] = { - /* 0x00 - 0x0F */ - N, GD(0, &group7), N, N, - N, D(ImplicitOps), D(ImplicitOps | Priv), N, - D(ImplicitOps | Priv), D(ImplicitOps | Priv), N, N, - N, D(ImplicitOps | ModRM), N, N, - /* 0x10 - 0x1F */ - N, N, N, N, N, N, N, N, D(ImplicitOps | ModRM), N, N, N, N, N, N, N, - /* 0x20 - 0x2F */ - D(ModRM | ImplicitOps | Priv), D(ModRM | Priv), - D(ModRM | ImplicitOps | Priv), D(ModRM | Priv), - N, N, N, N, - N, N, N, N, N, N, N, N, - /* 0x30 - 0x3F */ - D(ImplicitOps | Priv), N, D(ImplicitOps | Priv), N, - D(ImplicitOps), D(ImplicitOps | Priv), N, N, - N, N, N, N, N, N, N, N, - /* 0x40 - 0x4F */ - X16(D(DstReg | SrcMem | ModRM | Mov)), - /* 0x50 - 0x5F */ - N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, - /* 0x60 - 0x6F */ - N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, - /* 0x70 - 0x7F */ - N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, - /* 0x80 - 0x8F */ - X16(D(SrcImm)), - /* 0x90 - 0x9F */ - N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, - /* 0xA0 - 0xA7 */ - D(ImplicitOps | Stack), D(ImplicitOps | Stack), - N, D(DstMem | SrcReg | ModRM | BitOp), - D(DstMem | SrcReg | Src2ImmByte | ModRM), - D(DstMem | SrcReg | Src2CL | ModRM), N, N, - /* 0xA8 - 0xAF */ - D(ImplicitOps | Stack), D(ImplicitOps | Stack), - N, D(DstMem | SrcReg | ModRM | BitOp | Lock), - D(DstMem | SrcReg | Src2ImmByte | ModRM), - D(DstMem | SrcReg | Src2CL | ModRM), - D(ModRM), N, - /* 0xB0 - 0xB7 */ - D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), - N, D(DstMem | SrcReg | ModRM | BitOp | Lock), - N, N, D(ByteOp | DstReg | SrcMem | ModRM | Mov), - D(DstReg | SrcMem16 | ModRM | Mov), - /* 0xB8 - 0xBF */ - N, N, - G(0, group8), D(DstMem | SrcReg | ModRM | BitOp | Lock), - N, N, D(ByteOp | DstReg | SrcMem | ModRM | Mov), - D(DstReg | SrcMem16 | ModRM | Mov), - /* 0xC0 - 0xCF */ - N, N, N, D(DstMem | SrcReg | ModRM | Mov), - N, N, N, GD(0, &group9), - N, N, N, N, N, N, N, N, - /* 0xD0 - 0xDF */ - N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, - /* 0xE0 - 0xEF */ - N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, - /* 0xF0 - 0xFF */ - N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N -}; - -#undef D -#undef N -#undef G -#undef GD -#undef I - /* EFLAGS bit definitions. 
*/ #define EFLG_ID (1<<21) #define EFLG_VIP (1<<20) @@ -2252,6 +2016,242 @@ static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned long base, op->ptr = (unsigned long *)register_address(c, base, c->regs[reg]); } +#define D(_y) { .flags = (_y) } +#define N D(0) +#define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) } +#define GD(_f, _g) { .flags = ((_f) | Group | GroupDual), .u.gdual = (_g) } +#define I(_f, _e) { .flags = (_f), .u.execute = (_e) } + +static struct opcode group1[] = { + X7(D(Lock)), N +}; + +static struct opcode group1A[] = { + D(DstMem | SrcNone | ModRM | Mov | Stack), N, N, N, N, N, N, N, +}; + +static struct opcode group3[] = { + D(DstMem | SrcImm | ModRM), D(DstMem | SrcImm | ModRM), + D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock), + X4(D(Undefined)), +}; + +static struct opcode group4[] = { + D(ByteOp | DstMem | SrcNone | ModRM | Lock), D(ByteOp | DstMem | SrcNone | ModRM | Lock), + N, N, N, N, N, N, +}; + +static struct opcode group5[] = { + D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock), + D(SrcMem | ModRM | Stack), N, + D(SrcMem | ModRM | Stack), D(SrcMemFAddr | ModRM | ImplicitOps), + D(SrcMem | ModRM | Stack), N, +}; + +static struct group_dual group7 = { { + N, N, D(ModRM | SrcMem | Priv), D(ModRM | SrcMem | Priv), + D(SrcNone | ModRM | DstMem | Mov), N, + D(SrcMem16 | ModRM | Mov | Priv), D(SrcMem | ModRM | ByteOp | Priv), +}, { + D(SrcNone | ModRM | Priv), N, N, D(SrcNone | ModRM | Priv), + D(SrcNone | ModRM | DstMem | Mov), N, + D(SrcMem16 | ModRM | Mov | Priv), N, +} }; + +static struct opcode group8[] = { + N, N, N, N, + D(DstMem | SrcImmByte | ModRM), D(DstMem | SrcImmByte | ModRM | Lock), + D(DstMem | SrcImmByte | ModRM | Lock), D(DstMem | SrcImmByte | ModRM | Lock), +}; + +static struct group_dual group9 = { { + N, D(DstMem64 | ModRM | Lock), N, N, N, N, N, N, +}, { + N, N, N, N, N, N, N, N, +} }; + +static struct opcode opcode_table[256] = { + /* 0x00 - 0x07 */ + D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), + D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), + D(ByteOp | DstAcc | SrcImm), D(DstAcc | SrcImm), + D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), + /* 0x08 - 0x0F */ + D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), + D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), + D(ByteOp | DstAcc | SrcImm), D(DstAcc | SrcImm), + D(ImplicitOps | Stack | No64), N, + /* 0x10 - 0x17 */ + D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), + D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), + D(ByteOp | DstAcc | SrcImm), D(DstAcc | SrcImm), + D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), + /* 0x18 - 0x1F */ + D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), + D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), + D(ByteOp | DstAcc | SrcImm), D(DstAcc | SrcImm), + D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), + /* 0x20 - 0x27 */ + D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), + D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), + D(ByteOp | DstAcc | SrcImmByte), D(DstAcc | SrcImm), N, N, + /* 0x28 - 0x2F */ + D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), + D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), + D(ByteOp | DstAcc | SrcImmByte), D(DstAcc | SrcImm), N, N, + /* 0x30 - 
0x37 */ + D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), + D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), + D(ByteOp | DstAcc | SrcImmByte), D(DstAcc | SrcImm), N, N, + /* 0x38 - 0x3F */ + D(ByteOp | DstMem | SrcReg | ModRM), D(DstMem | SrcReg | ModRM), + D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), + D(ByteOp | DstAcc | SrcImm), D(DstAcc | SrcImm), + N, N, + /* 0x40 - 0x4F */ + X16(D(DstReg)), + /* 0x50 - 0x57 */ + X8(D(SrcReg | Stack)), + /* 0x58 - 0x5F */ + X8(D(DstReg | Stack)), + /* 0x60 - 0x67 */ + D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), + N, D(DstReg | SrcMem32 | ModRM | Mov) /* movsxd (x86/64) */ , + N, N, N, N, + /* 0x68 - 0x6F */ + D(SrcImm | Mov | Stack), N, D(SrcImmByte | Mov | Stack), N, + D(DstDI | ByteOp | Mov | String), D(DstDI | Mov | String), /* insb, insw/insd */ + D(SrcSI | ByteOp | ImplicitOps | String), D(SrcSI | ImplicitOps | String), /* outsb, outsw/outsd */ + /* 0x70 - 0x7F */ + X16(D(SrcImmByte)), + /* 0x80 - 0x87 */ + G(ByteOp | DstMem | SrcImm | ModRM | Group, group1), + G(DstMem | SrcImm | ModRM | Group, group1), + G(ByteOp | DstMem | SrcImm | ModRM | No64 | Group, group1), + G(DstMem | SrcImmByte | ModRM | Group, group1), + D(ByteOp | DstMem | SrcReg | ModRM), D(DstMem | SrcReg | ModRM), + D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), + /* 0x88 - 0x8F */ + D(ByteOp | DstMem | SrcReg | ModRM | Mov), D(DstMem | SrcReg | ModRM | Mov), + D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem | ModRM | Mov), + D(DstMem | SrcNone | ModRM | Mov), D(ModRM | DstReg), + D(ImplicitOps | SrcMem16 | ModRM), G(0, group1A), + /* 0x90 - 0x97 */ + D(DstReg), D(DstReg), D(DstReg), D(DstReg), D(DstReg), D(DstReg), D(DstReg), D(DstReg), + /* 0x98 - 0x9F */ + N, N, D(SrcImmFAddr | No64), N, + D(ImplicitOps | Stack), D(ImplicitOps | Stack), N, N, + /* 0xA0 - 0xA7 */ + D(ByteOp | DstAcc | SrcMem | Mov | MemAbs), D(DstAcc | SrcMem | Mov | MemAbs), + D(ByteOp | DstMem | SrcAcc | Mov | MemAbs), D(DstMem | SrcAcc | Mov | MemAbs), + D(ByteOp | SrcSI | DstDI | Mov | String), D(SrcSI | DstDI | Mov | String), + D(ByteOp | SrcSI | DstDI | String), D(SrcSI | DstDI | String), + /* 0xA8 - 0xAF */ + D(DstAcc | SrcImmByte | ByteOp), D(DstAcc | SrcImm), D(ByteOp | DstDI | Mov | String), D(DstDI | Mov | String), + D(ByteOp | SrcSI | DstAcc | Mov | String), D(SrcSI | DstAcc | Mov | String), + D(ByteOp | DstDI | String), D(DstDI | String), + /* 0xB0 - 0xB7 */ + X8(D(ByteOp | DstReg | SrcImm | Mov)), + /* 0xB8 - 0xBF */ + X8(D(DstReg | SrcImm | Mov)), + /* 0xC0 - 0xC7 */ + D(ByteOp | DstMem | SrcImm | ModRM), D(DstMem | SrcImmByte | ModRM), + N, D(ImplicitOps | Stack), N, N, + D(ByteOp | DstMem | SrcImm | ModRM | Mov), D(DstMem | SrcImm | ModRM | Mov), + /* 0xC8 - 0xCF */ + N, N, N, D(ImplicitOps | Stack), + D(ImplicitOps), D(SrcImmByte), D(ImplicitOps | No64), D(ImplicitOps), + /* 0xD0 - 0xD7 */ + D(ByteOp | DstMem | SrcImplicit | ModRM), D(DstMem | SrcImplicit | ModRM), + D(ByteOp | DstMem | SrcImplicit | ModRM), D(DstMem | SrcImplicit | ModRM), + N, N, N, N, + /* 0xD8 - 0xDF */ + N, N, N, N, N, N, N, N, + /* 0xE0 - 0xE7 */ + N, N, N, N, + D(ByteOp | SrcImmUByte | DstAcc), D(SrcImmUByte | DstAcc), + D(ByteOp | SrcImmUByte | DstAcc), D(SrcImmUByte | DstAcc), + /* 0xE8 - 0xEF */ + D(SrcImm | Stack), D(SrcImm | ImplicitOps), + D(SrcImmFAddr | No64), D(SrcImmByte | ImplicitOps), + D(SrcNone | ByteOp | DstAcc), D(SrcNone | DstAcc), + D(SrcNone | ByteOp | DstAcc), D(SrcNone | 
DstAcc), + /* 0xF0 - 0xF7 */ + N, N, N, N, + D(ImplicitOps | Priv), D(ImplicitOps), G(ByteOp, group3), G(0, group3), + /* 0xF8 - 0xFF */ + D(ImplicitOps), N, D(ImplicitOps), D(ImplicitOps), + D(ImplicitOps), D(ImplicitOps), G(0, group4), G(0, group5), +}; + +static struct opcode twobyte_table[256] = { + /* 0x00 - 0x0F */ + N, GD(0, &group7), N, N, + N, D(ImplicitOps), D(ImplicitOps | Priv), N, + D(ImplicitOps | Priv), D(ImplicitOps | Priv), N, N, + N, D(ImplicitOps | ModRM), N, N, + /* 0x10 - 0x1F */ + N, N, N, N, N, N, N, N, D(ImplicitOps | ModRM), N, N, N, N, N, N, N, + /* 0x20 - 0x2F */ + D(ModRM | ImplicitOps | Priv), D(ModRM | Priv), + D(ModRM | ImplicitOps | Priv), D(ModRM | Priv), + N, N, N, N, + N, N, N, N, N, N, N, N, + /* 0x30 - 0x3F */ + D(ImplicitOps | Priv), N, D(ImplicitOps | Priv), N, + D(ImplicitOps), D(ImplicitOps | Priv), N, N, + N, N, N, N, N, N, N, N, + /* 0x40 - 0x4F */ + X16(D(DstReg | SrcMem | ModRM | Mov)), + /* 0x50 - 0x5F */ + N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, + /* 0x60 - 0x6F */ + N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, + /* 0x70 - 0x7F */ + N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, + /* 0x80 - 0x8F */ + X16(D(SrcImm)), + /* 0x90 - 0x9F */ + N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, + /* 0xA0 - 0xA7 */ + D(ImplicitOps | Stack), D(ImplicitOps | Stack), + N, D(DstMem | SrcReg | ModRM | BitOp), + D(DstMem | SrcReg | Src2ImmByte | ModRM), + D(DstMem | SrcReg | Src2CL | ModRM), N, N, + /* 0xA8 - 0xAF */ + D(ImplicitOps | Stack), D(ImplicitOps | Stack), + N, D(DstMem | SrcReg | ModRM | BitOp | Lock), + D(DstMem | SrcReg | Src2ImmByte | ModRM), + D(DstMem | SrcReg | Src2CL | ModRM), + D(ModRM), N, + /* 0xB0 - 0xB7 */ + D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), + N, D(DstMem | SrcReg | ModRM | BitOp | Lock), + N, N, D(ByteOp | DstReg | SrcMem | ModRM | Mov), + D(DstReg | SrcMem16 | ModRM | Mov), + /* 0xB8 - 0xBF */ + N, N, + G(0, group8), D(DstMem | SrcReg | ModRM | BitOp | Lock), + N, N, D(ByteOp | DstReg | SrcMem | ModRM | Mov), + D(DstReg | SrcMem16 | ModRM | Mov), + /* 0xC0 - 0xCF */ + N, N, N, D(DstMem | SrcReg | ModRM | Mov), + N, N, N, GD(0, &group9), + N, N, N, N, N, N, N, N, + /* 0xD0 - 0xDF */ + N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, + /* 0xE0 - 0xEF */ + N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, + /* 0xF0 - 0xFF */ + N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N +}; + +#undef D +#undef N +#undef G +#undef GD +#undef I + int x86_decode_insn(struct x86_emulate_ctxt *ctxt) { -- cgit v1.2.3-18-g5258 From d0e533255d3811382c97b594ff7ab19b9b036814 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 29 Jul 2010 15:11:54 +0300 Subject: KVM: x86 emulator: allow repeat macro arguments to contain commas Needed for repeating instructions with execution functions. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 70a7cb49ff8..7e9bcda3937 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -94,14 +94,14 @@ #define Src2One (3<<29) #define Src2Mask (7<<29) -#define X2(x) x, x -#define X3(x) X2(x), x -#define X4(x) X2(x), X2(x) -#define X5(x) X4(x), x -#define X6(x) X4(x), X2(x) -#define X7(x) X4(x), X3(x) -#define X8(x) X4(x), X4(x) -#define X16(x) X8(x), X8(x) +#define X2(x...) x, x +#define X3(x...) X2(x), x +#define X4(x...) X2(x), X2(x) +#define X5(x...) 
X4(x), x +#define X6(x...) X4(x), X2(x) +#define X7(x...) X4(x), X3(x) +#define X8(x...) X4(x), X4(x) +#define X16(x...) X8(x), X8(x) struct opcode { u32 flags; -- cgit v1.2.3-18-g5258 From 63540382ccb83d2857964858c1ac7eb7d37de497 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 29 Jul 2010 15:11:55 +0300 Subject: KVM: x86 emulator: convert some push instructions to direct decode Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 7e9bcda3937..904fc1c99b9 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2016,6 +2016,12 @@ static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned long base, op->ptr = (unsigned long *)register_address(c, base, c->regs[reg]); } +static int em_push(struct x86_emulate_ctxt *ctxt) +{ + emulate_push(ctxt, ctxt->ops); + return X86EMUL_CONTINUE; +} + #define D(_y) { .flags = (_y) } #define N D(0) #define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) } @@ -2111,7 +2117,7 @@ static struct opcode opcode_table[256] = { /* 0x40 - 0x4F */ X16(D(DstReg)), /* 0x50 - 0x57 */ - X8(D(SrcReg | Stack)), + X8(I(SrcReg | Stack, em_push)), /* 0x58 - 0x5F */ X8(D(DstReg | Stack)), /* 0x60 - 0x67 */ @@ -2119,7 +2125,8 @@ static struct opcode opcode_table[256] = { N, D(DstReg | SrcMem32 | ModRM | Mov) /* movsxd (x86/64) */ , N, N, N, N, /* 0x68 - 0x6F */ - D(SrcImm | Mov | Stack), N, D(SrcImmByte | Mov | Stack), N, + I(SrcImm | Mov | Stack, em_push), N, + I(SrcImmByte | Mov | Stack, em_push), N, D(DstDI | ByteOp | Mov | String), D(DstDI | Mov | String), /* insb, insw/insd */ D(SrcSI | ByteOp | ImplicitOps | String), D(SrcSI | ImplicitOps | String), /* outsb, outsw/outsd */ /* 0x70 - 0x7F */ @@ -2786,9 +2793,6 @@ special_insn: case 0x48 ... 0x4f: /* dec r16/r32 */ emulate_1op("dec", c->dst, ctxt->eflags); break; - case 0x50 ... 0x57: /* push reg */ - emulate_push(ctxt, ops); - break; case 0x58 ... 0x5f: /* pop reg */ pop_instruction: rc = emulate_pop(ctxt, ops, &c->dst.val, c->op_bytes); @@ -2810,10 +2814,6 @@ special_insn: goto cannot_emulate; c->dst.val = (s32) c->src.val; break; - case 0x68: /* push imm */ - case 0x6a: /* push imm8 */ - emulate_push(ctxt, ops); - break; case 0x6c: /* insb */ case 0x6d: /* insw/insd */ c->dst.bytes = min(c->dst.bytes, 4u); -- cgit v1.2.3-18-g5258 From e85d28f8e8cef09b8e424448ccedb7244cfbf147 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Thu, 29 Jul 2010 15:11:52 +0300 Subject: KVM: x86 emulator: don't update vcpu state if instruction is restarted No need to update vcpu state since instruction is in the middle of the emulation. 
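The reordering in the hunk below boils down to one rule: emulator state (registers, rip, rflags, interruptibility) is copied back to the vcpu only after the restart loop has finished, not on every iteration of a partially emulated string instruction. A stripped-down userspace model of that control flow, with simplified stand-in types rather than the real kvm structures:

#include <stdbool.h>
#include <stdio.h>

struct vcpu { unsigned long rip; };
struct emu_ctxt { unsigned long eip; bool restart; int steps_left; };

static void emulate_one(struct emu_ctxt *c)
{
	c->eip += 2;				/* pretend one iteration was executed */
	c->restart = --c->steps_left > 0;	/* string instruction not finished yet */
}

int main(void)
{
	struct vcpu vcpu = { .rip = 0x1000 };
	struct emu_ctxt ctxt = { .eip = 0x1000, .restart = false, .steps_left = 3 };

restart:
	emulate_one(&ctxt);
	if (ctxt.restart)		/* mid-instruction: leave the vcpu untouched */
		goto restart;

	vcpu.rip = ctxt.eip;		/* commit emulator state back exactly once */
	printf("rip=%#lx\n", vcpu.rip);
	return 0;
}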
Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 33deb75f16e..3cbe8032394 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4057,32 +4057,27 @@ restart: return handle_emulation_failure(vcpu); } - toggle_interruptibility(vcpu, vcpu->arch.emulate_ctxt.interruptibility); - kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); - memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); - kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); + r = EMULATE_DONE; - if (vcpu->arch.emulate_ctxt.exception >= 0) { + if (vcpu->arch.emulate_ctxt.exception >= 0) inject_emulated_exception(vcpu); - return EMULATE_DONE; - } - - if (vcpu->arch.pio.count) { + else if (vcpu->arch.pio.count) { if (!vcpu->arch.pio.in) vcpu->arch.pio.count = 0; - return EMULATE_DO_MMIO; - } - - if (vcpu->mmio_needed) { + r = EMULATE_DO_MMIO; + } else if (vcpu->mmio_needed) { if (vcpu->mmio_is_write) vcpu->mmio_needed = 0; - return EMULATE_DO_MMIO; - } - - if (vcpu->arch.emulate_ctxt.restart) + r = EMULATE_DO_MMIO; + } else if (vcpu->arch.emulate_ctxt.restart) goto restart; - return EMULATE_DONE; + toggle_interruptibility(vcpu, vcpu->arch.emulate_ctxt.interruptibility); + kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); + memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); + kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); + + return r; } EXPORT_SYMBOL_GPL(emulate_instruction); -- cgit v1.2.3-18-g5258 From 9928ff608b1b6ba10fafde85f57970a83a181331 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 1 Aug 2010 18:35:24 +0300 Subject: KVM: x86 emulator: fix LMSW able to clear cr0.pe LMSW is documented not to be able to clear cr0.pe; make it so. Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 904fc1c99b9..4d49514a919 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3211,7 +3211,7 @@ twobyte_insn: c->dst.val = ops->get_cr(0, ctxt->vcpu); break; case 6: /* lmsw */ - ops->set_cr(0, (ops->get_cr(0, ctxt->vcpu) & ~0x0ful) | + ops->set_cr(0, (ops->get_cr(0, ctxt->vcpu) & ~0x0eul) | (c->src.val & 0x0f), ctxt->vcpu); c->dst.type = OP_NONE; break; -- cgit v1.2.3-18-g5258 From 4fc40f076f4fa289dd546990b597351c9cdad985 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Mon, 2 Aug 2010 12:47:51 +0300 Subject: KVM: x86 emulator: check io permissions only once for string pio Do not recheck io permission on every iteration. 
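The mechanism is a one-shot cache: the emulation context grows a perm_ok flag that x86.c clears before each instruction, and the permission check returns early once the TSS I/O-bitmap walk has succeeded, so a rep-prefixed string I/O instruction pays for the walk only on its first iteration. A minimal userspace model of that shape (illustrative sketch, not the kernel code):

#include <stdbool.h>
#include <stdio.h>

struct emu_ctxt {
	bool perm_ok;	/* true once the I/O bitmap check has passed */
};

static bool io_bitmap_allows(unsigned short port, unsigned short len)
{
	/* stand-in for the expensive guest-memory walk the emulator does */
	printf("walking I/O permission bitmap for port 0x%x\n", port);
	return true;
}

static bool io_permitted(struct emu_ctxt *ctxt, unsigned short port,
			 unsigned short len)
{
	if (ctxt->perm_ok)			/* already validated for this instruction */
		return true;
	if (!io_bitmap_allows(port, len))
		return false;
	ctxt->perm_ok = true;			/* remember the result for later iterations */
	return true;
}

int main(void)
{
	struct emu_ctxt ctxt = { .perm_ok = false };	/* reset before each instruction */
	int i;

	for (i = 0; i < 3; i++)				/* three REP iterations, one walk */
		io_permitted(&ctxt, 0x3f8, 1);
	return 0;
}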
Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 1 + arch/x86/kvm/emulate.c | 6 ++++++ arch/x86/kvm/x86.c | 1 + 3 files changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 0f901c16cf1..8762411fe9b 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -224,6 +224,7 @@ struct x86_emulate_ctxt { int interruptibility; bool restart; /* restart string instruction after writeback */ + bool perm_ok; /* do not check permissions if true */ int exception; /* exception that happens during emulation or -1 */ u32 error_code; /* error code for exception */ diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 4d49514a919..760e2b030e6 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1621,9 +1621,15 @@ static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, u16 port, u16 len) { + if (ctxt->perm_ok) + return true; + if (emulator_bad_iopl(ctxt, ops)) if (!emulator_io_port_access_allowed(ctxt, ops, port, len)) return false; + + ctxt->perm_ok = true; + return true; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3cbe8032394..35c0f4e4a62 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3997,6 +3997,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu, memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); vcpu->arch.emulate_ctxt.interruptibility = 0; vcpu->arch.emulate_ctxt.exception = -1; + vcpu->arch.emulate_ctxt.perm_ok = false; r = x86_decode_insn(&vcpu->arch.emulate_ctxt); trace_kvm_emulate_insn_start(vcpu); -- cgit v1.2.3-18-g5258 From 251464c464cf7df7d6d548f1065f49a3ecd08118 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 2 Aug 2010 16:12:08 +0800 Subject: KVM: MMU: using kvm_set_pfn_accessed() instead of mark_page_accessed() It's a small cleanup that using using kvm_set_pfn_accessed() instead of mark_page_accessed() Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 82f7622c17d..e430a383ad1 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -309,7 +309,7 @@ static void update_spte(u64 *sptep, u64 new_spte) else { old_spte = __xchg_spte(sptep, new_spte); if (old_spte & shadow_accessed_mask) - mark_page_accessed(pfn_to_page(spte_to_pfn(old_spte))); + kvm_set_pfn_accessed(spte_to_pfn(old_spte)); } } -- cgit v1.2.3-18-g5258 From 8672b7217a234c41d425a63b171af809e1169842 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 2 Aug 2010 16:14:04 +0800 Subject: KVM: MMU: move bits lost judgement into a separate function Introduce spte_has_volatile_bits() function to judge whether spte bits will miss, it's more readable and can help us to cleanup code later Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index e430a383ad1..c07b9a200bc 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -299,6 +299,20 @@ static u64 __xchg_spte(u64 *sptep, u64 new_spte) #endif } +static bool spte_has_volatile_bits(u64 spte) +{ + if (!shadow_accessed_mask) + return false; + + if (!is_shadow_present_pte(spte)) + return false; + + if (spte & shadow_accessed_mask) + return false; + + return true; +} + static 
void update_spte(u64 *sptep, u64 new_spte) { u64 old_spte; @@ -679,14 +693,14 @@ static void set_spte_track_bits(u64 *sptep, u64 new_spte) pfn_t pfn; u64 old_spte = *sptep; - if (!shadow_accessed_mask || !is_shadow_present_pte(old_spte) || - old_spte & shadow_accessed_mask) { + if (!spte_has_volatile_bits(old_spte)) __set_spte(sptep, new_spte); - } else + else old_spte = __xchg_spte(sptep, new_spte); if (!is_rmap_spte(old_spte)) return; + pfn = spte_to_pfn(old_spte); if (!shadow_accessed_mask || old_spte & shadow_accessed_mask) kvm_set_pfn_accessed(pfn); -- cgit v1.2.3-18-g5258 From 4132779b1718f066ec2d06a71c8958039865cd49 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 2 Aug 2010 16:15:08 +0800 Subject: KVM: MMU: mark page dirty only when page is really written Mark page dirty only when this page is really written, it's more exacter, and also can fix dirty page marking in speculation path Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 47 ++++++++++++++++++++++++++++------------------- 1 file changed, 28 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index c07b9a200bc..ff95d418750 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -307,24 +307,42 @@ static bool spte_has_volatile_bits(u64 spte) if (!is_shadow_present_pte(spte)) return false; - if (spte & shadow_accessed_mask) + if ((spte & shadow_accessed_mask) && + (!is_writable_pte(spte) || (spte & shadow_dirty_mask))) return false; return true; } +static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask) +{ + return (old_spte & bit_mask) && !(new_spte & bit_mask); +} + static void update_spte(u64 *sptep, u64 new_spte) { - u64 old_spte; + u64 mask, old_spte = *sptep; + + WARN_ON(!is_rmap_spte(new_spte)); - if (!shadow_accessed_mask || (new_spte & shadow_accessed_mask) || - !is_rmap_spte(*sptep)) + new_spte |= old_spte & shadow_dirty_mask; + + mask = shadow_accessed_mask; + if (is_writable_pte(old_spte)) + mask |= shadow_dirty_mask; + + if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask) __set_spte(sptep, new_spte); - else { + else old_spte = __xchg_spte(sptep, new_spte); - if (old_spte & shadow_accessed_mask) - kvm_set_pfn_accessed(spte_to_pfn(old_spte)); - } + + if (!shadow_accessed_mask) + return; + + if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask)) + kvm_set_pfn_accessed(spte_to_pfn(old_spte)); + if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask)) + kvm_set_pfn_dirty(spte_to_pfn(old_spte)); } static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, @@ -704,7 +722,7 @@ static void set_spte_track_bits(u64 *sptep, u64 new_spte) pfn = spte_to_pfn(old_spte); if (!shadow_accessed_mask || old_spte & shadow_accessed_mask) kvm_set_pfn_accessed(pfn); - if (is_writable_pte(old_spte)) + if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask)) kvm_set_pfn_dirty(pfn); } @@ -759,13 +777,6 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) } spte = rmap_next(kvm, rmapp, spte); } - if (write_protected) { - pfn_t pfn; - - spte = rmap_next(kvm, rmapp, NULL); - pfn = spte_to_pfn(*spte); - kvm_set_pfn_dirty(pfn); - } /* check for huge page mappings */ for (i = PT_DIRECTORY_LEVEL; @@ -1938,7 +1949,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, * whether the guest actually used the pte (in order to detect * demand paging). 
*/ - spte = shadow_base_present_pte | shadow_dirty_mask; + spte = shadow_base_present_pte; if (!speculative) spte |= shadow_accessed_mask; if (!dirty) @@ -1999,8 +2010,6 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, mark_page_dirty(vcpu->kvm, gfn); set_pte: - if (is_writable_pte(*sptep) && !is_writable_pte(spte)) - kvm_set_pfn_dirty(pfn); update_spte(sptep, spte); done: return ret; -- cgit v1.2.3-18-g5258 From 52c65a30a5c6f31cd66dba57c22d18cafa5e327f Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 2 Aug 2010 16:46:44 +0200 Subject: KVM: SVM: Check for nested vmrun intercept before emulating vmrun This patch lets the nested vmrun fail if the L1 hypervisor has not intercepted vmrun. This fixes the "vmrun intercept check" unit test. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 116e0341bf4..a0e5c7e2610 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2014,6 +2014,14 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) return true; } +static bool nested_vmcb_checks(struct vmcb *vmcb) +{ + if ((vmcb->control.intercept & (1ULL << INTERCEPT_VMRUN)) == 0) + return false; + + return true; +} + static bool nested_svm_vmrun(struct vcpu_svm *svm) { struct vmcb *nested_vmcb; @@ -2028,6 +2036,17 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) if (!nested_vmcb) return false; + if (!nested_vmcb_checks(nested_vmcb)) { + nested_vmcb->control.exit_code = SVM_EXIT_ERR; + nested_vmcb->control.exit_code_hi = 0; + nested_vmcb->control.exit_info_1 = 0; + nested_vmcb->control.exit_info_2 = 0; + + nested_svm_unmap(page); + + return false; + } + trace_kvm_nested_vmrun(svm->vmcb->save.rip - 3, vmcb_gpa, nested_vmcb->save.rip, nested_vmcb->control.int_ctl, -- cgit v1.2.3-18-g5258 From dbe7758482a870f30a86bdeefebf4fc260afef11 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 2 Aug 2010 16:46:45 +0200 Subject: KVM: SVM: Check for asid != 0 on nested vmrun This patch lets a nested vmrun fail if the L1 hypervisor left the asid zero. This fixes the asid_zero unit test. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index a0e5c7e2610..af5b9ea5196 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2019,6 +2019,9 @@ static bool nested_vmcb_checks(struct vmcb *vmcb) if ((vmcb->control.intercept & (1ULL << INTERCEPT_VMRUN)) == 0) return false; + if (vmcb->control.asid == 0) + return false; + return true; } -- cgit v1.2.3-18-g5258 From 09ee57cdae3156aa3b74f378a0c57ef657c90f38 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 1 Aug 2010 12:07:29 +0300 Subject: KVM: x86 emulator: push segment override out of decode_modrm() Let it compute modrm_seg instead, and have the caller apply it. 
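The point of the split is that the ModR/M decoder only knows which segment the addressing form implies by default (SS for bp-based forms, DS otherwise), while only the caller knows whether an explicit segment-override prefix was seen and should win. A stand-alone sketch of that division of labour (hypothetical names, much simplified):

    #include <stdio.h>

    enum seg { SEG_DS, SEG_SS };

    struct decode {
        int has_seg_override;   /* an explicit 0x2e/0x36/... prefix was seen */
        enum seg seg_override;
        enum seg modrm_seg;     /* architectural default implied by the ModR/M form */
    };

    /* The ModR/M decoder only records the default segment... */
    static void decode_modrm(struct decode *d, int uses_bp)
    {
        d->modrm_seg = uses_bp ? SEG_SS : SEG_DS;   /* [bp+...] defaults to SS */
    }

    /* ...and the caller decides whether a prefix takes precedence. */
    static enum seg effective_seg(const struct decode *d)
    {
        return d->has_seg_override ? d->seg_override : d->modrm_seg;
    }

    int main(void)
    {
        struct decode d = { .has_seg_override = 1, .seg_override = SEG_DS };
        decode_modrm(&d, 1);
        printf("segment used: %s\n", effective_seg(&d) == SEG_SS ? "SS" : "DS");
        return 0;
    }

Here the explicit override wins over the bp-implied SS default, mirroring the caller-side "if (!has_seg_override)" test in the patch.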
Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 1 + arch/x86/kvm/emulate.c | 10 ++++++---- 2 files changed, 7 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 8762411fe9b..cbdf76722d7 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -198,6 +198,7 @@ struct decode_cache { u8 modrm_mod; u8 modrm_reg; u8 modrm_rm; + u8 modrm_seg; u8 use_modrm_ea; bool rip_relative; unsigned long modrm_ea; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 760e2b030e6..471f12ae29c 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -593,6 +593,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, c->modrm_rm |= (c->modrm & 0x07); c->modrm_ea = 0; c->use_modrm_ea = 1; + c->modrm_seg = VCPU_SREG_DS; if (c->modrm_mod == 3) { c->modrm_ptr = decode_register(c->modrm_rm, @@ -649,8 +650,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, } if (c->modrm_rm == 2 || c->modrm_rm == 3 || (c->modrm_rm == 6 && c->modrm_mod != 0)) - if (!c->has_seg_override) - set_seg_override(c, VCPU_SREG_SS); + c->modrm_seg = VCPU_SREG_SS; c->modrm_ea = (u16)c->modrm_ea; } else { /* 32/64-bit ModR/M decode. */ @@ -2405,9 +2405,11 @@ done_prefixes: c->op_bytes = 8; /* ModRM and SIB bytes. */ - if (c->d & ModRM) + if (c->d & ModRM) { rc = decode_modrm(ctxt, ops); - else if (c->d & MemAbs) + if (!c->has_seg_override) + set_seg_override(c, c->modrm_seg); + } else if (c->d & MemAbs) rc = decode_abs(ctxt, ops); if (rc != X86EMUL_CONTINUE) goto done; -- cgit v1.2.3-18-g5258 From 1a6440aef6d63252e6c80aff651147b5f8c737e9 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 1 Aug 2010 12:35:10 +0300 Subject: KVM: x86 emulator: use correct type for memory address in operands Currently we use a void pointer for memory addresses. That's wrong since these are guest virtual addresses which are not directly dereferencable by the host. Use the correct type, unsigned long. 
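Concretely, a register operand lives in host memory (the vcpu's register file) and can safely be kept as a host pointer, while a memory operand is only a guest virtual address that must be handed to the emulator's read/write callbacks and never dereferenced directly. A rough sketch of the resulting operand shape (not the exact kernel definition):

    struct operand_sketch {
        /* ... */
        union {
            unsigned long *reg;   /* host pointer: safe to dereference */
            unsigned long  mem;   /* guest virtual address: only ever passed to callbacks */
        } addr;
        unsigned long val;
    };

With the union in place, accidentally dereferencing a guest address no longer type-checks without an explicit cast, which is the safety the change is after.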
Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 5 +- arch/x86/kvm/emulate.c | 117 ++++++++++++++++++------------------- 2 files changed, 61 insertions(+), 61 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index cbdf76722d7..0c835f7eb30 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -156,7 +156,10 @@ struct operand { unsigned long orig_val; u64 orig_val64; }; - unsigned long *ptr; + union { + unsigned long *reg; + unsigned long mem; + } addr; union { unsigned long val; u64 val64; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 471f12ae29c..5f45f66ed27 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -489,7 +489,7 @@ static void *decode_register(u8 modrm_reg, unsigned long *regs, static int read_descriptor(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, - void *ptr, + ulong addr, u16 *size, unsigned long *address, int op_bytes) { int rc; @@ -497,12 +497,10 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt, if (op_bytes == 2) op_bytes = 3; *address = 0; - rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2, - ctxt->vcpu, NULL); + rc = ops->read_std(addr, (unsigned long *)size, 2, ctxt->vcpu, NULL); if (rc != X86EMUL_CONTINUE) return rc; - rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes, - ctxt->vcpu, NULL); + rc = ops->read_std(addr + 2, address, op_bytes, ctxt->vcpu, NULL); return rc; } @@ -552,21 +550,21 @@ static void decode_register_operand(struct operand *op, reg = (c->b & 7) | ((c->rex_prefix & 1) << 3); op->type = OP_REG; if ((c->d & ByteOp) && !inhibit_bytereg) { - op->ptr = decode_register(reg, c->regs, highbyte_regs); - op->val = *(u8 *)op->ptr; + op->addr.reg = decode_register(reg, c->regs, highbyte_regs); + op->val = *(u8 *)op->addr.reg; op->bytes = 1; } else { - op->ptr = decode_register(reg, c->regs, 0); + op->addr.reg = decode_register(reg, c->regs, 0); op->bytes = c->op_bytes; switch (op->bytes) { case 2: - op->val = *(u16 *)op->ptr; + op->val = *(u16 *)op->addr.reg; break; case 4: - op->val = *(u32 *)op->ptr; + op->val = *(u32 *)op->addr.reg; break; case 8: - op->val = *(u64 *) op->ptr; + op->val = *(u64 *) op->addr.reg; break; } } @@ -976,23 +974,23 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt, */ switch (c->dst.bytes) { case 1: - *(u8 *)c->dst.ptr = (u8)c->dst.val; + *(u8 *)c->dst.addr.reg = (u8)c->dst.val; break; case 2: - *(u16 *)c->dst.ptr = (u16)c->dst.val; + *(u16 *)c->dst.addr.reg = (u16)c->dst.val; break; case 4: - *c->dst.ptr = (u32)c->dst.val; + *c->dst.addr.reg = (u32)c->dst.val; break; /* 64b: zero-ext */ case 8: - *c->dst.ptr = c->dst.val; + *c->dst.addr.reg = c->dst.val; break; } break; case OP_MEM: if (c->lock_prefix) rc = ops->cmpxchg_emulated( - (unsigned long)c->dst.ptr, + c->dst.addr.mem, &c->dst.orig_val, &c->dst.val, c->dst.bytes, @@ -1000,14 +998,13 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt, ctxt->vcpu); else rc = ops->write_emulated( - (unsigned long)c->dst.ptr, + c->dst.addr.mem, &c->dst.val, c->dst.bytes, &err, ctxt->vcpu); if (rc == X86EMUL_PROPAGATE_FAULT) - emulate_pf(ctxt, - (unsigned long)c->dst.ptr, err); + emulate_pf(ctxt, c->dst.addr.mem, err); if (rc != X86EMUL_CONTINUE) return rc; break; @@ -1029,8 +1026,8 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt, c->dst.bytes = c->op_bytes; c->dst.val = c->src.val; register_address_increment(c, &c->regs[VCPU_REGS_RSP], 
-c->op_bytes); - c->dst.ptr = (void *) register_address(c, ss_base(ctxt, ops), - c->regs[VCPU_REGS_RSP]); + c->dst.addr.mem = register_address(c, ss_base(ctxt, ops), + c->regs[VCPU_REGS_RSP]); } static int emulate_pop(struct x86_emulate_ctxt *ctxt, @@ -2019,7 +2016,7 @@ static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned long base, int df = (ctxt->eflags & EFLG_DF) ? -1 : 1; register_address_increment(c, &c->regs[reg], df * op->bytes); - op->ptr = (unsigned long *)register_address(c, base, c->regs[reg]); + op->addr.mem = register_address(c, base, c->regs[reg]); } static int em_push(struct x86_emulate_ctxt *ctxt) @@ -2456,17 +2453,17 @@ done_prefixes: if ((c->d & ModRM) && c->modrm_mod == 3) { c->src.type = OP_REG; c->src.val = c->modrm_val; - c->src.ptr = c->modrm_ptr; + c->src.addr.reg = c->modrm_ptr; break; } c->src.type = OP_MEM; - c->src.ptr = (unsigned long *)c->modrm_ea; + c->src.addr.mem = c->modrm_ea; c->src.val = 0; break; case SrcImm: case SrcImmU: c->src.type = OP_IMM; - c->src.ptr = (unsigned long *)c->eip; + c->src.addr.mem = c->eip; c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; if (c->src.bytes == 8) c->src.bytes = 4; @@ -2499,7 +2496,7 @@ done_prefixes: case SrcImmByte: case SrcImmUByte: c->src.type = OP_IMM; - c->src.ptr = (unsigned long *)c->eip; + c->src.addr.mem = c->eip; c->src.bytes = 1; if ((c->d & SrcMask) == SrcImmByte) c->src.val = insn_fetch(s8, 1, c->eip); @@ -2509,19 +2506,19 @@ done_prefixes: case SrcAcc: c->src.type = OP_REG; c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->src.ptr = &c->regs[VCPU_REGS_RAX]; + c->src.addr.reg = &c->regs[VCPU_REGS_RAX]; switch (c->src.bytes) { case 1: - c->src.val = *(u8 *)c->src.ptr; + c->src.val = *(u8 *)c->src.addr.reg; break; case 2: - c->src.val = *(u16 *)c->src.ptr; + c->src.val = *(u16 *)c->src.addr.reg; break; case 4: - c->src.val = *(u32 *)c->src.ptr; + c->src.val = *(u32 *)c->src.addr.reg; break; case 8: - c->src.val = *(u64 *)c->src.ptr; + c->src.val = *(u64 *)c->src.addr.reg; break; } break; @@ -2532,20 +2529,20 @@ done_prefixes: case SrcSI: c->src.type = OP_MEM; c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->src.ptr = (unsigned long *) + c->src.addr.mem = register_address(c, seg_override_base(ctxt, ops, c), c->regs[VCPU_REGS_RSI]); c->src.val = 0; break; case SrcImmFAddr: c->src.type = OP_IMM; - c->src.ptr = (unsigned long *)c->eip; + c->src.addr.mem = c->eip; c->src.bytes = c->op_bytes + 2; insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip); break; case SrcMemFAddr: c->src.type = OP_MEM; - c->src.ptr = (unsigned long *)c->modrm_ea; + c->src.addr.mem = c->modrm_ea; c->src.bytes = c->op_bytes + 2; break; } @@ -2563,7 +2560,7 @@ done_prefixes: break; case Src2ImmByte: c->src2.type = OP_IMM; - c->src2.ptr = (unsigned long *)c->eip; + c->src2.addr.mem = c->eip; c->src2.bytes = 1; c->src2.val = insn_fetch(u8, 1, c->eip); break; @@ -2588,11 +2585,11 @@ done_prefixes: c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; c->dst.type = OP_REG; c->dst.val = c->dst.orig_val = c->modrm_val; - c->dst.ptr = c->modrm_ptr; + c->dst.addr.reg = c->modrm_ptr; break; } c->dst.type = OP_MEM; - c->dst.ptr = (unsigned long *)c->modrm_ea; + c->dst.addr.mem = c->modrm_ea; if ((c->d & DstMask) == DstMem64) c->dst.bytes = 8; else @@ -2601,26 +2598,26 @@ done_prefixes: if (c->d & BitOp) { unsigned long mask = ~(c->dst.bytes * 8 - 1); - c->dst.ptr = (void *)c->dst.ptr + + c->dst.addr.mem = c->dst.addr.mem + (c->src.val & mask) / 8; } break; case DstAcc: c->dst.type = OP_REG; c->dst.bytes = (c->d & ByteOp) ? 
1 : c->op_bytes; - c->dst.ptr = &c->regs[VCPU_REGS_RAX]; + c->dst.addr.reg = &c->regs[VCPU_REGS_RAX]; switch (c->dst.bytes) { case 1: - c->dst.val = *(u8 *)c->dst.ptr; + c->dst.val = *(u8 *)c->dst.addr.reg; break; case 2: - c->dst.val = *(u16 *)c->dst.ptr; + c->dst.val = *(u16 *)c->dst.addr.reg; break; case 4: - c->dst.val = *(u32 *)c->dst.ptr; + c->dst.val = *(u32 *)c->dst.addr.reg; break; case 8: - c->dst.val = *(u64 *)c->dst.ptr; + c->dst.val = *(u64 *)c->dst.addr.reg; break; } c->dst.orig_val = c->dst.val; @@ -2628,7 +2625,7 @@ done_prefixes: case DstDI: c->dst.type = OP_MEM; c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->dst.ptr = (unsigned long *) + c->dst.addr.mem = register_address(c, es_base(ctxt, ops), c->regs[VCPU_REGS_RDI]); c->dst.val = 0; @@ -2696,7 +2693,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt) } if (c->src.type == OP_MEM) { - rc = read_emulated(ctxt, ops, (unsigned long)c->src.ptr, + rc = read_emulated(ctxt, ops, c->src.addr.mem, c->src.valptr, c->src.bytes); if (rc != X86EMUL_CONTINUE) goto done; @@ -2704,7 +2701,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt) } if (c->src2.type == OP_MEM) { - rc = read_emulated(ctxt, ops, (unsigned long)c->src2.ptr, + rc = read_emulated(ctxt, ops, c->src2.addr.mem, &c->src2.val, c->src2.bytes); if (rc != X86EMUL_CONTINUE) goto done; @@ -2716,7 +2713,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt) if ((c->dst.type == OP_MEM) && !(c->d & Mov)) { /* optimisation - avoid slow emulated read if Mov */ - rc = read_emulated(ctxt, ops, (unsigned long)c->dst.ptr, + rc = read_emulated(ctxt, ops, c->dst.addr.mem, &c->dst.val, c->dst.bytes); if (rc != X86EMUL_CONTINUE) goto done; @@ -2880,16 +2877,16 @@ special_insn: /* Write back the register source. */ switch (c->dst.bytes) { case 1: - *(u8 *) c->src.ptr = (u8) c->dst.val; + *(u8 *) c->src.addr.reg = (u8) c->dst.val; break; case 2: - *(u16 *) c->src.ptr = (u16) c->dst.val; + *(u16 *) c->src.addr.reg = (u16) c->dst.val; break; case 4: - *c->src.ptr = (u32) c->dst.val; + *c->src.addr.reg = (u32) c->dst.val; break; /* 64b reg: zero-extend */ case 8: - *c->src.ptr = c->dst.val; + *c->src.addr.reg = c->dst.val; break; } /* @@ -2936,15 +2933,15 @@ special_insn: goto done; break; case 0x90: /* nop / xchg r8,rax */ - if (c->dst.ptr == (unsigned long *)&c->regs[VCPU_REGS_RAX]) { + if (c->dst.addr.reg == &c->regs[VCPU_REGS_RAX]) { c->dst.type = OP_NONE; /* nop */ break; } case 0x91 ... 0x97: /* xchg reg,rax */ c->src.type = OP_REG; c->src.bytes = c->op_bytes; - c->src.ptr = (unsigned long *) &c->regs[VCPU_REGS_RAX]; - c->src.val = *(c->src.ptr); + c->src.addr.reg = &c->regs[VCPU_REGS_RAX]; + c->src.val = *(c->src.addr.reg); goto xchg; case 0x9c: /* pushf */ c->src.val = (unsigned long) ctxt->eflags; @@ -2952,7 +2949,7 @@ special_insn: break; case 0x9d: /* popf */ c->dst.type = OP_REG; - c->dst.ptr = (unsigned long *) &ctxt->eflags; + c->dst.addr.reg = &ctxt->eflags; c->dst.bytes = c->op_bytes; rc = emulate_popf(ctxt, ops, &c->dst.val, c->op_bytes); if (rc != X86EMUL_CONTINUE) @@ -2963,7 +2960,7 @@ special_insn: goto mov; case 0xa6 ... 0xa7: /* cmps */ c->dst.type = OP_NONE; /* Disable writeback. */ - DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr); + DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.addr.mem, c->dst.addr.mem); goto cmp; case 0xa8 ... 
0xa9: /* test ax, imm */ goto test; @@ -2982,7 +2979,7 @@ special_insn: break; case 0xc3: /* ret */ c->dst.type = OP_REG; - c->dst.ptr = &c->eip; + c->dst.addr.reg = &c->eip; c->dst.bytes = c->op_bytes; goto pop_instruction; case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */ @@ -3184,7 +3181,7 @@ twobyte_insn: c->dst.type = OP_NONE; break; case 2: /* lgdt */ - rc = read_descriptor(ctxt, ops, c->src.ptr, + rc = read_descriptor(ctxt, ops, c->src.addr.mem, &size, &address, c->op_bytes); if (rc != X86EMUL_CONTINUE) goto done; @@ -3204,7 +3201,7 @@ twobyte_insn: goto cannot_emulate; } } else { - rc = read_descriptor(ctxt, ops, c->src.ptr, + rc = read_descriptor(ctxt, ops, c->src.addr.mem, &size, &address, c->op_bytes); if (rc != X86EMUL_CONTINUE) @@ -3399,7 +3396,7 @@ twobyte_insn: } else { /* Failure: write the value we saw to EAX. */ c->dst.type = OP_REG; - c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; + c->dst.addr.reg = (unsigned long *)&c->regs[VCPU_REGS_RAX]; } break; case 0xb3: -- cgit v1.2.3-18-g5258 From 4515453964e78ce556a98c56aeb675ed8d48b8de Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 1 Aug 2010 12:39:53 +0300 Subject: KVM: x86 emulator: simplify xchg decode tables Use X8() to avoid repetition. Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 5f45f66ed27..c7176df9ced 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2147,7 +2147,7 @@ static struct opcode opcode_table[256] = { D(DstMem | SrcNone | ModRM | Mov), D(ModRM | DstReg), D(ImplicitOps | SrcMem16 | ModRM), G(0, group1A), /* 0x90 - 0x97 */ - D(DstReg), D(DstReg), D(DstReg), D(DstReg), D(DstReg), D(DstReg), D(DstReg), D(DstReg), + X8(D(DstReg)), /* 0x98 - 0x9F */ N, N, D(SrcImmFAddr | No64), N, D(ImplicitOps | Stack), D(ImplicitOps | Stack), N, N, -- cgit v1.2.3-18-g5258 From 3d9e77dff81c8be21ec0e7950ae06d1bddff8066 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 1 Aug 2010 12:41:59 +0300 Subject: KVM: x86 emulator: use SrcAcc to simplify xchg decoding Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index c7176df9ced..b7da0e3e0cc 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2147,7 +2147,7 @@ static struct opcode opcode_table[256] = { D(DstMem | SrcNone | ModRM | Mov), D(ModRM | DstReg), D(ImplicitOps | SrcMem16 | ModRM), G(0, group1A), /* 0x90 - 0x97 */ - X8(D(DstReg)), + X8(D(SrcAcc | DstReg)), /* 0x98 - 0x9F */ N, N, D(SrcImmFAddr | No64), N, D(ImplicitOps | Stack), D(ImplicitOps | Stack), N, N, @@ -2932,16 +2932,9 @@ special_insn: if (rc != X86EMUL_CONTINUE) goto done; break; - case 0x90: /* nop / xchg r8,rax */ - if (c->dst.addr.reg == &c->regs[VCPU_REGS_RAX]) { - c->dst.type = OP_NONE; /* nop */ - break; - } - case 0x91 ... 0x97: /* xchg reg,rax */ - c->src.type = OP_REG; - c->src.bytes = c->op_bytes; - c->src.addr.reg = &c->regs[VCPU_REGS_RAX]; - c->src.val = *(c->src.addr.reg); + case 0x90 ... 
0x97: /* nop / xchg reg, rax */ + if (c->dst.addr.reg == &c->regs[VCPU_REGS_RAX]) + goto done; goto xchg; case 0x9c: /* pushf */ c->src.val = (unsigned long) ctxt->eflags; -- cgit v1.2.3-18-g5258 From 91ff3cb43cb3dd8810d726dfa1f3736dc9aea1df Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 1 Aug 2010 12:53:09 +0300 Subject: KVM: x86 emulator: put register operand fetch into a function The code is repeated three times, put it into fetch_register_operand() Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 61 +++++++++++++++++--------------------------------- 1 file changed, 21 insertions(+), 40 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index b7da0e3e0cc..898a55ba3e1 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -539,6 +539,24 @@ static int test_cc(unsigned int condition, unsigned int flags) return (!!rc ^ (condition & 1)); } +static void fetch_register_operand(struct operand *op) +{ + switch (op->bytes) { + case 1: + op->val = *(u8 *)op->addr.reg; + break; + case 2: + op->val = *(u16 *)op->addr.reg; + break; + case 4: + op->val = *(u32 *)op->addr.reg; + break; + case 8: + op->val = *(u64 *)op->addr.reg; + break; + } +} + static void decode_register_operand(struct operand *op, struct decode_cache *c, int inhibit_bytereg) @@ -551,23 +569,12 @@ static void decode_register_operand(struct operand *op, op->type = OP_REG; if ((c->d & ByteOp) && !inhibit_bytereg) { op->addr.reg = decode_register(reg, c->regs, highbyte_regs); - op->val = *(u8 *)op->addr.reg; op->bytes = 1; } else { op->addr.reg = decode_register(reg, c->regs, 0); op->bytes = c->op_bytes; - switch (op->bytes) { - case 2: - op->val = *(u16 *)op->addr.reg; - break; - case 4: - op->val = *(u32 *)op->addr.reg; - break; - case 8: - op->val = *(u64 *) op->addr.reg; - break; - } } + fetch_register_operand(op); op->orig_val = op->val; } @@ -2507,20 +2514,7 @@ done_prefixes: c->src.type = OP_REG; c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; c->src.addr.reg = &c->regs[VCPU_REGS_RAX]; - switch (c->src.bytes) { - case 1: - c->src.val = *(u8 *)c->src.addr.reg; - break; - case 2: - c->src.val = *(u16 *)c->src.addr.reg; - break; - case 4: - c->src.val = *(u32 *)c->src.addr.reg; - break; - case 8: - c->src.val = *(u64 *)c->src.addr.reg; - break; - } + fetch_register_operand(&c->src); break; case SrcOne: c->src.bytes = 1; @@ -2606,20 +2600,7 @@ done_prefixes: c->dst.type = OP_REG; c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; c->dst.addr.reg = &c->regs[VCPU_REGS_RAX]; - switch (c->dst.bytes) { - case 1: - c->dst.val = *(u8 *)c->dst.addr.reg; - break; - case 2: - c->dst.val = *(u16 *)c->dst.addr.reg; - break; - case 4: - c->dst.val = *(u32 *)c->dst.addr.reg; - break; - case 8: - c->dst.val = *(u64 *)c->dst.addr.reg; - break; - } + fetch_register_operand(&c->dst); c->dst.orig_val = c->dst.val; break; case DstDI: -- cgit v1.2.3-18-g5258 From d4709c78eeff2b272e0b9727748b72371b0e71ab Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 1 Aug 2010 13:53:19 +0300 Subject: KVM: x86 emulator: drop use_modrm_ea Unused (and has never been). 
Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 1 - arch/x86/kvm/emulate.c | 1 - 2 files changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 0c835f7eb30..e425444658e 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -202,7 +202,6 @@ struct decode_cache { u8 modrm_reg; u8 modrm_rm; u8 modrm_seg; - u8 use_modrm_ea; bool rip_relative; unsigned long modrm_ea; void *modrm_ptr; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 898a55ba3e1..7d2c715f1a2 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -597,7 +597,6 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, c->modrm_reg |= (c->modrm & 0x38) >> 3; c->modrm_rm |= (c->modrm & 0x07); c->modrm_ea = 0; - c->use_modrm_ea = 1; c->modrm_seg = VCPU_SREG_DS; if (c->modrm_mod == 3) { -- cgit v1.2.3-18-g5258 From 1e87e3efe764285133866a14ddc71cf211f022c2 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 1 Aug 2010 14:42:51 +0300 Subject: KVM: x86 emulator: simplify REX.W check (x && (x & y)) == (x & y) Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 7d2c715f1a2..a832019138f 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2358,9 +2358,8 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt) done_prefixes: /* REX prefix. */ - if (c->rex_prefix) - if (c->rex_prefix & 8) - c->op_bytes = 8; /* REX.W */ + if (c->rex_prefix & 8) + c->op_bytes = 8; /* REX.W */ /* Opcode byte(s). */ opcode = opcode_table[c->b]; -- cgit v1.2.3-18-g5258 From 7f9b4b75be866de938a3094413a60554f7e66e4d Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 1 Aug 2010 14:46:54 +0300 Subject: KVM: x86 emulator: introduce Op3264 for mov cr and mov dr instructions The operands for these instructions are 32 bits or 64 bits, depending on long mode, and ignoring REX prefixes, or the operand size prefix. Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index a832019138f..b7adfcc2f74 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -83,6 +83,7 @@ #define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ #define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ /* Misc flags */ +#define Op3264 (1<<24) /* Operand is 64b in long mode, 32b otherwise */ #define Undefined (1<<25) /* No Such Instruction */ #define Lock (1<<26) /* lock prefix is allowed for the instruction */ #define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */ @@ -2406,6 +2407,13 @@ done_prefixes: if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack)) c->op_bytes = 8; + if (c->d & Op3264) { + if (mode == X86EMUL_MODE_PROT64) + c->op_bytes = 8; + else + c->op_bytes = 4; + } + /* ModRM and SIB bytes. 
*/ if (c->d & ModRM) { rc = decode_modrm(ctxt, ops); -- cgit v1.2.3-18-g5258 From cecc9e39161898eb767a6b797e27a1660b3eb27e Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 1 Aug 2010 14:48:44 +0300 Subject: KVM: x86 emulator: mark mov cr and mov dr as 64-bit instructions in long mode Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index b7adfcc2f74..20752dc84f1 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2210,8 +2210,8 @@ static struct opcode twobyte_table[256] = { /* 0x10 - 0x1F */ N, N, N, N, N, N, N, N, D(ImplicitOps | ModRM), N, N, N, N, N, N, N, /* 0x20 - 0x2F */ - D(ModRM | ImplicitOps | Priv), D(ModRM | Priv), - D(ModRM | ImplicitOps | Priv), D(ModRM | Priv), + D(ModRM | ImplicitOps | Priv | Op3264), D(ModRM | Priv | Op3264), + D(ModRM | ImplicitOps | Priv | Op3264), D(ModRM | Priv | Op3264), N, N, N, N, N, N, N, N, N, N, N, N, /* 0x30 - 0x3F */ -- cgit v1.2.3-18-g5258 From 1a0c7d44e4553ffb4902ec15549a9b855cd05a59 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 1 Aug 2010 14:25:22 +0300 Subject: KVM: x86 emulator: use struct operand for mov reg,cr and mov cr,reg for reg op This is an ordinary modrm source or destination; use the standard structure representing it. Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 20752dc84f1..562e0343e2a 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2210,8 +2210,8 @@ static struct opcode twobyte_table[256] = { /* 0x10 - 0x1F */ N, N, N, N, N, N, N, N, D(ImplicitOps | ModRM), N, N, N, N, N, N, N, /* 0x20 - 0x2F */ - D(ModRM | ImplicitOps | Priv | Op3264), D(ModRM | Priv | Op3264), - D(ModRM | ImplicitOps | Priv | Op3264), D(ModRM | Priv | Op3264), + D(ModRM | DstMem | Priv | Op3264), D(ModRM | Priv | Op3264), + D(ModRM | SrcMem | Priv | Op3264), D(ModRM | Priv | Op3264), N, N, N, N, N, N, N, N, N, N, N, N, /* 0x30 - 0x3F */ @@ -3240,8 +3240,7 @@ twobyte_insn: emulate_ud(ctxt); goto done; } - c->regs[c->modrm_rm] = ops->get_cr(c->modrm_reg, ctxt->vcpu); - c->dst.type = OP_NONE; /* no writeback */ + c->dst.val = ops->get_cr(c->modrm_reg, ctxt->vcpu); break; case 0x21: /* mov from dr to reg */ if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) && @@ -3253,7 +3252,7 @@ twobyte_insn: c->dst.type = OP_NONE; /* no writeback */ break; case 0x22: /* mov reg, cr */ - if (ops->set_cr(c->modrm_reg, c->modrm_val, ctxt->vcpu)) { + if (ops->set_cr(c->modrm_reg, c->src.val, ctxt->vcpu)) { emulate_gp(ctxt, 0); goto done; } -- cgit v1.2.3-18-g5258 From b27f38563d956135a5e80aca749b399ac5f3158a Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 1 Aug 2010 14:25:22 +0300 Subject: KVM: x86 emulator: use struct operand for mov reg,dr and mov dr,reg for reg op This is an ordinary modrm source or destination; use the standard structure representing it. 
Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 562e0343e2a..628fb5de6a4 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2210,8 +2210,8 @@ static struct opcode twobyte_table[256] = { /* 0x10 - 0x1F */ N, N, N, N, N, N, N, N, D(ImplicitOps | ModRM), N, N, N, N, N, N, N, /* 0x20 - 0x2F */ - D(ModRM | DstMem | Priv | Op3264), D(ModRM | Priv | Op3264), - D(ModRM | SrcMem | Priv | Op3264), D(ModRM | Priv | Op3264), + D(ModRM | DstMem | Priv | Op3264), D(ModRM | DstMem | Priv | Op3264), + D(ModRM | SrcMem | Priv | Op3264), D(ModRM | SrcMem | Priv | Op3264), N, N, N, N, N, N, N, N, N, N, N, N, /* 0x30 - 0x3F */ @@ -3248,8 +3248,7 @@ twobyte_insn: emulate_ud(ctxt); goto done; } - ops->get_dr(c->modrm_reg, &c->regs[c->modrm_rm], ctxt->vcpu); - c->dst.type = OP_NONE; /* no writeback */ + ops->get_dr(c->modrm_reg, &c->dst.val, ctxt->vcpu); break; case 0x22: /* mov reg, cr */ if (ops->set_cr(c->modrm_reg, c->src.val, ctxt->vcpu)) { @@ -3265,7 +3264,7 @@ twobyte_insn: goto done; } - if (ops->set_dr(c->modrm_reg, c->regs[c->modrm_rm] & + if (ops->set_dr(c->modrm_reg, c->src.val & ((ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U), ctxt->vcpu) < 0) { /* #UD condition is already handled by the code above */ -- cgit v1.2.3-18-g5258 From 5a506b125f1c97c846654ebacc913a136284e42b Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 1 Aug 2010 15:10:29 +0300 Subject: KVM: x86 emulator: add NoAccess flag for memory instructions that skip access Use for INVLPG, which accesses the tlb, not memory. Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 628fb5de6a4..80efe76c1ab 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -83,6 +83,7 @@ #define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ #define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ /* Misc flags */ +#define NoAccess (1<<23) /* Don't access memory (lea/invlpg/verr etc) */ #define Op3264 (1<<24) /* Operand is 64b in long mode, 32b otherwise */ #define Undefined (1<<25) /* No Such Instruction */ #define Lock (1<<26) /* lock prefix is allowed for the instruction */ @@ -2067,7 +2068,8 @@ static struct opcode group5[] = { static struct group_dual group7 = { { N, N, D(ModRM | SrcMem | Priv), D(ModRM | SrcMem | Priv), D(SrcNone | ModRM | DstMem | Mov), N, - D(SrcMem16 | ModRM | Mov | Priv), D(SrcMem | ModRM | ByteOp | Priv), + D(SrcMem16 | ModRM | Mov | Priv), + D(SrcMem | ModRM | ByteOp | Priv | NoAccess), }, { D(SrcNone | ModRM | Priv), N, N, D(SrcNone | ModRM | Priv), D(SrcNone | ModRM | DstMem | Mov), N, @@ -2456,7 +2458,7 @@ done_prefixes: c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; /* Don't fetch the address for invlpg: it could be unmapped. */ - if (c->twobyte && c->b == 0x01 && c->modrm_reg == 7) + if (c->d & NoAccess) break; srcmem_common: /* -- cgit v1.2.3-18-g5258 From 342fc63095e2d676f209b202d41a3f670dd9bf08 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 1 Aug 2010 15:13:22 +0300 Subject: KVM: x86 emulator: switch LEA to use SrcMem decoding The NoAccess flag will prevent memory from being accessed. 
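lea is the one SrcMem instruction whose result is the effective address itself: the decoder runs the normal ModR/M address computation, but the address must never be read, since it may point at unmapped guest memory. A tiny stand-alone illustration of the semantics (register values chosen for the example):

    #include <stdio.h>

    int main(void)
    {
        unsigned int base = 0x1000, index = 3;
        /* lea eax, [ebx + esi*4 + 8]: compute the address, never load from it */
        unsigned int ea = base + index * 4 + 8;
        printf("effective address = 0x%x\n", ea);   /* 0x1014, memory untouched */
        return 0;
    }

The NoAccess flag is what lets the generic source-fetch step skip the guest read for exactly this case.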
Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 80efe76c1ab..b8aa667b52b 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2153,7 +2153,7 @@ static struct opcode opcode_table[256] = { /* 0x88 - 0x8F */ D(ByteOp | DstMem | SrcReg | ModRM | Mov), D(DstMem | SrcReg | ModRM | Mov), D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem | ModRM | Mov), - D(DstMem | SrcNone | ModRM | Mov), D(ModRM | DstReg), + D(DstMem | SrcNone | ModRM | Mov), D(ModRM | SrcMem | NoAccess | DstReg), D(ImplicitOps | SrcMem16 | ModRM), G(0, group1A), /* 0x90 - 0x97 */ X8(D(SrcAcc | DstReg)), @@ -2895,7 +2895,7 @@ special_insn: c->dst.val = ops->get_segment_selector(c->modrm_reg, ctxt->vcpu); break; case 0x8d: /* lea r16/r32, m */ - c->dst.val = c->modrm_ea; + c->dst.val = c->src.addr.mem; break; case 0x8e: { /* mov seg, r/m16 */ uint16_t sel; -- cgit v1.2.3-18-g5258 From 1f6f05800e2fdd815ac63e3264071d26d429f491 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 1 Aug 2010 15:19:22 +0300 Subject: KVM: x86 emulator: change invlpg emulation to use src.mem.addr Instead of using modrm_ea, which will soon be gone. Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index b8aa667b52b..eda69411d05 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3206,7 +3206,7 @@ twobyte_insn: emulate_ud(ctxt); goto done; case 7: /* invlpg*/ - emulate_invlpg(ctxt->vcpu, c->modrm_ea); + emulate_invlpg(ctxt->vcpu, c->src.addr.mem); /* Disable writeback. */ c->dst.type = OP_NONE; break; -- cgit v1.2.3-18-g5258 From 2dbd0dd711e6c0ca6a2be9e6d93bbeb339386638 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 1 Aug 2010 15:40:19 +0300 Subject: KVM: x86 emulator: Decode memory operands directly into a 'struct operand' Since modrm operand can be either register or memory, decoding it into a 'struct operand', which can represent both, is simpler. 
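In other words, once ModR/M decoding produces a tagged operand, the rest of the emulator no longer needs the parallel modrm_val/modrm_ptr/modrm_ea fields or the repeated "mod == 3 means register" special cases. A stand-alone sketch of decoding into such a tagged operand (illustrative names and types only):

    #include <stdio.h>

    enum op_type { OP_REG, OP_MEM };

    struct operand {
        enum op_type type;
        union {
            unsigned long *reg;    /* register form: filled in at decode time */
            unsigned long  mem;    /* memory form: guest address, fetched later via ops */
        } addr;
        unsigned long val;
    };

    static unsigned long regs[8];

    /* One decode step yields a tagged operand; mod == 3 selects the register form. */
    static void decode_rm(struct operand *op, int mod, int rm, unsigned long ea)
    {
        if (mod == 3) {
            op->type = OP_REG;
            op->addr.reg = &regs[rm];
            op->val = *op->addr.reg;
        } else {
            op->type = OP_MEM;
            op->addr.mem = ea;
        }
    }

    int main(void)
    {
        struct operand op;

        regs[3] = 42;
        decode_rm(&op, 3, 3, 0);
        printf("register operand: val=%lu\n", op.val);

        decode_rm(&op, 0, 3, 0x1000);
        printf("memory operand: addr=0x%lx\n", op.addr.mem);
        return 0;
    }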
Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 3 - arch/x86/kvm/emulate.c | 125 +++++++++++++++++-------------------- 2 files changed, 57 insertions(+), 71 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index e425444658e..1e4a72ce301 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -203,9 +203,6 @@ struct decode_cache { u8 modrm_rm; u8 modrm_seg; bool rip_relative; - unsigned long modrm_ea; - void *modrm_ptr; - unsigned long modrm_val; struct fetch_cache fetch; struct read_cache io_read; struct read_cache mem_read; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index eda69411d05..955d4807464 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -581,12 +581,14 @@ static void decode_register_operand(struct operand *op, } static int decode_modrm(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) + struct x86_emulate_ops *ops, + struct operand *op) { struct decode_cache *c = &ctxt->decode; u8 sib; int index_reg = 0, base_reg = 0, scale; int rc = X86EMUL_CONTINUE; + ulong modrm_ea = 0; if (c->rex_prefix) { c->modrm_reg = (c->rex_prefix & 4) << 1; /* REX.R */ @@ -598,16 +600,19 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, c->modrm_mod |= (c->modrm & 0xc0) >> 6; c->modrm_reg |= (c->modrm & 0x38) >> 3; c->modrm_rm |= (c->modrm & 0x07); - c->modrm_ea = 0; c->modrm_seg = VCPU_SREG_DS; if (c->modrm_mod == 3) { - c->modrm_ptr = decode_register(c->modrm_rm, + op->type = OP_REG; + op->bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + op->addr.reg = decode_register(c->modrm_rm, c->regs, c->d & ByteOp); - c->modrm_val = *(unsigned long *)c->modrm_ptr; + fetch_register_operand(op); return rc; } + op->type = OP_MEM; + if (c->ad_bytes == 2) { unsigned bx = c->regs[VCPU_REGS_RBX]; unsigned bp = c->regs[VCPU_REGS_RBP]; @@ -618,46 +623,46 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, switch (c->modrm_mod) { case 0: if (c->modrm_rm == 6) - c->modrm_ea += insn_fetch(u16, 2, c->eip); + modrm_ea += insn_fetch(u16, 2, c->eip); break; case 1: - c->modrm_ea += insn_fetch(s8, 1, c->eip); + modrm_ea += insn_fetch(s8, 1, c->eip); break; case 2: - c->modrm_ea += insn_fetch(u16, 2, c->eip); + modrm_ea += insn_fetch(u16, 2, c->eip); break; } switch (c->modrm_rm) { case 0: - c->modrm_ea += bx + si; + modrm_ea += bx + si; break; case 1: - c->modrm_ea += bx + di; + modrm_ea += bx + di; break; case 2: - c->modrm_ea += bp + si; + modrm_ea += bp + si; break; case 3: - c->modrm_ea += bp + di; + modrm_ea += bp + di; break; case 4: - c->modrm_ea += si; + modrm_ea += si; break; case 5: - c->modrm_ea += di; + modrm_ea += di; break; case 6: if (c->modrm_mod != 0) - c->modrm_ea += bp; + modrm_ea += bp; break; case 7: - c->modrm_ea += bx; + modrm_ea += bx; break; } if (c->modrm_rm == 2 || c->modrm_rm == 3 || (c->modrm_rm == 6 && c->modrm_mod != 0)) c->modrm_seg = VCPU_SREG_SS; - c->modrm_ea = (u16)c->modrm_ea; + modrm_ea = (u16)modrm_ea; } else { /* 32/64-bit ModR/M decode. 
*/ if ((c->modrm_rm & 7) == 4) { @@ -667,48 +672,51 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, scale = sib >> 6; if ((base_reg & 7) == 5 && c->modrm_mod == 0) - c->modrm_ea += insn_fetch(s32, 4, c->eip); + modrm_ea += insn_fetch(s32, 4, c->eip); else - c->modrm_ea += c->regs[base_reg]; + modrm_ea += c->regs[base_reg]; if (index_reg != 4) - c->modrm_ea += c->regs[index_reg] << scale; + modrm_ea += c->regs[index_reg] << scale; } else if ((c->modrm_rm & 7) == 5 && c->modrm_mod == 0) { if (ctxt->mode == X86EMUL_MODE_PROT64) c->rip_relative = 1; } else - c->modrm_ea += c->regs[c->modrm_rm]; + modrm_ea += c->regs[c->modrm_rm]; switch (c->modrm_mod) { case 0: if (c->modrm_rm == 5) - c->modrm_ea += insn_fetch(s32, 4, c->eip); + modrm_ea += insn_fetch(s32, 4, c->eip); break; case 1: - c->modrm_ea += insn_fetch(s8, 1, c->eip); + modrm_ea += insn_fetch(s8, 1, c->eip); break; case 2: - c->modrm_ea += insn_fetch(s32, 4, c->eip); + modrm_ea += insn_fetch(s32, 4, c->eip); break; } } + op->addr.mem = modrm_ea; done: return rc; } static int decode_abs(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) + struct x86_emulate_ops *ops, + struct operand *op) { struct decode_cache *c = &ctxt->decode; int rc = X86EMUL_CONTINUE; + op->type = OP_MEM; switch (c->ad_bytes) { case 2: - c->modrm_ea = insn_fetch(u16, 2, c->eip); + op->addr.mem = insn_fetch(u16, 2, c->eip); break; case 4: - c->modrm_ea = insn_fetch(u32, 4, c->eip); + op->addr.mem = insn_fetch(u32, 4, c->eip); break; case 8: - c->modrm_ea = insn_fetch(u64, 8, c->eip); + op->addr.mem = insn_fetch(u64, 8, c->eip); break; } done: @@ -2280,6 +2288,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt) int mode = ctxt->mode; int def_op_bytes, def_ad_bytes, dual, goffset; struct opcode opcode, *g_mod012, *g_mod3; + struct operand memop = { .type = OP_NONE }; /* we cannot decode insn before we complete previous rep insn */ WARN_ON(ctxt->restart); @@ -2418,25 +2427,25 @@ done_prefixes: /* ModRM and SIB bytes. */ if (c->d & ModRM) { - rc = decode_modrm(ctxt, ops); + rc = decode_modrm(ctxt, ops, &memop); if (!c->has_seg_override) set_seg_override(c, c->modrm_seg); } else if (c->d & MemAbs) - rc = decode_abs(ctxt, ops); + rc = decode_abs(ctxt, ops, &memop); if (rc != X86EMUL_CONTINUE) goto done; if (!c->has_seg_override) set_seg_override(c, VCPU_SREG_DS); - if (!(!c->twobyte && c->b == 0x8d)) - c->modrm_ea += seg_override_base(ctxt, ops, c); + if (memop.type == OP_MEM && !(!c->twobyte && c->b == 0x8d)) + memop.addr.mem += seg_override_base(ctxt, ops, c); - if (c->ad_bytes != 8) - c->modrm_ea = (u32)c->modrm_ea; + if (memop.type == OP_MEM && c->ad_bytes != 8) + memop.addr.mem = (u32)memop.addr.mem; - if (c->rip_relative) - c->modrm_ea += c->eip; + if (memop.type == OP_MEM && c->rip_relative) + memop.addr.mem += c->eip; /* * Decode and fetch the source operand: register, memory @@ -2449,31 +2458,16 @@ done_prefixes: decode_register_operand(&c->src, c, 0); break; case SrcMem16: - c->src.bytes = 2; + memop.bytes = 2; goto srcmem_common; case SrcMem32: - c->src.bytes = 4; + memop.bytes = 4; goto srcmem_common; case SrcMem: - c->src.bytes = (c->d & ByteOp) ? 1 : + memop.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - /* Don't fetch the address for invlpg: it could be unmapped. */ - if (c->d & NoAccess) - break; srcmem_common: - /* - * For instructions with a ModR/M byte, switch to register - * access if Mod = 3. 
- */ - if ((c->d & ModRM) && c->modrm_mod == 3) { - c->src.type = OP_REG; - c->src.val = c->modrm_val; - c->src.addr.reg = c->modrm_ptr; - break; - } - c->src.type = OP_MEM; - c->src.addr.mem = c->modrm_ea; - c->src.val = 0; + c->src = memop; break; case SrcImm: case SrcImmU: @@ -2543,9 +2537,8 @@ done_prefixes: insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip); break; case SrcMemFAddr: - c->src.type = OP_MEM; - c->src.addr.mem = c->modrm_ea; - c->src.bytes = c->op_bytes + 2; + memop.bytes = c->op_bytes + 2; + goto srcmem_common; break; } @@ -2583,26 +2576,18 @@ done_prefixes: break; case DstMem: case DstMem64: - if ((c->d & ModRM) && c->modrm_mod == 3) { - c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->dst.type = OP_REG; - c->dst.val = c->dst.orig_val = c->modrm_val; - c->dst.addr.reg = c->modrm_ptr; - break; - } - c->dst.type = OP_MEM; - c->dst.addr.mem = c->modrm_ea; + c->dst = memop; if ((c->d & DstMask) == DstMem64) c->dst.bytes = 8; else c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->dst.val = 0; - if (c->d & BitOp) { + if (c->dst.type == OP_MEM && (c->d & BitOp)) { unsigned long mask = ~(c->dst.bytes * 8 - 1); c->dst.addr.mem = c->dst.addr.mem + (c->src.val & mask) / 8; } + c->dst.orig_val = c->dst.val; break; case DstAcc: c->dst.type = OP_REG; @@ -2682,11 +2667,15 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt) } if (c->src.type == OP_MEM) { + if (c->d & NoAccess) + goto no_fetch; rc = read_emulated(ctxt, ops, c->src.addr.mem, c->src.valptr, c->src.bytes); if (rc != X86EMUL_CONTINUE) goto done; c->src.orig_val64 = c->src.val64; + no_fetch: + ; } if (c->src2.type == OP_MEM) { -- cgit v1.2.3-18-g5258 From 34698d8c61bd3fc86b2e99c3d1ad9ef140b3eb0d Mon Sep 17 00:00:00 2001 From: Mohammed Gamal Date: Wed, 4 Aug 2010 14:41:04 +0300 Subject: KVM: x86 emulator: Fix nop emulation If a nop instruction is encountered, we jump directly to the done label. This skip updating rip. Break from the switch case instead Signed-off-by: Mohammed Gamal Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 955d4807464..ddbad15c948 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2912,7 +2912,7 @@ special_insn: break; case 0x90 ... 0x97: /* nop / xchg reg, rax */ if (c->dst.addr.reg == &c->regs[VCPU_REGS_RAX]) - goto done; + break; goto xchg; case 0x9c: /* pushf */ c->src.val = (unsigned long) ctxt->eflags; -- cgit v1.2.3-18-g5258 From ba492962363a02c45836be205f339be48093e1be Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Thu, 29 Jul 2010 14:47:56 +0200 Subject: KVM: Move kvm_guest_init out of generic code Currently x86 is the only architecture that uses kvm_guest_init(). With PowerPC we're getting a second user, but the signature is different there and we don't need to export it, as it uses the normal kernel init framework. So let's move the x86 specific definition of that function over to the x86 specfic header file. 
Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_para.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index 05eba5e9a8e..7b562b6184b 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -158,6 +158,12 @@ static inline unsigned int kvm_arch_para_features(void) return cpuid_eax(KVM_CPUID_FEATURES); } +#ifdef CONFIG_KVM_GUEST +void __init kvm_guest_init(void); +#else +#define kvm_guest_init() do { } while (0) #endif +#endif /* __KERNEL__ */ + #endif /* _ASM_X86_KVM_PARA_H */ -- cgit v1.2.3-18-g5258 From d3ad6243293d92c82530a50c77d71bb0a0a42fdc Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Thu, 5 Aug 2010 16:34:39 +0800 Subject: KVM: x86 emulator: simplify two-byte opcode check Two-byte opcode always start with 0x0F and the decode flags of opcode 0xF0 is always 0, so remove dup check. Signed-off-by: Wei Yongjun Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index ddbad15c948..a9a4a0b78a7 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2375,13 +2375,11 @@ done_prefixes: /* Opcode byte(s). */ opcode = opcode_table[c->b]; - if (opcode.flags == 0) { - /* Two-byte opcode? */ - if (c->b == 0x0f) { - c->twobyte = 1; - c->b = insn_fetch(u8, 1, c->eip); - opcode = twobyte_table[c->b]; - } + /* Two-byte opcode? */ + if (c->b == 0x0f) { + c->twobyte = 1; + c->b = insn_fetch(u8, 1, c->eip); + opcode = twobyte_table[c->b]; } c->d = opcode.flags; -- cgit v1.2.3-18-g5258 From 160ce1f1a8fe64b3e2686ae73fbf051ccfe7c7ef Mon Sep 17 00:00:00 2001 From: Mohammed Gamal Date: Wed, 4 Aug 2010 05:44:24 +0300 Subject: KVM: x86 emulator: Allow accessing IDT via emulator ops The patch adds a new member get_idt() to x86_emulate_ops. It also adds a function to get the idt in order to be used by the emulator. This is needed for real mode interrupt injection and the emulation of int instructions. 
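The emulator is deliberately kept ignorant of KVM's vcpu internals, so anything it needs from guest state has to arrive through the x86_emulate_ops callback table; adding IDT access therefore means one new function pointer plus a thin wrapper on the KVM side. A rough sketch of that callback-table pattern (types and names here are simplified stand-ins, not KVM's):

    #include <stdio.h>

    struct desc_ptr_sketch { unsigned short size; unsigned long address; };

    struct emulate_ops_sketch {
        void (*get_gdt)(struct desc_ptr_sketch *dt, void *vcpu);
        void (*get_idt)(struct desc_ptr_sketch *dt, void *vcpu);  /* the new member */
    };

    /* Backend implementation: forwards to whatever owns the descriptor tables. */
    static void backend_get_idt(struct desc_ptr_sketch *dt, void *vcpu)
    {
        (void)vcpu;
        dt->address = 0;      /* after reset, real-mode IDTR.base is 0 ... */
        dt->size = 0x3ff;     /* ... and IDTR.limit is 0x3ff */
    }

    static const struct emulate_ops_sketch ops = {
        .get_idt = backend_get_idt,   /* wired into the table the emulator calls */
    };

    int main(void)
    {
        struct desc_ptr_sketch dt;
        ops.get_idt(&dt, NULL);       /* the emulator never touches vcpu state directly */
        printf("idt base=0x%lx limit=0x%hx\n", dt.address, dt.size);
        return 0;
    }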
Signed-off-by: Mohammed Gamal Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 1 + arch/x86/kvm/x86.c | 6 ++++++ 2 files changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 1e4a72ce301..1bbf2b6f2a7 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -139,6 +139,7 @@ struct x86_emulate_ops { void (*set_segment_selector)(u16 sel, int seg, struct kvm_vcpu *vcpu); unsigned long (*get_cached_segment_base)(int seg, struct kvm_vcpu *vcpu); void (*get_gdt)(struct desc_ptr *dt, struct kvm_vcpu *vcpu); + void (*get_idt)(struct desc_ptr *dt, struct kvm_vcpu *vcpu); ulong (*get_cr)(int cr, struct kvm_vcpu *vcpu); int (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu); int (*cpl)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 35c0f4e4a62..768197a34d3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3790,6 +3790,11 @@ static void emulator_get_gdt(struct desc_ptr *dt, struct kvm_vcpu *vcpu) kvm_x86_ops->get_gdt(vcpu, dt); } +static void emulator_get_idt(struct desc_ptr *dt, struct kvm_vcpu *vcpu) +{ + kvm_x86_ops->get_idt(vcpu, dt); +} + static unsigned long emulator_get_cached_segment_base(int seg, struct kvm_vcpu *vcpu) { @@ -3883,6 +3888,7 @@ static struct x86_emulate_ops emulate_ops = { .set_segment_selector = emulator_set_segment_selector, .get_cached_segment_base = emulator_get_cached_segment_base, .get_gdt = emulator_get_gdt, + .get_idt = emulator_get_idt, .get_cr = emulator_get_cr, .set_cr = emulator_set_cr, .cpl = emulator_get_cpl, -- cgit v1.2.3-18-g5258 From 6e154e56b4d7a6a28c54f0984e13d3f8defc4755 Mon Sep 17 00:00:00 2001 From: Mohammed Gamal Date: Wed, 4 Aug 2010 14:38:06 +0300 Subject: KVM: x86 emulator: Add into, int, and int3 instructions (opcodes 0xcc-0xce) This adds support for int instructions to the emulator. 
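In real mode a software interrupt n pushes FLAGS, CS and IP, clears IF/TF/AC, and then loads the new IP from the 16-bit word at IVT base + 4*n and the new CS from the word two bytes after it; int3 is the fixed vector-3 form and into takes vector 4 only when EFLAGS.OF is set. A small worked example of the vector arithmetic (stand-alone, values chosen for illustration):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        uint32_t ivt_base = 0;          /* real-mode IVT normally starts at 0 */
        unsigned irq = 0x10;            /* e.g. "int 0x10" */

        uint32_t ip_addr = ivt_base + (irq << 2);      /* 0x40: new IP read here */
        uint32_t cs_addr = ivt_base + (irq << 2) + 2;  /* 0x42: new CS read here */

        printf("new IP read from 0x%x, new CS read from 0x%x\n", ip_addr, cs_addr);
        return 0;
    }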
Signed-off-by: Mohammed Gamal Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index a9a4a0b78a7..5205d689082 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1180,6 +1180,67 @@ static int emulate_popa(struct x86_emulate_ctxt *ctxt, return rc; } +int emulate_int_real(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, int irq) +{ + struct decode_cache *c = &ctxt->decode; + int rc = X86EMUL_CONTINUE; + struct desc_ptr dt; + gva_t cs_addr; + gva_t eip_addr; + u16 cs, eip; + u32 err; + + /* TODO: Add limit checks */ + c->src.val = ctxt->eflags; + emulate_push(ctxt, ops); + + ctxt->eflags &= ~(EFLG_IF | EFLG_TF | EFLG_AC); + + c->src.val = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu); + emulate_push(ctxt, ops); + + c->src.val = c->eip; + emulate_push(ctxt, ops); + + ops->get_idt(&dt, ctxt->vcpu); + + eip_addr = dt.address + (irq << 2); + cs_addr = dt.address + (irq << 2) + 2; + + rc = ops->read_std(cs_addr, &cs, 2, ctxt->vcpu, &err); + if (rc != X86EMUL_CONTINUE) + return rc; + + rc = ops->read_std(eip_addr, &eip, 2, ctxt->vcpu, &err); + if (rc != X86EMUL_CONTINUE) + return rc; + + rc = load_segment_descriptor(ctxt, ops, cs, VCPU_SREG_CS); + if (rc != X86EMUL_CONTINUE) + return rc; + + c->eip = eip; + + return rc; +} + +static int emulate_int(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, int irq) +{ + switch(ctxt->mode) { + case X86EMUL_MODE_REAL: + return emulate_int_real(ctxt, ops, irq); + case X86EMUL_MODE_VM86: + case X86EMUL_MODE_PROT16: + case X86EMUL_MODE_PROT32: + case X86EMUL_MODE_PROT64: + default: + /* Protected mode interrupts unimplemented yet */ + return X86EMUL_UNHANDLEABLE; + } +} + static int emulate_iret_real(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { @@ -2616,6 +2677,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt) struct decode_cache *c = &ctxt->decode; int rc = X86EMUL_CONTINUE; int saved_dst_type = c->dst.type; + int irq; /* Used for int 3, int, and into */ ctxt->decode.mem_read.pos = 0; @@ -2960,6 +3022,22 @@ special_insn: if (rc != X86EMUL_CONTINUE) goto done; break; + case 0xcc: /* int3 */ + irq = 3; + goto do_interrupt; + case 0xcd: /* int n */ + irq = c->src.val; + do_interrupt: + rc = emulate_int(ctxt, ops, irq); + if (rc != X86EMUL_CONTINUE) + goto done; + break; + case 0xce: /* into */ + if (ctxt->eflags & EFLG_OF) { + irq = 4; + goto do_interrupt; + } + break; case 0xcf: /* iret */ rc = emulate_iret(ctxt, ops); -- cgit v1.2.3-18-g5258 From 06cb704611caf40e531a3835809283f14f5307d5 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Wed, 4 Aug 2010 15:36:53 +0800 Subject: KVM: x86 emulator: use SrcAcc to simplify stos decoding Use SrcAcc to simplify stos decoding. 
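stos always stores the accumulator (AL/AX/EAX/RAX) to ES:[RDI] and steps RDI by the operand size in the direction given by EFLAGS.DF, so once the decoder marks the source as SrcAcc the generic operand-fetch code supplies the value and the handler needs no hand-written setup. A tiny stand-alone model of the semantics (rep stosb with DF clear):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint8_t buf[4] = {0};
        uint32_t eax = 0xAB;     /* accumulator = the implicit source */
        uint32_t edi = 0;        /* destination index */
        int df = 0;              /* EFLAGS.DF clear -> addresses count up */

        for (int i = 0; i < 4; i++) {     /* rep stosb */
            buf[edi] = (uint8_t)eax;      /* store AL to ES:[EDI] */
            edi += df ? -1 : 1;           /* advance by the operand size (1 here) */
        }
        printf("%02x %02x %02x %02x, edi=%u\n", buf[0], buf[1], buf[2], buf[3], edi);
        return 0;
    }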
Signed-off-by: Wei Yongjun Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 5205d689082..6c1e4d6c12c 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2235,7 +2235,8 @@ static struct opcode opcode_table[256] = { D(ByteOp | SrcSI | DstDI | Mov | String), D(SrcSI | DstDI | Mov | String), D(ByteOp | SrcSI | DstDI | String), D(SrcSI | DstDI | String), /* 0xA8 - 0xAF */ - D(DstAcc | SrcImmByte | ByteOp), D(DstAcc | SrcImm), D(ByteOp | DstDI | Mov | String), D(DstDI | Mov | String), + D(DstAcc | SrcImmByte | ByteOp), D(DstAcc | SrcImm), + D(ByteOp | SrcAcc | DstDI | Mov | String), D(SrcAcc | DstDI | Mov | String), D(ByteOp | SrcSI | DstAcc | Mov | String), D(SrcSI | DstAcc | Mov | String), D(ByteOp | DstDI | String), D(DstDI | String), /* 0xB0 - 0xB7 */ @@ -2996,8 +2997,6 @@ special_insn: case 0xa8 ... 0xa9: /* test ax, imm */ goto test; case 0xaa ... 0xab: /* stos */ - c->dst.val = c->regs[VCPU_REGS_RAX]; - break; case 0xac ... 0xad: /* lods */ goto mov; case 0xae ... 0xaf: /* scas */ -- cgit v1.2.3-18-g5258 From 36089fed70337f4d96a5c3aa7fadc4095b707f73 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Wed, 4 Aug 2010 15:38:18 +0800 Subject: KVM: x86 emulator: disable writeback when decode dest operand This patch change to disable writeback when decode dest operand if the dest type is ImplicitOps or not specified. Signed-off-by: Wei Yongjun Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 6c1e4d6c12c..e0216eb8b57 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2627,9 +2627,6 @@ done_prefixes: /* Decode and fetch the destination operand: register or memory. */ switch (c->d & DstMask) { - case ImplicitOps: - /* Special instructions do their own operand decoding. */ - return 0; case DstReg: decode_register_operand(&c->dst, c, c->twobyte && (c->b == 0xb6 || c->b == 0xb7)); @@ -2664,6 +2661,11 @@ done_prefixes: c->regs[VCPU_REGS_RDI]); c->dst.val = 0; break; + case ImplicitOps: + /* Special instructions do their own operand decoding. */ + default: + c->dst.type = OP_NONE; /* Disable writeback. */ + return 0; } done: @@ -3115,7 +3117,6 @@ special_insn: case 0xf5: /* cmc */ /* complement carry flag from eflags reg */ ctxt->eflags ^= EFLG_CF; - c->dst.type = OP_NONE; /* Disable writeback. */ break; case 0xf6 ... 0xf7: /* Grp3 */ if (!emulate_grp3(ctxt, ops)) @@ -3123,16 +3124,13 @@ special_insn: break; case 0xf8: /* clc */ ctxt->eflags &= ~EFLG_CF; - c->dst.type = OP_NONE; /* Disable writeback. */ break; case 0xfa: /* cli */ if (emulator_bad_iopl(ctxt, ops)) { emulate_gp(ctxt, 0); goto done; - } else { + } else ctxt->eflags &= ~X86_EFLAGS_IF; - c->dst.type = OP_NONE; /* Disable writeback. */ - } break; case 0xfb: /* sti */ if (emulator_bad_iopl(ctxt, ops)) { @@ -3141,16 +3139,13 @@ special_insn: } else { ctxt->interruptibility = KVM_X86_SHADOW_INT_STI; ctxt->eflags |= X86_EFLAGS_IF; - c->dst.type = OP_NONE; /* Disable writeback. */ } break; case 0xfc: /* cld */ ctxt->eflags &= ~EFLG_DF; - c->dst.type = OP_NONE; /* Disable writeback. */ break; case 0xfd: /* std */ ctxt->eflags |= EFLG_DF; - c->dst.type = OP_NONE; /* Disable writeback. 
*/ break; case 0xfe: /* Grp4 */ grp45: @@ -3287,16 +3282,13 @@ twobyte_insn: break; case 0x06: emulate_clts(ctxt->vcpu); - c->dst.type = OP_NONE; break; case 0x09: /* wbinvd */ kvm_emulate_wbinvd(ctxt->vcpu); - c->dst.type = OP_NONE; break; case 0x08: /* invd */ case 0x0d: /* GrpP (prefetch) */ case 0x18: /* Grp16 (prefetch/nop) */ - c->dst.type = OP_NONE; break; case 0x20: /* mov cr, reg */ switch (c->modrm_reg) { @@ -3349,7 +3341,6 @@ twobyte_insn: goto done; } rc = X86EMUL_CONTINUE; - c->dst.type = OP_NONE; break; case 0x32: /* rdmsr */ @@ -3361,7 +3352,6 @@ twobyte_insn: c->regs[VCPU_REGS_RDX] = msr_data >> 32; } rc = X86EMUL_CONTINUE; - c->dst.type = OP_NONE; break; case 0x34: /* sysenter */ rc = emulate_sysenter(ctxt, ops); @@ -3385,7 +3375,6 @@ twobyte_insn: case 0x80 ... 0x8f: /* jnz rel, etc*/ if (test_cc(c->b, ctxt->eflags)) jmp_rel(c, c->src.val); - c->dst.type = OP_NONE; break; case 0xa0: /* push fs */ emulate_push_sreg(ctxt, ops, VCPU_SREG_FS); -- cgit v1.2.3-18-g5258 From c034da8b927dc682fe7944895d67f99f07e3740f Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Wed, 4 Aug 2010 15:38:59 +0800 Subject: KVM: x86 emulator: using SrcOne for instruction d0/d1 decoding Using SrcOne for instruction d0/d1 decoding. Signed-off-by: Wei Yongjun Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index e0216eb8b57..d711d6aa816 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2251,7 +2251,7 @@ static struct opcode opcode_table[256] = { N, N, N, D(ImplicitOps | Stack), D(ImplicitOps), D(SrcImmByte), D(ImplicitOps | No64), D(ImplicitOps), /* 0xD0 - 0xD7 */ - D(ByteOp | DstMem | SrcImplicit | ModRM), D(DstMem | SrcImplicit | ModRM), + D(ByteOp | DstMem | SrcOne | ModRM), D(DstMem | SrcOne | ModRM), D(ByteOp | DstMem | SrcImplicit | ModRM), D(DstMem | SrcImplicit | ModRM), N, N, N, N, /* 0xD8 - 0xDF */ @@ -3046,7 +3046,6 @@ special_insn: goto done; break; case 0xd0 ... 0xd1: /* Grp2 */ - c->src.val = 1; emulate_grp2(ctxt); break; case 0xd2 ... 
0xd3: /* Grp2 */ -- cgit v1.2.3-18-g5258 From 8744aa9aad56be756a58126b429f176898631c3f Mon Sep 17 00:00:00 2001 From: Mohammed Gamal Date: Thu, 5 Aug 2010 15:42:49 +0300 Subject: KVM: x86 emulator: Add stc instruction (opcode 0xf9) Signed-off-by: Mohammed Gamal Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index d711d6aa816..175b41690d6 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2269,7 +2269,7 @@ static struct opcode opcode_table[256] = { N, N, N, N, D(ImplicitOps | Priv), D(ImplicitOps), G(ByteOp, group3), G(0, group3), /* 0xF8 - 0xFF */ - D(ImplicitOps), N, D(ImplicitOps), D(ImplicitOps), + D(ImplicitOps), D(ImplicitOps), D(ImplicitOps), D(ImplicitOps), D(ImplicitOps), D(ImplicitOps), G(0, group4), G(0, group5), }; @@ -3124,6 +3124,9 @@ special_insn: case 0xf8: /* clc */ ctxt->eflags &= ~EFLG_CF; break; + case 0xf9: /* stc */ + ctxt->eflags |= EFLG_CF; + break; case 0xfa: /* cli */ if (emulator_bad_iopl(ctxt, ops)) { emulate_gp(ctxt, 0); -- cgit v1.2.3-18-g5258 From 35c843c4857e2a818d1d951d87c40ee2cf5c1be8 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Mon, 9 Aug 2010 11:34:56 +0800 Subject: KVM: x86 emulator: fix negative bit offset BitOp instruction emulation If bit offset operands is a negative number, BitOp instruction will return wrong value. This patch fix it. Signed-off-by: Wei Yongjun Reviewed-by: Paolo Bonzini Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 175b41690d6..5fc441c064b 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -723,6 +723,22 @@ done: return rc; } +static void fetch_bit_operand(struct decode_cache *c) +{ + long sv, mask; + + if (c->dst.type == OP_MEM) { + mask = ~(c->dst.bytes * 8 - 1); + + if (c->src.bytes == 2) + sv = (s16)c->src.val & (s16)mask; + else if (c->src.bytes == 4) + sv = (s32)c->src.val & (s32)mask; + + c->dst.addr.mem += (sv >> 3); + } +} + static int read_emulated(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, unsigned long addr, void *dest, unsigned size) @@ -2638,12 +2654,8 @@ done_prefixes: c->dst.bytes = 8; else c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - if (c->dst.type == OP_MEM && (c->d & BitOp)) { - unsigned long mask = ~(c->dst.bytes * 8 - 1); - - c->dst.addr.mem = c->dst.addr.mem + - (c->src.val & mask) / 8; - } + if (c->d & BitOp) + fetch_bit_operand(c); c->dst.orig_val = c->dst.val; break; case DstAcc: -- cgit v1.2.3-18-g5258 From 3885f18fe3034a10b3e3923885d70d31ba522844 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Mon, 9 Aug 2010 11:37:37 +0800 Subject: KVM: x86 emulator: do not adjust the address for immediate source adjust the dst address for a register source but not adjust the address for an immediate source. 
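The combined effect of these two BitOp fixes is easiest to see as plain arithmetic: for bt/bts/btr/btc with a memory destination and a register bit-offset source, the offset is signed and word-granular, so it may move the effective address below the decoded operand; an immediate offset is only reduced modulo the operand width and leaves the address alone. A small standalone sketch of that arithmetic (illustrative only, not emulator code):

    #include <stdio.h>

    int main(void)
    {
        /* How a signed register bit offset for bt/bts splits into a byte
         * adjustment of the memory operand plus a bit number inside the
         * addressed word (width_bits = dst.bytes * 8). */
        int bit = -35;                        /* offset taken from a register */
        int width_bits = 32;
        int mask = ~(width_bits - 1);         /* keep whole-word multiples    */
        int sv = bit & mask;                  /* word-aligned signed offset   */
        int byte_adjust = sv >> 3;            /* added to the operand address */
        int in_word = bit & (width_bits - 1); /* bit tested within that word  */

        printf("byte adjust %d, bit %d\n", byte_adjust, in_word); /* -8, 29 */
        return 0;
    }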
Signed-off-by: Wei Yongjun Reviewed-by: Paolo Bonzini Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 5fc441c064b..9b81cde8ffa 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -727,7 +727,7 @@ static void fetch_bit_operand(struct decode_cache *c) { long sv, mask; - if (c->dst.type == OP_MEM) { + if (c->dst.type == OP_MEM && c->src.type == OP_REG) { mask = ~(c->dst.bytes * 8 - 1); if (c->src.bytes == 2) -- cgit v1.2.3-18-g5258 From ba7ff2b76dcf05c4681c2648019b8301ada6f3df Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Mon, 9 Aug 2010 11:39:14 +0800 Subject: KVM: x86 emulator: mask group 8 instruction as BitOp Mask group 8 instruction as BitOp, so we can share the code for adjust the source operand. Signed-off-by: Wei Yongjun Reviewed-by: Paolo Bonzini Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 9b81cde8ffa..a9b2b9e6a3f 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -737,6 +737,9 @@ static void fetch_bit_operand(struct decode_cache *c) c->dst.addr.mem += (sv >> 3); } + + /* only subword offset */ + c->src.val &= (c->dst.bytes << 3) - 1; } static int read_emulated(struct x86_emulate_ctxt *ctxt, @@ -2336,7 +2339,7 @@ static struct opcode twobyte_table[256] = { D(DstReg | SrcMem16 | ModRM | Mov), /* 0xB8 - 0xBF */ N, N, - G(0, group8), D(DstMem | SrcReg | ModRM | BitOp | Lock), + G(BitOp, group8), D(DstMem | SrcReg | ModRM | BitOp | Lock), N, N, D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), /* 0xC0 - 0xCF */ @@ -3419,8 +3422,6 @@ twobyte_insn: break; case 0xab: bts: /* bts */ - /* only subword offset */ - c->src.val &= (c->dst.bytes << 3) - 1; emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags); break; case 0xac: /* shrd imm8, r, r/m */ @@ -3448,8 +3449,6 @@ twobyte_insn: break; case 0xb3: btr: /* btr */ - /* only subword offset */ - c->src.val &= (c->dst.bytes << 3) - 1; emulate_2op_SrcV_nobyte("btr", c->src, c->dst, ctxt->eflags); break; case 0xb6 ... 0xb7: /* movzx */ @@ -3471,8 +3470,6 @@ twobyte_insn: break; case 0xbb: btc: /* btc */ - /* only subword offset */ - c->src.val &= (c->dst.bytes << 3) - 1; emulate_2op_SrcV_nobyte("btc", c->src, c->dst, ctxt->eflags); break; case 0xbe ... 0xbf: /* movsx */ -- cgit v1.2.3-18-g5258 From 3f9f53b0d599aabb03db35208fb31768568ca83f Mon Sep 17 00:00:00 2001 From: Mohammed Gamal Date: Sun, 8 Aug 2010 21:11:37 +0300 Subject: KVM: x86 emulator: Add unary mul, imul, div, and idiv instructions This adds unary mul, imul, div, and idiv instructions (group 3 r/m 4-7). 
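For reference, the widening semantics the new emulate_1op_rax_rdx() macro has to provide: the single r/m operand multiplies (or divides) the accumulator, and the double-width result is spread across RDX:RAX. A standalone sketch of the 32-bit mul case with made-up values (not emulator code):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        /* mul r/m32: EDX:EAX = EAX * src (unsigned, double-width result) */
        uint32_t eax = 0x80000000u, src = 6;
        uint64_t wide = (uint64_t)eax * src;

        uint32_t lo = (uint32_t)wide;            /* written back to EAX */
        uint32_t hi = (uint32_t)(wide >> 32);    /* written back to EDX */
        printf("EDX:EAX = %08x:%08x\n", hi, lo); /* 00000003:00000000   */
        return 0;
    }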
Signed-off-by: Mohammed Gamal Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 41 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index a9b2b9e6a3f..f0415eab659 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -315,6 +315,31 @@ struct group_dual { } \ } while (0) +#define __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, _suffix) \ + do { \ + unsigned long _tmp; \ + \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0", "4", "1") \ + _op _suffix " %5; " \ + _POST_EFLAGS("0", "4", "1") \ + : "=m" (_eflags), "=&r" (_tmp), \ + "+a" (_rax), "+d" (_rdx) \ + : "i" (EFLAGS_MASK), "m" ((_src).val), \ + "a" (_rax), "d" (_rdx)); \ + } while (0) + +/* instruction has only one source operand, destination is implicit (e.g. mul, div, imul, idiv) */ +#define emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags) \ + do { \ + switch((_src).bytes) { \ + case 1: __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "b"); break; \ + case 2: __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "w"); break; \ + case 4: __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "l"); break; \ + case 8: ON64(__emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "q")); break; \ + } \ + } while (0) + /* Fetch next part of the instruction being emulated. */ #define insn_fetch(_type, _size, _eip) \ ({ unsigned long _x; \ @@ -1373,6 +1398,8 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { struct decode_cache *c = &ctxt->decode; + unsigned long *rax = &c->regs[VCPU_REGS_RAX]; + unsigned long *rdx = &c->regs[VCPU_REGS_RDX]; switch (c->modrm_reg) { case 0 ... 1: /* test */ @@ -1384,6 +1411,18 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt, case 3: /* neg */ emulate_1op("neg", c->dst, ctxt->eflags); break; + case 4: /* mul */ + emulate_1op_rax_rdx("mul", c->src, *rax, *rdx, ctxt->eflags); + break; + case 5: /* imul */ + emulate_1op_rax_rdx("imul", c->src, *rax, *rdx, ctxt->eflags); + break; + case 6: /* div */ + emulate_1op_rax_rdx("div", c->src, *rax, *rdx, ctxt->eflags); + break; + case 7: /* idiv */ + emulate_1op_rax_rdx("idiv", c->src, *rax, *rdx, ctxt->eflags); + break; default: return 0; } @@ -2138,7 +2177,7 @@ static struct opcode group1A[] = { static struct opcode group3[] = { D(DstMem | SrcImm | ModRM), D(DstMem | SrcImm | ModRM), D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock), - X4(D(Undefined)), + X4(D(SrcMem | ModRM)), }; static struct opcode group4[] = { -- cgit v1.2.3-18-g5258 From 8c5eee30a942cb3154f14f12407755ed7da74bbc Mon Sep 17 00:00:00 2001 From: Mohammed Gamal Date: Sun, 8 Aug 2010 21:11:38 +0300 Subject: KVM: x86 emulator: Fix emulate_grp3 return values This patch lets emulate_grp3() return X86EMUL_* return codes instead of hardcoded ones. 
Signed-off-by: Mohammed Gamal Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index f0415eab659..8617c344405 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1424,9 +1424,9 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt, emulate_1op_rax_rdx("idiv", c->src, *rax, *rdx, ctxt->eflags); break; default: - return 0; + return X86EMUL_UNHANDLEABLE; } - return 1; + return X86EMUL_CONTINUE; } static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt, @@ -3172,7 +3172,7 @@ special_insn: ctxt->eflags ^= EFLG_CF; break; case 0xf6 ... 0xf7: /* Grp3 */ - if (!emulate_grp3(ctxt, ops)) + if (emulate_grp3(ctxt, ops) != X86EMUL_CONTINUE) goto cannot_emulate; break; case 0xf8: /* clc */ -- cgit v1.2.3-18-g5258 From d9574a25afc3cd7ccd6a0bc05252bb84189e4021 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 10 Aug 2010 13:48:22 +0800 Subject: KVM: x86 emulator: add bsf/bsr instruction emulation Add bsf/bsr instruction emulation (opcode 0x0f 0xbc~0xbd) Signed-off-by: Wei Yongjun Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 8617c344405..f6b124fcc3f 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2379,8 +2379,8 @@ static struct opcode twobyte_table[256] = { /* 0xB8 - 0xBF */ N, N, G(BitOp, group8), D(DstMem | SrcReg | ModRM | BitOp | Lock), - N, N, D(ByteOp | DstReg | SrcMem | ModRM | Mov), - D(DstReg | SrcMem16 | ModRM | Mov), + D(DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), + D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), /* 0xC0 - 0xCF */ N, N, N, D(DstMem | SrcReg | ModRM | Mov), N, N, N, GD(0, &group9), @@ -3511,6 +3511,30 @@ twobyte_insn: btc: /* btc */ emulate_2op_SrcV_nobyte("btc", c->src, c->dst, ctxt->eflags); break; + case 0xbc: { /* bsf */ + u8 zf; + __asm__ ("bsf %2, %0; setz %1" + : "=r"(c->dst.val), "=q"(zf) + : "r"(c->src.val)); + ctxt->eflags &= ~X86_EFLAGS_ZF; + if (zf) { + ctxt->eflags |= X86_EFLAGS_ZF; + c->dst.type = OP_NONE; /* Disable writeback. */ + } + break; + } + case 0xbd: { /* bsr */ + u8 zf; + __asm__ ("bsr %2, %0; setz %1" + : "=r"(c->dst.val), "=q"(zf) + : "r"(c->src.val)); + ctxt->eflags &= ~X86_EFLAGS_ZF; + if (zf) { + ctxt->eflags |= X86_EFLAGS_ZF; + c->dst.type = OP_NONE; /* Disable writeback. */ + } + break; + } case 0xbe ... 0xbf: /* movsx */ c->dst.bytes = c->op_bytes; c->dst.val = (c->d & ByteOp) ? (s8) c->src.val : -- cgit v1.2.3-18-g5258 From 8ec4722dd2aab9b69befb919549ea0a5bfc9e670 Mon Sep 17 00:00:00 2001 From: Mohammed Gamal Date: Mon, 16 Aug 2010 00:47:01 +0300 Subject: KVM: Separate emulation context initialization in a separate function The code for initializing the emulation context is duplicated at two locations (emulate_instruction() and kvm_task_switch()). Separate it in a separate function and call it from there. 
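The only dense part of the hoisted code is the ternary chain that selects the emulation mode; unrolled, it reads roughly as follows (emul_mode() is a hypothetical helper used here for readability, not something this patch adds):

    /* Sketch only: equivalent to the nested conditional in init_emulate_ctxt() */
    static int emul_mode(bool protmode, bool vm86, int cs_l, int cs_db)
    {
        if (!protmode)
            return X86EMUL_MODE_REAL;       /* CR0.PE clear          */
        if (vm86)
            return X86EMUL_MODE_VM86;       /* EFLAGS.VM set         */
        if (cs_l)
            return X86EMUL_MODE_PROT64;     /* 64-bit code segment   */
        if (cs_db)
            return X86EMUL_MODE_PROT32;     /* CS.D set              */
        return X86EMUL_MODE_PROT16;         /* 16-bit protected mode */
    }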
Signed-off-by: Mohammed Gamal Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 54 +++++++++++++++++++++++++----------------------------- 1 file changed, 25 insertions(+), 29 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 768197a34d3..c0004eb354d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3931,6 +3931,28 @@ static void inject_emulated_exception(struct kvm_vcpu *vcpu) kvm_queue_exception(vcpu, ctxt->exception); } +static void init_emulate_ctxt(struct kvm_vcpu *vcpu) +{ + struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; + int cs_db, cs_l; + + cache_all_regs(vcpu); + + kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); + + vcpu->arch.emulate_ctxt.vcpu = vcpu; + vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu); + vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu); + vcpu->arch.emulate_ctxt.mode = + (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : + (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) + ? X86EMUL_MODE_VM86 : cs_l + ? X86EMUL_MODE_PROT64 : cs_db + ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; + memset(c, 0, sizeof(struct decode_cache)); + memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); +} + static int handle_emulation_failure(struct kvm_vcpu *vcpu) { ++vcpu->stat.insn_emulation_fail; @@ -3987,20 +4009,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu, cache_all_regs(vcpu); if (!(emulation_type & EMULTYPE_NO_DECODE)) { - int cs_db, cs_l; - kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); - - vcpu->arch.emulate_ctxt.vcpu = vcpu; - vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu); - vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu); - vcpu->arch.emulate_ctxt.mode = - (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : - (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) - ? X86EMUL_MODE_VM86 : cs_l - ? X86EMUL_MODE_PROT64 : cs_db - ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; - memset(c, 0, sizeof(struct decode_cache)); - memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); + init_emulate_ctxt(vcpu); vcpu->arch.emulate_ctxt.interruptibility = 0; vcpu->arch.emulate_ctxt.exception = -1; vcpu->arch.emulate_ctxt.perm_ok = false; @@ -5052,22 +5061,9 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, bool has_error_code, u32 error_code) { struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; - int cs_db, cs_l, ret; - cache_all_regs(vcpu); - - kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); + int ret; - vcpu->arch.emulate_ctxt.vcpu = vcpu; - vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu); - vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu); - vcpu->arch.emulate_ctxt.mode = - (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : - (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) - ? X86EMUL_MODE_VM86 : cs_l - ? X86EMUL_MODE_PROT64 : cs_db - ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; - memset(c, 0, sizeof(struct decode_cache)); - memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); + init_emulate_ctxt(vcpu); ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, tss_selector, reason, has_error_code, -- cgit v1.2.3-18-g5258 From 31be40b3985f09c0c89b9e28a8206df32adba842 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 17 Aug 2010 09:17:30 +0800 Subject: KVM: x86 emulator: put register operand write back to a function Introduce function write_register_operand() to write back the register operand. 
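The size switch this moves into write_register_operand() encodes the x86-64 register write rules: 8- and 16-bit writes merge into the low bytes of the 64-bit register, while a 32-bit write zero-extends into the upper half. A standalone illustration (assumes a little-endian host, and uses memcpy rather than the emulator's pointer casts to keep the example well-defined):

    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>

    int main(void)
    {
        uint64_t reg = 0x1122334455667788ULL;
        uint16_t low16 = 0xbeef;

        memcpy(&reg, &low16, sizeof(low16));   /* 16-bit write keeps upper bytes */
        printf("after 16-bit write: %#llx\n", (unsigned long long)reg);

        reg = (uint32_t)0xdeadbeef;            /* 32-bit write zero-extends      */
        printf("after 32-bit write: %#llx\n", (unsigned long long)reg);
        return 0;
    }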
Signed-off-by: Wei Yongjun Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 55 +++++++++++++++++++++----------------------------- 1 file changed, 23 insertions(+), 32 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index f6b124fcc3f..003713041ce 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1020,6 +1020,25 @@ exception: return X86EMUL_PROPAGATE_FAULT; } +static void write_register_operand(struct operand *op) +{ + /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */ + switch (op->bytes) { + case 1: + *(u8 *)op->addr.reg = (u8)op->val; + break; + case 2: + *(u16 *)op->addr.reg = (u16)op->val; + break; + case 4: + *op->addr.reg = (u32)op->val; + break; /* 64b: zero-extend */ + case 8: + *op->addr.reg = op->val; + break; + } +} + static inline int writeback(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { @@ -1029,23 +1048,7 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt, switch (c->dst.type) { case OP_REG: - /* The 4-byte case *is* correct: - * in 64-bit mode we zero-extend. - */ - switch (c->dst.bytes) { - case 1: - *(u8 *)c->dst.addr.reg = (u8)c->dst.val; - break; - case 2: - *(u16 *)c->dst.addr.reg = (u16)c->dst.val; - break; - case 4: - *c->dst.addr.reg = (u32)c->dst.val; - break; /* 64b: zero-ext */ - case 8: - *c->dst.addr.reg = c->dst.val; - break; - } + write_register_operand(&c->dst); break; case OP_MEM: if (c->lock_prefix) @@ -2970,25 +2973,13 @@ special_insn: case 0x86 ... 0x87: /* xchg */ xchg: /* Write back the register source. */ - switch (c->dst.bytes) { - case 1: - *(u8 *) c->src.addr.reg = (u8) c->dst.val; - break; - case 2: - *(u16 *) c->src.addr.reg = (u16) c->dst.val; - break; - case 4: - *c->src.addr.reg = (u32) c->dst.val; - break; /* 64b reg: zero-extend */ - case 8: - *c->src.addr.reg = c->dst.val; - break; - } + c->src.val = c->dst.val; + write_register_operand(&c->src); /* * Write back the memory destination with implicit LOCK * prefix. */ - c->dst.val = c->src.val; + c->dst.val = c->src.orig_val; c->lock_prefix = 1; break; case 0x88 ... 0x8b: /* mov */ -- cgit v1.2.3-18-g5258 From 92f738a52b53dc13b5dd5753634bdb8c59ac9815 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 17 Aug 2010 09:19:34 +0800 Subject: KVM: x86 emulator: add XADD instruction emulation Add XADD instruction emulation (opcode 0x0f 0xc0~0xc1) Signed-off-by: Wei Yongjun Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 003713041ce..0c08bffe6cb 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2385,7 +2385,8 @@ static struct opcode twobyte_table[256] = { D(DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), /* 0xC0 - 0xCF */ - N, N, N, D(DstMem | SrcReg | ModRM | Mov), + D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), + N, D(DstMem | SrcReg | ModRM | Mov), N, N, N, GD(0, &group9), N, N, N, N, N, N, N, N, /* 0xD0 - 0xDF */ @@ -3531,6 +3532,12 @@ twobyte_insn: c->dst.val = (c->d & ByteOp) ? (s8) c->src.val : (s16) c->src.val; break; + case 0xc0 ... 0xc1: /* xadd */ + emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags); + /* Write back the register source. 
*/ + c->src.val = c->dst.orig_val; + write_register_operand(&c->src); + break; case 0xc3: /* movnti */ c->dst.bytes = c->op_bytes; c->dst.val = (c->op_bytes == 4) ? (u32) c->src.val : -- cgit v1.2.3-18-g5258 From ee45b58efebc826ea2ade310f6e311702d4a5ab9 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Fri, 6 Aug 2010 17:10:07 +0800 Subject: KVM: x86 emulator: add setcc instruction emulation Add setcc instruction emulation (opcode 0x0f 0x90~0x9f) Signed-off-by: Wei Yongjun Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 0c08bffe6cb..df349f376da 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2362,7 +2362,7 @@ static struct opcode twobyte_table[256] = { /* 0x80 - 0x8F */ X16(D(SrcImm)), /* 0x90 - 0x9F */ - N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, + X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)), /* 0xA0 - 0xA7 */ D(ImplicitOps | Stack), D(ImplicitOps | Stack), N, D(DstMem | SrcReg | ModRM | BitOp), @@ -3424,6 +3424,9 @@ twobyte_insn: if (test_cc(c->b, ctxt->eflags)) jmp_rel(c, c->src.val); break; + case 0x90 ... 0x9f: /* setcc r/m8 */ + c->dst.val = test_cc(c->b, ctxt->eflags); + break; case 0xa0: /* push fs */ emulate_push_sreg(ctxt, ops, VCPU_SREG_FS); break; -- cgit v1.2.3-18-g5258 From c483c02ad35256206d6c45d7170fef1e33a43e9c Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Fri, 6 Aug 2010 15:36:36 +0800 Subject: KVM: x86 emulator: remove useless label from x86_emulate_insn() Signed-off-by: Wei Yongjun Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index df349f376da..78541e8fd14 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2787,16 +2787,12 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt) c->eip = ctxt->eip; } - if (c->src.type == OP_MEM) { - if (c->d & NoAccess) - goto no_fetch; + if ((c->src.type == OP_MEM) && !(c->d & NoAccess)) { rc = read_emulated(ctxt, ops, c->src.addr.mem, c->src.valptr, c->src.bytes); if (rc != X86EMUL_CONTINUE) goto done; c->src.orig_val64 = c->src.val64; - no_fetch: - ; } if (c->src2.type == OP_MEM) { -- cgit v1.2.3-18-g5258 From 943858e27544cd10e6095093a40be911a31892b1 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Fri, 6 Aug 2010 11:36:51 +0800 Subject: KVM: x86 emulator: introduce DstImmUByte for dst operand decode Introduce DstImmUByte for dst operand decode, which will be used for out instruction. Signed-off-by: Wei Yongjun Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 78541e8fd14..dc074a0c60c 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -54,6 +54,7 @@ #define DstAcc (4<<1) /* Destination Accumulator */ #define DstDI (5<<1) /* Destination is in ES:(E)DI */ #define DstMem64 (6<<1) /* 64bit memory operand */ +#define DstImmUByte (7<<1) /* 8-bit unsigned immediate operand */ #define DstMask (7<<1) /* Source operand type. */ #define SrcNone (0<<4) /* No source operand. 
*/ @@ -2693,6 +2694,12 @@ done_prefixes: decode_register_operand(&c->dst, c, c->twobyte && (c->b == 0xb6 || c->b == 0xb7)); break; + case DstImmUByte: + c->dst.type = OP_IMM; + c->dst.addr.mem = c->eip; + c->dst.bytes = 1; + c->dst.val = insn_fetch(u8, 1, c->eip); + break; case DstMem: case DstMem64: c->dst = memop; -- cgit v1.2.3-18-g5258 From 41167be544603e077b866a2922737556dc2294e8 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Fri, 6 Aug 2010 11:45:12 +0800 Subject: KVM: x86 emulator: change OUT instruction to use dst instead of src Change OUT instruction to use dst instead of src, so we can reuse those code for all out instructions. Signed-off-by: Wei Yongjun Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index dc074a0c60c..8e12e1b11ff 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2321,12 +2321,12 @@ static struct opcode opcode_table[256] = { /* 0xE0 - 0xE7 */ N, N, N, N, D(ByteOp | SrcImmUByte | DstAcc), D(SrcImmUByte | DstAcc), - D(ByteOp | SrcImmUByte | DstAcc), D(SrcImmUByte | DstAcc), + D(ByteOp | SrcAcc | DstImmUByte), D(SrcAcc | DstImmUByte), /* 0xE8 - 0xEF */ D(SrcImm | Stack), D(SrcImm | ImplicitOps), D(SrcImmFAddr | No64), D(SrcImmByte | ImplicitOps), D(SrcNone | ByteOp | DstAcc), D(SrcNone | DstAcc), - D(SrcNone | ByteOp | DstAcc), D(SrcNone | DstAcc), + D(ByteOp | SrcAcc | ImplicitOps), D(SrcAcc | ImplicitOps), /* 0xF0 - 0xF7 */ N, N, N, N, D(ImplicitOps | Priv), D(ImplicitOps), G(ByteOp, group3), G(0, group3), @@ -3148,15 +3148,16 @@ special_insn: break; case 0xee: /* out dx,al */ case 0xef: /* out dx,(e/r)ax */ - c->src.val = c->regs[VCPU_REGS_RDX]; + c->dst.val = c->regs[VCPU_REGS_RDX]; do_io_out: - c->dst.bytes = min(c->dst.bytes, 4u); - if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) { + c->src.bytes = min(c->src.bytes, 4u); + if (!emulator_io_permited(ctxt, ops, c->dst.val, + c->src.bytes)) { emulate_gp(ctxt, 0); goto done; } - ops->pio_out_emulated(c->dst.bytes, c->src.val, &c->dst.val, 1, - ctxt->vcpu); + ops->pio_out_emulated(c->src.bytes, c->dst.val, + &c->src.val, 1, ctxt->vcpu); c->dst.type = OP_NONE; /* Disable writeback. 
*/ break; case 0xf4: /* hlt */ -- cgit v1.2.3-18-g5258 From a13a63faa6237001ed80d4f4051fc028dace10d9 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Fri, 6 Aug 2010 11:46:12 +0800 Subject: KVM: x86 emulator: remove dup code of in/out instruction Signed-off-by: Wei Yongjun Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 24 ++++-------------------- 1 file changed, 4 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 8e12e1b11ff..cffe7c2819e 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2923,28 +2923,12 @@ special_insn: break; case 0x6c: /* insb */ case 0x6d: /* insw/insd */ - c->dst.bytes = min(c->dst.bytes, 4u); - if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], - c->dst.bytes)) { - emulate_gp(ctxt, 0); - goto done; - } - if (!pio_in_emulated(ctxt, ops, c->dst.bytes, - c->regs[VCPU_REGS_RDX], &c->dst.val)) - goto done; /* IO is needed, skip writeback */ - break; + c->src.val = c->regs[VCPU_REGS_RDX]; + goto do_io_in; case 0x6e: /* outsb */ case 0x6f: /* outsw/outsd */ - c->src.bytes = min(c->src.bytes, 4u); - if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], - c->src.bytes)) { - emulate_gp(ctxt, 0); - goto done; - } - ops->pio_out_emulated(c->src.bytes, c->regs[VCPU_REGS_RDX], - &c->src.val, 1, ctxt->vcpu); - - c->dst.type = OP_NONE; /* nothing to writeback */ + c->dst.val = c->regs[VCPU_REGS_RDX]; + goto do_io_out; break; case 0x70 ... 0x7f: /* jcc (short) */ if (test_cc(c->b, ctxt->eflags)) -- cgit v1.2.3-18-g5258 From 5c56e1cf7a758c4772e2470b4346a8219ec7f44e Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 17 Aug 2010 11:17:51 +0300 Subject: KVM: x86 emulator: fix INTn emulation not pushing EFLAGS and CS emulate_push() only schedules a push; it doesn't actually push anything. Call writeback() to flush out the write. 
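For context, the sequence being completed is ordinary real-mode INT n entry: push FLAGS, clear IF/TF/AC, push CS, push IP, then load CS:IP from the interrupt vector table slot. Each scheduled push must actually reach guest memory before the next one is scheduled, hence the writeback() calls. A toy standalone model of the sequence (all names and values here are illustrative, not the emulator's):

    #include <stdio.h>
    #include <stdint.h>

    static uint16_t stack[8], sp = 8;
    static void push16(uint16_t v) { stack[--sp] = v; }

    int main(void)
    {
        uint16_t flags = 0x0202, cs = 0x1234, ip = 0x5678;
        uint16_t ivt[256][2] = { [0x10] = { 0x0ab0, 0xf000 } }; /* {ip, cs} */
        int n = 0x10;

        push16(flags);
        push16(cs);
        push16(ip);
        ip = ivt[n][0];
        cs = ivt[n][1];

        printf("new cs:ip = %04x:%04x, pushed %04x %04x %04x\n",
               cs, ip, stack[sp + 2], stack[sp + 1], stack[sp]);
        return 0;
    }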
Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index cffe7c2819e..b89a20ec7c9 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1232,7 +1232,7 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, int irq) { struct decode_cache *c = &ctxt->decode; - int rc = X86EMUL_CONTINUE; + int rc; struct desc_ptr dt; gva_t cs_addr; gva_t eip_addr; @@ -1242,14 +1242,25 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt, /* TODO: Add limit checks */ c->src.val = ctxt->eflags; emulate_push(ctxt, ops); + rc = writeback(ctxt, ops); + if (rc != X86EMUL_CONTINUE) + return rc; ctxt->eflags &= ~(EFLG_IF | EFLG_TF | EFLG_AC); c->src.val = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu); emulate_push(ctxt, ops); + rc = writeback(ctxt, ops); + if (rc != X86EMUL_CONTINUE) + return rc; c->src.val = c->eip; emulate_push(ctxt, ops); + rc = writeback(ctxt, ops); + if (rc != X86EMUL_CONTINUE) + return rc; + + c->dst.type = OP_NONE; ops->get_idt(&dt, ctxt->vcpu); -- cgit v1.2.3-18-g5258 From f6b33fc5046642b669c3197bf08639172e4cffad Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 17 Aug 2010 11:20:37 +0300 Subject: KVM: x86 emulator: implement SCAS (opcodes AE, AF) Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index b89a20ec7c9..09c9210db75 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2311,7 +2311,7 @@ static struct opcode opcode_table[256] = { D(DstAcc | SrcImmByte | ByteOp), D(DstAcc | SrcImm), D(ByteOp | SrcAcc | DstDI | Mov | String), D(SrcAcc | DstDI | Mov | String), D(ByteOp | SrcSI | DstAcc | Mov | String), D(SrcSI | DstAcc | Mov | String), - D(ByteOp | DstDI | String), D(DstDI | String), + D(ByteOp | SrcAcc | DstDI | String), D(SrcAcc | DstDI | String), /* 0xB0 - 0xB7 */ X8(D(ByteOp | DstReg | SrcImm | Mov)), /* 0xB8 - 0xBF */ @@ -3046,8 +3046,7 @@ special_insn: case 0xac ... 0xad: /* lods */ goto mov; case 0xae ... 0xaf: /* scas */ - DPRINTF("Urk! I don't handle SCAS.\n"); - goto cannot_emulate; + goto cmp; case 0xb0 ... 0xbf: /* mov r, imm */ goto mov; case 0xc0 ... 0xc1: -- cgit v1.2.3-18-g5258 From 0fa6ccbd281221bc7d46aff82d846e1f4c1985df Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 17 Aug 2010 11:22:17 +0300 Subject: KVM: x86 emulator: fix REPZ/REPNZ termination condition EFLAGS.ZF needs to be checked after each iteration, not before. Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 41 ++++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 09c9210db75..aab62d50752 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2781,28 +2781,10 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt) ctxt->restart = true; /* All REP prefixes have the same first termination condition */ if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) { - string_done: ctxt->restart = false; ctxt->eip = c->eip; goto done; } - /* The second termination condition only applies for REPE - * and REPNE. 
Test if the repeat string operation prefix is - * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the - * corresponding termination condition according to: - * - if REPE/REPZ and ZF = 0 then done - * - if REPNE/REPNZ and ZF = 1 then done - */ - if ((c->b == 0xa6) || (c->b == 0xa7) || - (c->b == 0xae) || (c->b == 0xaf)) { - if ((c->rep_prefix == REPE_PREFIX) && - ((ctxt->eflags & EFLG_ZF) == 0)) - goto string_done; - if ((c->rep_prefix == REPNE_PREFIX) && - ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)) - goto string_done; - } - c->eip = ctxt->eip; } if ((c->src.type == OP_MEM) && !(c->d & NoAccess)) { @@ -3229,20 +3211,37 @@ writeback: if (c->rep_prefix && (c->d & String)) { struct read_cache *rc = &ctxt->decode.io_read; register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1); + /* The second termination condition only applies for REPE + * and REPNE. Test if the repeat string operation prefix is + * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the + * corresponding termination condition according to: + * - if REPE/REPZ and ZF = 0 then done + * - if REPNE/REPNZ and ZF = 1 then done + */ + if (((c->b == 0xa6) || (c->b == 0xa7) || + (c->b == 0xae) || (c->b == 0xaf)) + && (((c->rep_prefix == REPE_PREFIX) && + ((ctxt->eflags & EFLG_ZF) == 0)) + || ((c->rep_prefix == REPNE_PREFIX) && + ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)))) + ctxt->restart = false; /* * Re-enter guest when pio read ahead buffer is empty or, * if it is not used, after each 1024 iteration. */ - if ((rc->end == 0 && !(c->regs[VCPU_REGS_RCX] & 0x3ff)) || - (rc->end != 0 && rc->end == rc->pos)) + else if ((rc->end == 0 && !(c->regs[VCPU_REGS_RCX] & 0x3ff)) || + (rc->end != 0 && rc->end == rc->pos)) { ctxt->restart = false; + c->eip = ctxt->eip; + } } /* * reset read cache here in case string instruction is restared * without decoding */ ctxt->decode.mem_read.end = 0; - ctxt->eip = c->eip; + if (!ctxt->restart) + ctxt->eip = c->eip; done: return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; -- cgit v1.2.3-18-g5258 From e8b6fa70e3545f0afd63434dbd0c5220d47205f6 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Wed, 18 Aug 2010 16:43:13 +0800 Subject: KVM: x86 emulator: add CBW/CWDE/CDQE instruction emulation Add CBW/CWDE/CDQE instruction emulation.(opcode 0x98) Used by FreeBSD's boot loader. 
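The three forms differ only in how far the sign of the low part of the accumulator is propagated, which is what the operand-size switch over the (s8)/(s16)/(s32) casts in the hunk below implements. A standalone sketch with AL = 0xf0, i.e. -16 (illustrative only):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        uint64_t rax = 0xf0;                      /* AL = 0xf0, i.e. -16    */

        uint16_t cbw  = (uint16_t)(int8_t)rax;    /* op_bytes 2: AL  -> AX  */
        uint32_t cwde = (uint32_t)(int16_t)rax;   /* op_bytes 4: AX  -> EAX */
        uint64_t cdqe = (uint64_t)(int32_t)rax;   /* op_bytes 8: EAX -> RAX */

        printf("cbw %#x  cwde %#x  cdqe %#llx\n", /* 0xfff0  0xf0  0xf0     */
               cbw, cwde, (unsigned long long)cdqe);
        return 0;
    }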
Signed-off-by: Wei Yongjun Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index aab62d50752..312dda57f93 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2300,7 +2300,7 @@ static struct opcode opcode_table[256] = { /* 0x90 - 0x97 */ X8(D(SrcAcc | DstReg)), /* 0x98 - 0x9F */ - N, N, D(SrcImmFAddr | No64), N, + D(DstAcc | SrcNone), N, D(SrcImmFAddr | No64), N, D(ImplicitOps | Stack), D(ImplicitOps | Stack), N, N, /* 0xA0 - 0xA7 */ D(ByteOp | DstAcc | SrcMem | Mov | MemAbs), D(DstAcc | SrcMem | Mov | MemAbs), @@ -3003,6 +3003,13 @@ special_insn: if (c->dst.addr.reg == &c->regs[VCPU_REGS_RAX]) break; goto xchg; + case 0x98: /* cbw/cwde/cdqe */ + switch (c->op_bytes) { + case 2: c->dst.val = (s8)c->dst.val; break; + case 4: c->dst.val = (s16)c->dst.val; break; + case 8: c->dst.val = (s32)c->dst.val; break; + } + break; case 0x9c: /* pushf */ c->src.val = (unsigned long) ctxt->eflags; emulate_push(ctxt, ops); -- cgit v1.2.3-18-g5258 From f2f31845341d22e4f20438b05e83d58e71b723b5 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Wed, 18 Aug 2010 16:38:21 +0800 Subject: KVM: x86 emulator: add LOOP/LOOPcc instruction emulation Add LOOP/LOOPcc instruction emulation (opcode 0xe0~0xe2). Signed-off-by: Wei Yongjun Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 312dda57f93..2f816edfe31 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2330,7 +2330,7 @@ static struct opcode opcode_table[256] = { /* 0xD8 - 0xDF */ N, N, N, N, N, N, N, N, /* 0xE0 - 0xE7 */ - N, N, N, N, + X3(D(SrcImmByte)), N, D(ByteOp | SrcImmUByte | DstAcc), D(SrcImmUByte | DstAcc), D(ByteOp | SrcAcc | DstImmUByte), D(SrcAcc | DstImmUByte), /* 0xE8 - 0xEF */ @@ -3084,6 +3084,12 @@ special_insn: c->src.val = c->regs[VCPU_REGS_RCX]; emulate_grp2(ctxt); break; + case 0xe0 ... 0xe2: /* loop/loopz/loopnz */ + register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1); + if (address_mask(c, c->regs[VCPU_REGS_RCX]) != 0 && + (c->b == 0xe2 || test_cc(c->b ^ 0x5, ctxt->eflags))) + jmp_rel(c, c->src.val); + break; case 0xe4: /* inb */ case 0xe5: /* in */ goto do_io_in; -- cgit v1.2.3-18-g5258 From b3b3d25a12986fb08666823db3e9a74649a71925 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 16 Aug 2010 17:49:52 +0300 Subject: KVM: x86 emulator: pass destination type to ____emulate_2op() We'll need it later so we can use a register for the destination. 
Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 2f816edfe31..7818c91deb6 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -194,13 +194,13 @@ struct group_dual { #define ON64(x) #endif -#define ____emulate_2op(_op, _src, _dst, _eflags, _x, _y, _suffix) \ +#define ____emulate_2op(_op, _src, _dst, _eflags, _x, _y, _suffix, _dsttype) \ do { \ __asm__ __volatile__ ( \ _PRE_EFLAGS("0", "4", "2") \ _op _suffix " %"_x"3,%1; " \ _POST_EFLAGS("0", "4", "2") \ - : "=m" (_eflags), "=m" ((_dst).val), \ + : "=m" (_eflags), "=m" (*(_dsttype*)&(_dst).val),\ "=&r" (_tmp) \ : _y ((_src).val), "i" (EFLAGS_MASK)); \ } while (0) @@ -213,13 +213,13 @@ struct group_dual { \ switch ((_dst).bytes) { \ case 2: \ - ____emulate_2op(_op,_src,_dst,_eflags,_wx,_wy,"w"); \ + ____emulate_2op(_op,_src,_dst,_eflags,_wx,_wy,"w",u16);\ break; \ case 4: \ - ____emulate_2op(_op,_src,_dst,_eflags,_lx,_ly,"l"); \ + ____emulate_2op(_op,_src,_dst,_eflags,_lx,_ly,"l",u32);\ break; \ case 8: \ - ON64(____emulate_2op(_op,_src,_dst,_eflags,_qx,_qy,"q")); \ + ON64(____emulate_2op(_op,_src,_dst,_eflags,_qx,_qy,"q",u64)); \ break; \ } \ } while (0) @@ -229,7 +229,7 @@ struct group_dual { unsigned long _tmp; \ switch ((_dst).bytes) { \ case 1: \ - ____emulate_2op(_op,_src,_dst,_eflags,_bx,_by,"b"); \ + ____emulate_2op(_op,_src,_dst,_eflags,_bx,_by,"b",u8); \ break; \ default: \ __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ -- cgit v1.2.3-18-g5258 From fb2c264105c64511dbd1a7488b482960895aace4 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 16 Aug 2010 17:50:56 +0300 Subject: KVM: x86 emulator: Use a register for ____emulate_2op() destination Most x86 two operand instructions allow the destination to be a memory operand, but IMUL (for example) requires that the destination be a register. Change ____emulate_2op() to take a register for both source and destination so we can invoke IMUL. 
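The switch from an "=m" memory output to a "+q" register input/output is what makes IMUL usable through the same macro, since two-operand IMUL accepts only a register destination. A minimal standalone illustration of that constraint (x86 inline asm; "+r" is used here for brevity where the macro needs the byte-capable "+q"):

    #include <stdio.h>

    static long imul2(long dst, long src)
    {
        /* Two-operand imul: the destination operand must be a register. */
        asm("imul %1, %0" : "+r"(dst) : "r"(src) : "cc");
        return dst;
    }

    int main(void)
    {
        printf("%ld\n", imul2(-6, 7));   /* -42 */
        return 0;
    }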
Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 7818c91deb6..81b0f884896 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -200,7 +200,7 @@ struct group_dual { _PRE_EFLAGS("0", "4", "2") \ _op _suffix " %"_x"3,%1; " \ _POST_EFLAGS("0", "4", "2") \ - : "=m" (_eflags), "=m" (*(_dsttype*)&(_dst).val),\ + : "=m" (_eflags), "+q" (*(_dsttype*)&(_dst).val),\ "=&r" (_tmp) \ : _y ((_src).val), "i" (EFLAGS_MASK)); \ } while (0) -- cgit v1.2.3-18-g5258 From 7af04fc05cc185869271927eb470de3d25064b4a Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 18 Aug 2010 14:16:35 +0300 Subject: KVM: x86 emulator: implement DAS (opcode 2F) Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 42 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 81b0f884896..83ded7c03d1 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2175,6 +2175,45 @@ static int em_push(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } +static int em_das(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + u8 al, old_al; + bool af, cf, old_cf; + + cf = ctxt->eflags & X86_EFLAGS_CF; + al = c->dst.val; + + old_al = al; + old_cf = cf; + cf = false; + af = ctxt->eflags & X86_EFLAGS_AF; + if ((al & 0x0f) > 9 || af) { + al -= 6; + cf = old_cf | (al >= 250); + af = true; + } else { + af = false; + } + if (old_al > 0x99 || old_cf) { + al -= 0x60; + cf = true; + } + + c->dst.val = al; + /* Set PF, ZF, SF */ + c->src.type = OP_IMM; + c->src.val = 0; + c->src.bytes = 1; + emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags); + ctxt->eflags &= ~(X86_EFLAGS_AF | X86_EFLAGS_CF); + if (cf) + ctxt->eflags |= X86_EFLAGS_CF; + if (af) + ctxt->eflags |= X86_EFLAGS_AF; + return X86EMUL_CONTINUE; +} + #define D(_y) { .flags = (_y) } #define N D(0) #define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) } @@ -2258,7 +2297,8 @@ static struct opcode opcode_table[256] = { /* 0x28 - 0x2F */ D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), - D(ByteOp | DstAcc | SrcImmByte), D(DstAcc | SrcImm), N, N, + D(ByteOp | DstAcc | SrcImmByte), D(DstAcc | SrcImm), + N, I(ByteOp | DstAcc | No64, em_das), /* 0x30 - 0x37 */ D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), -- cgit v1.2.3-18-g5258 From 0ef753b8c323f5b8d75d7dc57ceef6b35982afdb Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 18 Aug 2010 14:51:45 +0300 Subject: KVM: x86 emulator: implement CALL FAR (FF /3) Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 37 ++++++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 83ded7c03d1..31335779396 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2214,6 +2214,40 @@ static int em_das(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } +static int em_call_far(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + u16 sel, old_cs; + ulong old_eip; + int rc; + + old_cs = ctxt->ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu); + old_eip = c->eip; + + memcpy(&sel, 
c->src.valptr + c->op_bytes, 2); + if (load_segment_descriptor(ctxt, ctxt->ops, sel, VCPU_SREG_CS)) + return X86EMUL_CONTINUE; + + c->eip = 0; + memcpy(&c->eip, c->src.valptr, c->op_bytes); + + c->src.val = old_cs; + emulate_push(ctxt, ctxt->ops); + rc = writeback(ctxt, ctxt->ops); + if (rc != X86EMUL_CONTINUE) + return rc; + + c->src.val = old_eip; + emulate_push(ctxt, ctxt->ops); + rc = writeback(ctxt, ctxt->ops); + if (rc != X86EMUL_CONTINUE) + return rc; + + c->dst.type = OP_NONE; + + return X86EMUL_CONTINUE; +} + #define D(_y) { .flags = (_y) } #define N D(0) #define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) } @@ -2241,7 +2275,8 @@ static struct opcode group4[] = { static struct opcode group5[] = { D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock), - D(SrcMem | ModRM | Stack), N, + D(SrcMem | ModRM | Stack), + I(SrcMemFAddr | ModRM | ImplicitOps | Stack, em_call_far), D(SrcMem | ModRM | Stack), D(SrcMemFAddr | ModRM | ImplicitOps), D(SrcMem | ModRM | Stack), N, }; -- cgit v1.2.3-18-g5258 From b250e605895d02cede78922d034f7825af72a8b5 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 18 Aug 2010 15:11:24 +0300 Subject: KVM: x86 emulator: add SrcImmU16 operand type Used for RET NEAR instructions. Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 31335779396..db80e28471d 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -72,6 +72,7 @@ #define SrcImmFAddr (0xb<<4) /* Source is immediate far address */ #define SrcMemFAddr (0xc<<4) /* Source is far address in memory */ #define SrcAcc (0xd<<4) /* Source Accumulator */ +#define SrcImmU16 (0xe<<4) /* Immediate operand, unsigned, 16 bits */ #define SrcMask (0xf<<4) /* Generic ModRM decode. */ #define ModRM (1<<8) @@ -2678,13 +2679,17 @@ done_prefixes: srcmem_common: c->src = memop; break; + case SrcImmU16: + c->src.bytes = 2; + goto srcimm; case SrcImm: case SrcImmU: - c->src.type = OP_IMM; - c->src.addr.mem = c->eip; c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; if (c->src.bytes == 8) c->src.bytes = 4; + srcimm: + c->src.type = OP_IMM; + c->src.addr.mem = c->eip; /* NB. Immediates are sign-extended as necessary. 
*/ switch (c->src.bytes) { case 1: @@ -2697,7 +2702,8 @@ done_prefixes: c->src.val = insn_fetch(s32, 4, c->eip); break; } - if ((c->d & SrcMask) == SrcImmU) { + if ((c->d & SrcMask) == SrcImmU + || (c->d & SrcMask) == SrcImmU16) { switch (c->src.bytes) { case 1: c->src.val &= 0xff; -- cgit v1.2.3-18-g5258 From 40ece7c7297da90e54e147d3bfbb4531f9fbc570 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 18 Aug 2010 15:12:09 +0300 Subject: KVM: x86 emulator: implement RET imm16 (opcode C2) Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index db80e28471d..9e58f5054c3 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2249,6 +2249,21 @@ static int em_call_far(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } +static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + int rc; + + c->dst.type = OP_REG; + c->dst.addr.reg = &c->eip; + c->dst.bytes = c->op_bytes; + rc = emulate_pop(ctxt, ctxt->ops, &c->dst.val, c->op_bytes); + if (rc != X86EMUL_CONTINUE) + return rc; + register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->src.val); + return X86EMUL_CONTINUE; +} + #define D(_y) { .flags = (_y) } #define N D(0) #define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) } @@ -2394,7 +2409,9 @@ static struct opcode opcode_table[256] = { X8(D(DstReg | SrcImm | Mov)), /* 0xC0 - 0xC7 */ D(ByteOp | DstMem | SrcImm | ModRM), D(DstMem | SrcImmByte | ModRM), - N, D(ImplicitOps | Stack), N, N, + I(ImplicitOps | Stack | SrcImmU16, em_ret_near_imm), + D(ImplicitOps | Stack), + N, N, D(ByteOp | DstMem | SrcImm | ModRM | Mov), D(DstMem | SrcImm | ModRM | Mov), /* 0xC8 - 0xCF */ N, N, N, D(ImplicitOps | Stack), -- cgit v1.2.3-18-g5258 From f3a1b9f49647133e8c6eb6a68399ed8dbd61554a Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 18 Aug 2010 18:25:25 +0300 Subject: KVM: x86 emulator: implement IMUL REG, R/M, imm8 (opcode 6B) Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 9e58f5054c3..618386f8051 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2264,6 +2264,15 @@ static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } +static int em_imul_3op(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + c->dst.val = c->src2.val; + emulate_2op_SrcV_nobyte("imul", c->src, c->dst, ctxt->eflags); + return X86EMUL_CONTINUE; +} + #define D(_y) { .flags = (_y) } #define N D(0) #define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) } @@ -2371,7 +2380,8 @@ static struct opcode opcode_table[256] = { N, N, N, N, /* 0x68 - 0x6F */ I(SrcImm | Mov | Stack, em_push), N, - I(SrcImmByte | Mov | Stack, em_push), N, + I(SrcImmByte | Mov | Stack, em_push), + I(DstReg | SrcMem | ModRM | Src2ImmByte, em_imul_3op), D(DstDI | ByteOp | Mov | String), D(DstDI | Mov | String), /* insb, insw/insd */ D(SrcSI | ByteOp | ImplicitOps | String), D(SrcSI | ImplicitOps | String), /* outsb, outsw/outsd */ /* 0x70 - 0x7F */ -- cgit v1.2.3-18-g5258 From 5c82aa29988c0160d91f75cceebd0a07d8f2406b Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 18 Aug 2010 18:31:43 +0300 Subject: KVM: x86 emulator: implement IMUL REG, R/M (opcode 0F AF) Signed-off-by: Avi Kivity --- 
arch/x86/kvm/emulate.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 618386f8051..a4d2a469b4a 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2264,15 +2264,22 @@ static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } -static int em_imul_3op(struct x86_emulate_ctxt *ctxt) +static int em_imul(struct x86_emulate_ctxt *ctxt) { struct decode_cache *c = &ctxt->decode; - c->dst.val = c->src2.val; emulate_2op_SrcV_nobyte("imul", c->src, c->dst, ctxt->eflags); return X86EMUL_CONTINUE; } +static int em_imul_3op(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + c->dst.val = c->src2.val; + return em_imul(ctxt); +} + #define D(_y) { .flags = (_y) } #define N D(0) #define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) } @@ -2488,7 +2495,7 @@ static struct opcode twobyte_table[256] = { N, D(DstMem | SrcReg | ModRM | BitOp | Lock), D(DstMem | SrcReg | Src2ImmByte | ModRM), D(DstMem | SrcReg | Src2CL | ModRM), - D(ModRM), N, + D(ModRM), I(DstReg | SrcMem | ModRM, em_imul), /* 0xB0 - 0xB7 */ D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), N, D(DstMem | SrcReg | ModRM | BitOp | Lock), -- cgit v1.2.3-18-g5258 From 7077aec0bcd2f827aeb84ccc56c6f4367c376436 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 18 Aug 2010 18:53:43 +0300 Subject: KVM: x86 emulator: remove SrcImplicit Useless. Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index a4d2a469b4a..7f7fc646678 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -58,7 +58,6 @@ #define DstMask (7<<1) /* Source operand type. */ #define SrcNone (0<<4) /* No source operand. */ -#define SrcImplicit (0<<4) /* Source operand is implicit in the opcode. */ #define SrcReg (1<<4) /* Register operand. */ #define SrcMem (2<<4) /* Memory operand. */ #define SrcMem16 (3<<4) /* Memory operand (16-bit). 
*/ @@ -2435,7 +2434,7 @@ static struct opcode opcode_table[256] = { D(ImplicitOps), D(SrcImmByte), D(ImplicitOps | No64), D(ImplicitOps), /* 0xD0 - 0xD7 */ D(ByteOp | DstMem | SrcOne | ModRM), D(DstMem | SrcOne | ModRM), - D(ByteOp | DstMem | SrcImplicit | ModRM), D(DstMem | SrcImplicit | ModRM), + D(ByteOp | DstMem | ModRM), D(DstMem | ModRM), N, N, N, N, /* 0xD8 - 0xDF */ N, N, N, N, N, N, N, N, -- cgit v1.2.3-18-g5258 From 48bb5d3c401679e41e7a7f06ca31b3e54a6168f7 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 18 Aug 2010 18:54:34 +0300 Subject: KVM: x86 emulator: implement RDTSC (opcode 0F 31) Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 7f7fc646678..ed192d22020 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2279,6 +2279,22 @@ static int em_imul_3op(struct x86_emulate_ctxt *ctxt) return em_imul(ctxt); } +static int em_rdtsc(struct x86_emulate_ctxt *ctxt) +{ + unsigned cpl = ctxt->ops->cpl(ctxt->vcpu); + struct decode_cache *c = &ctxt->decode; + u64 tsc = 0; + + if (cpl > 0 && (ctxt->ops->get_cr(4, ctxt->vcpu) & X86_CR4_TSD)) { + emulate_gp(ctxt, 0); + return X86EMUL_PROPAGATE_FAULT; + } + ctxt->ops->get_msr(ctxt->vcpu, MSR_IA32_TSC, &tsc); + c->regs[VCPU_REGS_RAX] = (u32)tsc; + c->regs[VCPU_REGS_RDX] = tsc >> 32; + return X86EMUL_CONTINUE; +} + #define D(_y) { .flags = (_y) } #define N D(0) #define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) } @@ -2469,7 +2485,8 @@ static struct opcode twobyte_table[256] = { N, N, N, N, N, N, N, N, N, N, N, N, /* 0x30 - 0x3F */ - D(ImplicitOps | Priv), N, D(ImplicitOps | Priv), N, + D(ImplicitOps | Priv), I(ImplicitOps, em_rdtsc), + D(ImplicitOps | Priv), N, D(ImplicitOps), D(ImplicitOps | Priv), N, N, N, N, N, N, N, N, N, N, /* 0x40 - 0x4F */ -- cgit v1.2.3-18-g5258 From 39f21ee546cf7d563d813c5fb4473431c1d8fce7 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 18 Aug 2010 19:20:21 +0300 Subject: KVM: x86 emulator: consolidate immediate decode into a function Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 109 +++++++++++++++++++++++++++++-------------------- 1 file changed, 64 insertions(+), 45 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index ed192d22020..95543a6beb5 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2541,6 +2541,55 @@ static struct opcode twobyte_table[256] = { #undef GD #undef I +static unsigned imm_size(struct decode_cache *c) +{ + unsigned size; + + size = (c->d & ByteOp) ? 1 : c->op_bytes; + if (size == 8) + size = 4; + return size; +} + +static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op, + unsigned size, bool sign_extension) +{ + struct decode_cache *c = &ctxt->decode; + struct x86_emulate_ops *ops = ctxt->ops; + int rc = X86EMUL_CONTINUE; + + op->type = OP_IMM; + op->bytes = size; + op->addr.mem = c->eip; + /* NB. Immediates are sign-extended as necessary. 
*/ + switch (op->bytes) { + case 1: + op->val = insn_fetch(s8, 1, c->eip); + break; + case 2: + op->val = insn_fetch(s16, 2, c->eip); + break; + case 4: + op->val = insn_fetch(s32, 4, c->eip); + break; + } + if (!sign_extension) { + switch (op->bytes) { + case 1: + op->val &= 0xff; + break; + case 2: + op->val &= 0xffff; + break; + case 4: + op->val &= 0xffffffff; + break; + } + } +done: + return rc; +} + int x86_decode_insn(struct x86_emulate_ctxt *ctxt) { @@ -2730,52 +2779,19 @@ done_prefixes: c->src = memop; break; case SrcImmU16: - c->src.bytes = 2; - goto srcimm; + rc = decode_imm(ctxt, &c->src, 2, false); + break; case SrcImm: + rc = decode_imm(ctxt, &c->src, imm_size(c), true); + break; case SrcImmU: - c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - if (c->src.bytes == 8) - c->src.bytes = 4; - srcimm: - c->src.type = OP_IMM; - c->src.addr.mem = c->eip; - /* NB. Immediates are sign-extended as necessary. */ - switch (c->src.bytes) { - case 1: - c->src.val = insn_fetch(s8, 1, c->eip); - break; - case 2: - c->src.val = insn_fetch(s16, 2, c->eip); - break; - case 4: - c->src.val = insn_fetch(s32, 4, c->eip); - break; - } - if ((c->d & SrcMask) == SrcImmU - || (c->d & SrcMask) == SrcImmU16) { - switch (c->src.bytes) { - case 1: - c->src.val &= 0xff; - break; - case 2: - c->src.val &= 0xffff; - break; - case 4: - c->src.val &= 0xffffffff; - break; - } - } + rc = decode_imm(ctxt, &c->src, imm_size(c), false); break; case SrcImmByte: + rc = decode_imm(ctxt, &c->src, 1, true); + break; case SrcImmUByte: - c->src.type = OP_IMM; - c->src.addr.mem = c->eip; - c->src.bytes = 1; - if ((c->d & SrcMask) == SrcImmByte) - c->src.val = insn_fetch(s8, 1, c->eip); - else - c->src.val = insn_fetch(u8, 1, c->eip); + rc = decode_imm(ctxt, &c->src, 1, false); break; case SrcAcc: c->src.type = OP_REG; @@ -2807,6 +2823,9 @@ done_prefixes: break; } + if (rc != X86EMUL_CONTINUE) + goto done; + /* * Decode and fetch the second source operand: register, memory * or immediate. @@ -2819,10 +2838,7 @@ done_prefixes: c->src2.val = c->regs[VCPU_REGS_RCX] & 0x8; break; case Src2ImmByte: - c->src2.type = OP_IMM; - c->src2.addr.mem = c->eip; - c->src2.bytes = 1; - c->src2.val = insn_fetch(u8, 1, c->eip); + rc = decode_imm(ctxt, &c->src2, 1, true); break; case Src2One: c->src2.bytes = 1; @@ -2830,6 +2846,9 @@ done_prefixes: break; } + if (rc != X86EMUL_CONTINUE) + goto done; + /* Decode and fetch the destination operand: register or memory. */ switch (c->d & DstMask) { case DstReg: -- cgit v1.2.3-18-g5258 From 7db41eb76244ae623de842e818e459755968a33b Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 18 Aug 2010 19:25:28 +0300 Subject: KVM: x86 emulator: add Src2Imm decoding Needed for 3-operand IMUL. Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 95543a6beb5..f456d7e11b3 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -95,6 +95,7 @@ #define Src2CL (1<<29) #define Src2ImmByte (2<<29) #define Src2One (3<<29) +#define Src2Imm (4<<29) #define Src2Mask (7<<29) #define X2(x...) 
x, x @@ -2844,6 +2845,9 @@ done_prefixes: c->src2.bytes = 1; c->src2.val = 1; break; + case Src2Imm: + rc = decode_imm(ctxt, &c->src2, imm_size(c), true); + break; } if (rc != X86EMUL_CONTINUE) -- cgit v1.2.3-18-g5258 From d46164dbd936bc11c7d2abed62f05b31c7a79ae7 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 18 Aug 2010 19:29:33 +0300 Subject: KVM: x86 emulator: implement IMUL REG, R/M, IMM (opcode 69) Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index f456d7e11b3..55849c3d5d8 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2402,7 +2402,8 @@ static struct opcode opcode_table[256] = { N, D(DstReg | SrcMem32 | ModRM | Mov) /* movsxd (x86/64) */ , N, N, N, N, /* 0x68 - 0x6F */ - I(SrcImm | Mov | Stack, em_push), N, + I(SrcImm | Mov | Stack, em_push), + I(DstReg | SrcMem | ModRM | Src2Imm, em_imul_3op), I(SrcImmByte | Mov | Stack, em_push), I(DstReg | SrcMem | ModRM | Src2ImmByte, em_imul_3op), D(DstDI | ByteOp | Mov | String), D(DstDI | Mov | String), /* insb, insw/insd */ -- cgit v1.2.3-18-g5258 From 61429142802b068609ffd8ef48d891e05eeea0b9 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 19 Aug 2010 15:13:00 +0300 Subject: KVM: x86 emulator: implement CWD (opcode 99) Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 55849c3d5d8..e257f228686 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2280,6 +2280,18 @@ static int em_imul_3op(struct x86_emulate_ctxt *ctxt) return em_imul(ctxt); } +static int em_cwd(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + c->dst.type = OP_REG; + c->dst.bytes = c->src.bytes; + c->dst.addr.reg = &c->regs[VCPU_REGS_RDX]; + c->dst.val = ~((c->src.val >> (c->src.bytes * 8 - 1)) - 1); + + return X86EMUL_CONTINUE; +} + static int em_rdtsc(struct x86_emulate_ctxt *ctxt) { unsigned cpl = ctxt->ops->cpl(ctxt->vcpu); @@ -2425,7 +2437,8 @@ static struct opcode opcode_table[256] = { /* 0x90 - 0x97 */ X8(D(SrcAcc | DstReg)), /* 0x98 - 0x9F */ - D(DstAcc | SrcNone), N, D(SrcImmFAddr | No64), N, + D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd), + D(SrcImmFAddr | No64), N, D(ImplicitOps | Stack), D(ImplicitOps | Stack), N, N, /* 0xA0 - 0xA7 */ D(ByteOp | DstAcc | SrcMem | Mov | MemAbs), D(DstAcc | SrcMem | Mov | MemAbs), -- cgit v1.2.3-18-g5258 From e0df7b9f6cee43c01d6f4a8491bccfd410cb86e1 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 19 Aug 2010 18:11:05 -0700 Subject: KVM: abstract kvm x86 mmu->n_free_mmu_pages "free" is a poor name for this value. In this context, it means, "the number of mmu pages which this kvm instance should be able to allocate." But "free" implies much more that the objects are there and ready for use. "available" is a much better description, especially when you see how it is calculated. In this patch, we abstract its use into a function. We'll soon replace the function's contents by calculating the value in a different way. All of the reads of n_free_mmu_pages are taken care of in this patch. The modification sites will be handled in a patch later in the series. 
Signed-off-by: Dave Hansen Signed-off-by: Tim Pepper Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 11 ++++------- arch/x86/kvm/mmu.h | 7 ++++++- 2 files changed, 10 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ff95d418750..625b1789466 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1696,7 +1696,7 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) int used_pages; LIST_HEAD(invalid_list); - used_pages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages; + used_pages = kvm->arch.n_alloc_mmu_pages - kvm_mmu_available_pages(kvm); used_pages = max(0, used_pages); /* @@ -2959,18 +2959,15 @@ EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt); void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) { - int free_pages; LIST_HEAD(invalid_list); - free_pages = vcpu->kvm->arch.n_free_mmu_pages; - while (free_pages < KVM_REFILL_PAGES && + while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES && !list_empty(&vcpu->kvm->arch.active_mmu_pages)) { struct kvm_mmu_page *sp; sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev, struct kvm_mmu_page, link); - free_pages += kvm_mmu_prepare_zap_page(vcpu->kvm, sp, - &invalid_list); + kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); ++vcpu->kvm->stat.mmu_recycled; } kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); @@ -3145,7 +3142,7 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) idx = srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); npages = kvm->arch.n_alloc_mmu_pages - - kvm->arch.n_free_mmu_pages; + kvm_mmu_available_pages(kvm); cache_count += npages; if (!kvm_freed && nr_to_scan > 0 && npages > 0) { freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm, diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index be66759321a..c3a689ae7df 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -50,9 +50,14 @@ int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]); +static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) +{ + return kvm->arch.n_free_mmu_pages; +} + static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) { - if (unlikely(vcpu->kvm->arch.n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES)) + if (unlikely(kvm_mmu_available_pages(vcpu->kvm)< KVM_MIN_FREE_MMU_PAGES)) __kvm_mmu_free_some_pages(vcpu); } -- cgit v1.2.3-18-g5258 From 39de71ec5397f374aed95e99509372d605e1407c Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 19 Aug 2010 18:11:14 -0700 Subject: KVM: rename x86 kvm->arch.n_alloc_mmu_pages arch.n_alloc_mmu_pages is a poor choice of name. This value truly means, "the number of pages which _may_ be allocated". But, reading the name, "n_alloc_mmu_pages" implies "the number of allocated mmu pages", which is dead wrong. It's really the high watermark, so let's give it a name to match: nr_max_mmu_pages. This change will make the next few patches much more obvious and easy to read. 
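Purely as a sketch of the semantics after the rename (kvm_mmu_used_pages() is a hypothetical helper name, not in the patch), n_max_mmu_pages acts as a high watermark and the number of pages currently in use becomes a derived quantity:

	static inline unsigned int kvm_mmu_used_pages(struct kvm *kvm)
	{
		/* high watermark minus whatever is still available */
		return kvm->arch.n_max_mmu_pages - kvm_mmu_available_pages(kvm);
	}
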
Signed-off-by: Dave Hansen Signed-off-by: Tim Pepper Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/mmu.c | 8 ++++---- arch/x86/kvm/x86.c | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c52e2eb40a1..02963684cd2 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -369,7 +369,7 @@ struct kvm_vcpu_arch { struct kvm_arch { unsigned int n_free_mmu_pages; unsigned int n_requested_mmu_pages; - unsigned int n_alloc_mmu_pages; + unsigned int n_max_mmu_pages; atomic_t invlpg_counter; struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; /* diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 625b1789466..6979e7d1464 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1696,7 +1696,7 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) int used_pages; LIST_HEAD(invalid_list); - used_pages = kvm->arch.n_alloc_mmu_pages - kvm_mmu_available_pages(kvm); + used_pages = kvm->arch.n_max_mmu_pages - kvm_mmu_available_pages(kvm); used_pages = max(0, used_pages); /* @@ -1721,9 +1721,9 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) } else kvm->arch.n_free_mmu_pages += kvm_nr_mmu_pages - - kvm->arch.n_alloc_mmu_pages; + - kvm->arch.n_max_mmu_pages; - kvm->arch.n_alloc_mmu_pages = kvm_nr_mmu_pages; + kvm->arch.n_max_mmu_pages = kvm_nr_mmu_pages; } static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) @@ -3141,7 +3141,7 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) idx = srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); - npages = kvm->arch.n_alloc_mmu_pages - + npages = kvm->arch.n_max_mmu_pages - kvm_mmu_available_pages(kvm); cache_count += npages; if (!kvm_freed && nr_to_scan > 0 && npages > 0) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c0004eb354d..4b4d2836240 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2759,7 +2759,7 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm) { - return kvm->arch.n_alloc_mmu_pages; + return kvm->arch.n_max_mmu_pages; } static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) -- cgit v1.2.3-18-g5258 From 49d5ca26636cb8feb05aff92fc4dba3e494ec683 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 19 Aug 2010 18:11:28 -0700 Subject: KVM: replace x86 kvm n_free_mmu_pages with n_used_mmu_pages Doing this makes the code much more readable. That's borne out by the fact that this patch removes code. "used" also happens to be the number that we need to return back to the slab code when our shrinker gets called. Keeping this value as opposed to free makes the next patch simpler. So, 'struct kvm' is kzalloc()'d. 'struct kvm_arch' is a structure member (and not a pointer) of 'struct kvm'. That means they start out zeroed. I _think_ they get initialized properly by kvm_mmu_change_mmu_pages(). But, that only happens via kvm ioctls. Another benefit of storing 'used' intead of 'free' is that the values are consistent from the moment the structure is allocated: no negative "used" value. 
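A minimal sketch of the arithmetic this buys, mirroring the mmu.h hunk below: with "used" stored instead of "free", a freshly kzalloc()'d kvm_arch is consistent from the start and "available" is computed on demand:

	static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
	{
		/* both fields start at zero, so this can never go negative */
		return kvm->arch.n_max_mmu_pages - kvm->arch.n_used_mmu_pages;
	}
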
Signed-off-by: Dave Hansen Signed-off-by: Tim Pepper Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/mmu.c | 27 +++++++++------------------ arch/x86/kvm/mmu.h | 3 ++- 3 files changed, 12 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 02963684cd2..e01b7282556 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -367,7 +367,7 @@ struct kvm_vcpu_arch { }; struct kvm_arch { - unsigned int n_free_mmu_pages; + unsigned int n_used_mmu_pages; unsigned int n_requested_mmu_pages; unsigned int n_max_mmu_pages; atomic_t invlpg_counter; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 6979e7d1464..ff39b85d7a4 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -980,7 +980,7 @@ static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) if (!sp->role.direct) __free_page(virt_to_page(sp->gfns)); kmem_cache_free(mmu_page_header_cache, sp); - ++kvm->arch.n_free_mmu_pages; + --kvm->arch.n_used_mmu_pages; } static unsigned kvm_page_table_hashfn(gfn_t gfn) @@ -1003,7 +1003,7 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); sp->multimapped = 0; sp->parent_pte = parent_pte; - --vcpu->kvm->arch.n_free_mmu_pages; + ++vcpu->kvm->arch.n_used_mmu_pages; return sp; } @@ -1689,41 +1689,32 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, /* * Changing the number of mmu pages allocated to the vm - * Note: if kvm_nr_mmu_pages is too small, you will get dead lock + * Note: if goal_nr_mmu_pages is too small, you will get dead lock */ -void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) +void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages) { - int used_pages; LIST_HEAD(invalid_list); - - used_pages = kvm->arch.n_max_mmu_pages - kvm_mmu_available_pages(kvm); - used_pages = max(0, used_pages); - /* * If we set the number of mmu pages to be smaller be than the * number of actived pages , we must to free some mmu pages before we * change the value */ - if (used_pages > kvm_nr_mmu_pages) { - while (used_pages > kvm_nr_mmu_pages && + if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) { + while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages && !list_empty(&kvm->arch.active_mmu_pages)) { struct kvm_mmu_page *page; page = container_of(kvm->arch.active_mmu_pages.prev, struct kvm_mmu_page, link); - used_pages -= kvm_mmu_prepare_zap_page(kvm, page, + kvm_mmu_prepare_zap_page(kvm, page, &invalid_list); } kvm_mmu_commit_zap_page(kvm, &invalid_list); - kvm_nr_mmu_pages = used_pages; - kvm->arch.n_free_mmu_pages = 0; + goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages; } - else - kvm->arch.n_free_mmu_pages += kvm_nr_mmu_pages - - kvm->arch.n_max_mmu_pages; - kvm->arch.n_max_mmu_pages = kvm_nr_mmu_pages; + kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages; } static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index c3a689ae7df..f05a03dfba4 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -52,7 +52,8 @@ int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]); static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) { - return kvm->arch.n_free_mmu_pages; + return kvm->arch.n_max_mmu_pages - + kvm->arch.n_used_mmu_pages; } static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) -- cgit 
v1.2.3-18-g5258 From 45221ab6684a82a5b60208b76d6f8bfb1bbcb969 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 19 Aug 2010 18:11:37 -0700 Subject: KVM: create aggregate kvm_total_used_mmu_pages value Of slab shrinkers, the VM code says: * Note that 'shrink' will be passed nr_to_scan == 0 when the VM is * querying the cache size, so a fastpath for that case is appropriate. and it *means* it. Look at how it calls the shrinkers: nr_before = (*shrinker->shrink)(0, gfp_mask); shrink_ret = (*shrinker->shrink)(this_scan, gfp_mask); So, if you do anything stupid in your shrinker, the VM will doubly punish you. The mmu_shrink() function takes the global kvm_lock, then acquires every VM's kvm->mmu_lock in sequence. If we have 100 VMs, then we're going to take 101 locks. We do it twice, so each call takes 202 locks. If we're under memory pressure, we can have each cpu trying to do this. It can get really hairy, and we've seen lock spinning in mmu_shrink() be the dominant entry in profiles. This is guaranteed to optimize at least half of those lock aquisitions away. It removes the need to take any of the locks when simply trying to count objects. A 'percpu_counter' can be a large object, but we only have one of these for the entire system. There are not any better alternatives at the moment, especially ones that handle CPU hotplug. Signed-off-by: Dave Hansen Signed-off-by: Tim Pepper Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ff39b85d7a4..33d7af50cf8 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -178,6 +178,7 @@ typedef void (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp, u64 *spte); static struct kmem_cache *pte_chain_cache; static struct kmem_cache *rmap_desc_cache; static struct kmem_cache *mmu_page_header_cache; +static struct percpu_counter kvm_total_used_mmu_pages; static u64 __read_mostly shadow_trap_nonpresent_pte; static u64 __read_mostly shadow_notrap_nonpresent_pte; @@ -971,6 +972,18 @@ static int is_empty_shadow_page(u64 *spt) } #endif +/* + * This value is the sum of all of the kvm instances's + * kvm->arch.n_used_mmu_pages values. 
We need a global, + * aggregate version in order to make the slab shrinker + * faster + */ +static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr) +{ + kvm->arch.n_used_mmu_pages += nr; + percpu_counter_add(&kvm_total_used_mmu_pages, nr); +} + static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) { ASSERT(is_empty_shadow_page(sp->spt)); @@ -980,7 +993,7 @@ static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) if (!sp->role.direct) __free_page(virt_to_page(sp->gfns)); kmem_cache_free(mmu_page_header_cache, sp); - --kvm->arch.n_used_mmu_pages; + kvm_mod_used_mmu_pages(kvm, -1); } static unsigned kvm_page_table_hashfn(gfn_t gfn) @@ -1003,7 +1016,7 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); sp->multimapped = 0; sp->parent_pte = parent_pte; - ++vcpu->kvm->arch.n_used_mmu_pages; + kvm_mod_used_mmu_pages(vcpu->kvm, +1); return sp; } @@ -3122,23 +3135,22 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) { struct kvm *kvm; struct kvm *kvm_freed = NULL; - int cache_count = 0; + + if (nr_to_scan == 0) + goto out; spin_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { - int npages, idx, freed_pages; + int idx, freed_pages; LIST_HEAD(invalid_list); idx = srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); - npages = kvm->arch.n_max_mmu_pages - - kvm_mmu_available_pages(kvm); - cache_count += npages; - if (!kvm_freed && nr_to_scan > 0 && npages > 0) { + if (!kvm_freed && nr_to_scan > 0 && + kvm->arch.n_used_mmu_pages > 0) { freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm, &invalid_list); - cache_count -= freed_pages; kvm_freed = kvm; } nr_to_scan--; @@ -3152,7 +3164,8 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) spin_unlock(&kvm_lock); - return cache_count; +out: + return percpu_counter_read_positive(&kvm_total_used_mmu_pages); } static struct shrinker mmu_shrinker = { @@ -3195,6 +3208,7 @@ int kvm_mmu_module_init(void) if (!mmu_page_header_cache) goto nomem; + percpu_counter_init(&kvm_total_used_mmu_pages, 0); register_shrinker(&mmu_shrinker); return 0; -- cgit v1.2.3-18-g5258 From 09b5f4d3c4aa2d4928c0a3723a8de26a76b6339e Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Mon, 23 Aug 2010 14:56:54 +0800 Subject: KVM: x86 emulator: add LDS/LES/LFS/LGS/LSS instruction emulation Add LDS/LES/LFS/LGS/LSS instruction emulation. 
(opcode 0xc4, 0xc5, 0x0f 0xb2, 0x0f 0xb4~0xb5) Signed-off-by: Wei Yongjun Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 46 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index e257f228686..aece501edce 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1514,6 +1514,23 @@ static int emulate_ret_far(struct x86_emulate_ctxt *ctxt, return rc; } +static int emulate_load_segment(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, int seg) +{ + struct decode_cache *c = &ctxt->decode; + unsigned short sel; + int rc; + + memcpy(&sel, c->src.valptr + c->op_bytes, 2); + + rc = load_segment_descriptor(ctxt, ops, sel, seg); + if (rc != X86EMUL_CONTINUE) + return rc; + + c->dst.val = c->src.val; + return rc; +} + static inline void setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, struct desc_struct *cs, @@ -2458,7 +2475,7 @@ static struct opcode opcode_table[256] = { D(ByteOp | DstMem | SrcImm | ModRM), D(DstMem | SrcImmByte | ModRM), I(ImplicitOps | Stack | SrcImmU16, em_ret_near_imm), D(ImplicitOps | Stack), - N, N, + D(DstReg | SrcMemFAddr | ModRM | No64), D(DstReg | SrcMemFAddr | ModRM | No64), D(ByteOp | DstMem | SrcImm | ModRM | Mov), D(DstMem | SrcImm | ModRM | Mov), /* 0xC8 - 0xCF */ N, N, N, D(ImplicitOps | Stack), @@ -2529,9 +2546,9 @@ static struct opcode twobyte_table[256] = { D(ModRM), I(DstReg | SrcMem | ModRM, em_imul), /* 0xB0 - 0xB7 */ D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), - N, D(DstMem | SrcReg | ModRM | BitOp | Lock), - N, N, D(ByteOp | DstReg | SrcMem | ModRM | Mov), - D(DstReg | SrcMem16 | ModRM | Mov), + D(DstReg | SrcMemFAddr | ModRM), D(DstMem | SrcReg | ModRM | BitOp | Lock), + D(DstReg | SrcMemFAddr | ModRM), D(DstReg | SrcMemFAddr | ModRM), + D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), /* 0xB8 - 0xBF */ N, N, G(BitOp, group8), D(DstMem | SrcReg | ModRM | BitOp | Lock), @@ -3214,6 +3231,16 @@ special_insn: c->dst.addr.reg = &c->eip; c->dst.bytes = c->op_bytes; goto pop_instruction; + case 0xc4: /* les */ + rc = emulate_load_segment(ctxt, ops, VCPU_SREG_ES); + if (rc != X86EMUL_CONTINUE) + goto done; + break; + case 0xc5: /* lds */ + rc = emulate_load_segment(ctxt, ops, VCPU_SREG_DS); + if (rc != X86EMUL_CONTINUE) + goto done; + break; case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */ mov: c->dst.val = c->src.val; @@ -3659,10 +3686,25 @@ twobyte_insn: c->dst.addr.reg = (unsigned long *)&c->regs[VCPU_REGS_RAX]; } break; + case 0xb2: /* lss */ + rc = emulate_load_segment(ctxt, ops, VCPU_SREG_SS); + if (rc != X86EMUL_CONTINUE) + goto done; + break; case 0xb3: btr: /* btr */ emulate_2op_SrcV_nobyte("btr", c->src, c->dst, ctxt->eflags); break; + case 0xb4: /* lfs */ + rc = emulate_load_segment(ctxt, ops, VCPU_SREG_FS); + if (rc != X86EMUL_CONTINUE) + goto done; + break; + case 0xb5: /* lgs */ + rc = emulate_load_segment(ctxt, ops, VCPU_SREG_GS); + if (rc != X86EMUL_CONTINUE) + goto done; + break; case 0xb6 ... 0xb7: /* movzx */ c->dst.bytes = c->op_bytes; c->dst.val = (c->d & ByteOp) ? (u8) c->src.val -- cgit v1.2.3-18-g5258 From e4abac67b756680c63af369f053d11991616aeb4 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Thu, 19 Aug 2010 14:25:48 +0800 Subject: KVM: x86 emulator: add JrCXZ instruction emulation Add JrCXZ instruction emulation (opcode 0xe3) Used by FreeBSD boot loader. 
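As a rough standalone illustration of the jCXZ/jECXZ/jRCXZ condition the patch below implements (jrcxz_taken() and ad_bytes are made-up names for this sketch, not emulator code): the count register is truncated to the effective address size before the zero test, which is what address_mask() does in the real hunk.

	static int jrcxz_taken(unsigned long long rcx, int ad_bytes)
	{
		unsigned long long mask = ad_bytes == 8 ? ~0ULL :
					  ad_bytes == 4 ? 0xffffffffULL : 0xffffULL;

		/* branch taken only when the masked (r/e)CX value is zero */
		return (rcx & mask) == 0;
	}
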
Signed-off-by: Wei Yongjun Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index aece501edce..312e798d542 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2487,7 +2487,7 @@ static struct opcode opcode_table[256] = { /* 0xD8 - 0xDF */ N, N, N, N, N, N, N, N, /* 0xE0 - 0xE7 */ - X3(D(SrcImmByte)), N, + X4(D(SrcImmByte)), D(ByteOp | SrcImmUByte | DstAcc), D(SrcImmUByte | DstAcc), D(ByteOp | SrcAcc | DstImmUByte), D(SrcAcc | DstImmUByte), /* 0xE8 - 0xEF */ @@ -3285,6 +3285,10 @@ special_insn: (c->b == 0xe2 || test_cc(c->b ^ 0x5, ctxt->eflags))) jmp_rel(c, c->src.val); break; + case 0xe3: /* jcxz/jecxz/jrcxz */ + if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) + jmp_rel(c, c->src.val); + break; case 0xe4: /* inb */ case 0xe5: /* in */ goto do_io_in; -- cgit v1.2.3-18-g5258 From 80b63faf028fba79e630d3643b0e615bddf4067b Mon Sep 17 00:00:00 2001 From: Xiaotian Feng Date: Tue, 24 Aug 2010 10:31:07 +0800 Subject: KVM: MMU: fix regression from rework mmu_shrink() code Latest kvm mmu_shrink code rework makes kernel changes kvm->arch.n_used_mmu_pages/ kvm->arch.n_max_mmu_pages at kvm_mmu_free_page/kvm_mmu_alloc_page, which is called by kvm_mmu_commit_zap_page. So the kvm->arch.n_used_mmu_pages or kvm_mmu_available_pages(vcpu->kvm) is unchanged after kvm_mmu_prepare_zap_page(), This caused kvm_mmu_change_mmu_pages/__kvm_mmu_free_some_pages loops forever. Moving kvm_mmu_commit_zap_page would make the while loop performs as normal. Reported-by: Avi Kivity Signed-off-by: Xiaotian Feng Tested-by: Avi Kivity Cc: Marcelo Tosatti Cc: Dave Hansen Cc: Tim Pepper Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 33d7af50cf8..c2ac7004441 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1720,10 +1720,9 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages) page = container_of(kvm->arch.active_mmu_pages.prev, struct kvm_mmu_page, link); - kvm_mmu_prepare_zap_page(kvm, page, - &invalid_list); + kvm_mmu_prepare_zap_page(kvm, page, &invalid_list); + kvm_mmu_commit_zap_page(kvm, &invalid_list); } - kvm_mmu_commit_zap_page(kvm, &invalid_list); goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages; } @@ -2972,9 +2971,9 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev, struct kvm_mmu_page, link); kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); + kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); ++vcpu->kvm->stat.mmu_recycled; } - kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); } int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code) -- cgit v1.2.3-18-g5258 From 45bf21a8ce7a2884f067a702a5c7683684846ce1 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Mon, 23 Aug 2010 16:13:15 +0800 Subject: KVM: MMU: fix missing percpu counter destroy commit ad05c88266b4cce1c820928ce8a0fb7690912ba1 (KVM: create aggregate kvm_total_used_mmu_pages value) introduce percpu counter kvm_total_used_mmu_pages but never destroy it, this may cause oops when rmmod & modprobe. 
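A minimal sketch of the init/exit pairing the fix below restores (the example_* module names are hypothetical): every percpu_counter_init() needs a matching percpu_counter_destroy() on the exit and error paths, otherwise the counter can be left registered after the module is gone and a later reload can oops.

	#include <linux/module.h>
	#include <linux/percpu_counter.h>

	static struct percpu_counter example_counter;

	static int __init example_init(void)
	{
		/* percpu_counter_init() can fail, so check it, as the fix does */
		if (percpu_counter_init(&example_counter, 0))
			return -ENOMEM;
		return 0;
	}

	static void __exit example_exit(void)
	{
		/* the counterpart that was missing before this fix */
		percpu_counter_destroy(&example_counter);
	}

	module_init(example_init);
	module_exit(example_exit);
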
Signed-off-by: Wei Yongjun Acked-by: Tim Pepper Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index c2ac7004441..54a50268ceb 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3185,6 +3185,7 @@ static void mmu_destroy_caches(void) void kvm_mmu_module_exit(void) { mmu_destroy_caches(); + percpu_counter_destroy(&kvm_total_used_mmu_pages); unregister_shrinker(&mmu_shrinker); } @@ -3207,7 +3208,9 @@ int kvm_mmu_module_init(void) if (!mmu_page_header_cache) goto nomem; - percpu_counter_init(&kvm_total_used_mmu_pages, 0); + if (percpu_counter_init(&kvm_total_used_mmu_pages, 0)) + goto nomem; + register_shrinker(&mmu_shrinker); return 0; -- cgit v1.2.3-18-g5258 From ae38436b78a8abff767e2ac10e2cd663a7eef476 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Thu, 19 Aug 2010 22:07:15 -1000 Subject: KVM: x86: Drop vm_init_tsc This is used only by the VMX code, and is not done properly; if the TSC is indeed backwards, it is out of sync, and will need proper handling in the logic at each and every CPU change. For now, drop this test during init as misguided. Signed-off-by: Zachary Amsden Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/vmx.c | 10 +++------- arch/x86/kvm/x86.c | 2 -- 3 files changed, 3 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index e01b7282556..6056a23dc4c 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -394,7 +394,6 @@ struct kvm_arch { gpa_t ept_identity_map_addr; unsigned long irq_sources_bitmap; - u64 vm_init_tsc; s64 kvmclock_offset; struct kvm_xen_hvm_config xen_hvm_config; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 751a2d29f4c..4fbab2469bf 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2512,7 +2512,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) { u32 host_sysenter_cs, msr_low, msr_high; u32 junk; - u64 host_pat, tsc_this, tsc_base; + u64 host_pat, tsc_this; unsigned long a; struct desc_ptr dt; int i; @@ -2653,12 +2653,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE; vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits); - tsc_base = vmx->vcpu.kvm->arch.vm_init_tsc; - rdtscll(tsc_this); - if (tsc_this < vmx->vcpu.kvm->arch.vm_init_tsc) - tsc_base = tsc_this; - - guest_write_tsc(0, tsc_base); + tsc_this = native_read_tsc(); + guest_write_tsc(0, tsc_this); return 0; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4b4d2836240..8b0c51a1ada 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5495,8 +5495,6 @@ struct kvm *kvm_arch_create_vm(void) /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); - rdtscll(kvm->arch.vm_init_tsc); - return kvm; } -- cgit v1.2.3-18-g5258 From f4e1b3c8bd2a044cd0ccf80595bfd088a49fe60b Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Thu, 19 Aug 2010 22:07:16 -1000 Subject: KVM: x86: Convert TSC writes to TSC offset writes Change svm / vmx to be the same internally and write TSC offset instead of bare TSC in helper functions. Isolated as a single patch to contain code movement. 
Signed-off-by: Zachary Amsden Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/svm.c | 31 +++++++++++++++++-------------- arch/x86/kvm/vmx.c | 11 +++++------ 2 files changed, 22 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index af5b9ea5196..e06f00d1f15 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -701,6 +701,20 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type) seg->base = 0; } +static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) +{ + struct vcpu_svm *svm = to_svm(vcpu); + u64 g_tsc_offset = 0; + + if (is_nested(svm)) { + g_tsc_offset = svm->vmcb->control.tsc_offset - + svm->nested.hsave->control.tsc_offset; + svm->nested.hsave->control.tsc_offset = offset; + } + + svm->vmcb->control.tsc_offset = offset + g_tsc_offset; +} + static void init_vmcb(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; @@ -901,7 +915,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; svm->asid_generation = 0; init_vmcb(svm); - svm->vmcb->control.tsc_offset = 0-native_read_tsc(); + svm_write_tsc_offset(&svm->vcpu, 0-native_read_tsc()); err = fx_init(&svm->vcpu); if (err) @@ -2566,20 +2580,9 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) struct vcpu_svm *svm = to_svm(vcpu); switch (ecx) { - case MSR_IA32_TSC: { - u64 tsc_offset = data - native_read_tsc(); - u64 g_tsc_offset = 0; - - if (is_nested(svm)) { - g_tsc_offset = svm->vmcb->control.tsc_offset - - svm->nested.hsave->control.tsc_offset; - svm->nested.hsave->control.tsc_offset = tsc_offset; - } - - svm->vmcb->control.tsc_offset = tsc_offset + g_tsc_offset; - + case MSR_IA32_TSC: + svm_write_tsc_offset(vcpu, data - native_read_tsc()); break; - } case MSR_STAR: svm->vmcb->save.star = data; break; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 4fbab2469bf..d9bec5ee38b 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1149,9 +1149,9 @@ static u64 guest_read_tsc(void) * writes 'guest_tsc' into guest's timestamp counter "register" * guest_tsc = host_tsc + tsc_offset ==> tsc_offset = guest_tsc - host_tsc */ -static void guest_write_tsc(u64 guest_tsc, u64 host_tsc) +static void vmx_write_tsc_offset(u64 offset) { - vmcs_write64(TSC_OFFSET, guest_tsc - host_tsc); + vmcs_write64(TSC_OFFSET, offset); } /* @@ -1255,7 +1255,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) break; case MSR_IA32_TSC: rdtscll(host_tsc); - guest_write_tsc(data, host_tsc); + vmx_write_tsc_offset(data - host_tsc); break; case MSR_IA32_CR_PAT: if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { @@ -2512,7 +2512,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) { u32 host_sysenter_cs, msr_low, msr_high; u32 junk; - u64 host_pat, tsc_this; + u64 host_pat; unsigned long a; struct desc_ptr dt; int i; @@ -2653,8 +2653,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE; vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits); - tsc_this = native_read_tsc(); - guest_write_tsc(0, tsc_this); + vmx_write_tsc_offset(0-native_read_tsc()); return 0; } -- cgit v1.2.3-18-g5258 From 99e3e30aee1a326a98bf3a5f47b8622219c685f3 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Thu, 19 Aug 2010 22:07:17 -1000 Subject: KVM: x86: Move TSC offset writes to common code Also, ensure that the storing of the offset and the reading of the TSC are never preempted by 
taking a spinlock. While the lock is overkill now, it is useful later in this patch series. Signed-off-by: Zachary Amsden Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 3 +++ arch/x86/kvm/svm.c | 6 ++++-- arch/x86/kvm/vmx.c | 13 ++++++------- arch/x86/kvm/x86.c | 18 ++++++++++++++++++ arch/x86/kvm/x86.h | 2 ++ 5 files changed, 33 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 6056a23dc4c..a215153f1ff 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -395,6 +395,7 @@ struct kvm_arch { unsigned long irq_sources_bitmap; s64 kvmclock_offset; + spinlock_t tsc_write_lock; struct kvm_xen_hvm_config xen_hvm_config; @@ -521,6 +522,8 @@ struct kvm_x86_ops { bool (*has_wbinvd_exit)(void); + void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); + const struct trace_print_flags *exit_reasons_str; }; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index e06f00d1f15..ea41c551fa4 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -915,7 +915,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; svm->asid_generation = 0; init_vmcb(svm); - svm_write_tsc_offset(&svm->vcpu, 0-native_read_tsc()); + kvm_write_tsc(&svm->vcpu, 0); err = fx_init(&svm->vcpu); if (err) @@ -2581,7 +2581,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) switch (ecx) { case MSR_IA32_TSC: - svm_write_tsc_offset(vcpu, data - native_read_tsc()); + kvm_write_tsc(vcpu, data); break; case MSR_STAR: svm->vmcb->save.star = data; @@ -3551,6 +3551,8 @@ static struct kvm_x86_ops svm_x86_ops = { .set_supported_cpuid = svm_set_supported_cpuid, .has_wbinvd_exit = svm_has_wbinvd_exit, + + .write_tsc_offset = svm_write_tsc_offset, }; static int __init svm_init(void) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d9bec5ee38b..138746d3afe 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1146,10 +1146,9 @@ static u64 guest_read_tsc(void) } /* - * writes 'guest_tsc' into guest's timestamp counter "register" - * guest_tsc = host_tsc + tsc_offset ==> tsc_offset = guest_tsc - host_tsc + * writes 'offset' into guest's timestamp counter offset register */ -static void vmx_write_tsc_offset(u64 offset) +static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) { vmcs_write64(TSC_OFFSET, offset); } @@ -1224,7 +1223,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct shared_msr_entry *msr; - u64 host_tsc; int ret = 0; switch (msr_index) { @@ -1254,8 +1252,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) vmcs_writel(GUEST_SYSENTER_ESP, data); break; case MSR_IA32_TSC: - rdtscll(host_tsc); - vmx_write_tsc_offset(data - host_tsc); + kvm_write_tsc(vcpu, data); break; case MSR_IA32_CR_PAT: if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { @@ -2653,7 +2650,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE; vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits); - vmx_write_tsc_offset(0-native_read_tsc()); + kvm_write_tsc(&vmx->vcpu, 0); return 0; } @@ -4348,6 +4345,8 @@ static struct kvm_x86_ops vmx_x86_ops = { .set_supported_cpuid = vmx_set_supported_cpuid, .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, + + .write_tsc_offset = vmx_write_tsc_offset, }; static int __init vmx_init(void) diff --git 
a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8b0c51a1ada..886132b6ef1 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -895,6 +895,22 @@ static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info * static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); +void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) +{ + struct kvm *kvm = vcpu->kvm; + u64 offset; + unsigned long flags; + + spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); + offset = data - native_read_tsc(); + kvm_x86_ops->write_tsc_offset(vcpu, offset); + spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); + + /* Reset of TSC must disable overshoot protection below */ + vcpu->arch.hv_clock.tsc_timestamp = 0; +} +EXPORT_SYMBOL_GPL(kvm_write_tsc); + static void kvm_write_guest_time(struct kvm_vcpu *v) { struct timespec ts; @@ -5495,6 +5511,8 @@ struct kvm *kvm_arch_create_vm(void) /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); + spin_lock_init(&kvm->arch.tsc_write_lock); + return kvm; } diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index b7a404722d2..2d6385e44cc 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -68,4 +68,6 @@ static inline int is_paging(struct kvm_vcpu *vcpu) void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); +void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data); + #endif -- cgit v1.2.3-18-g5258 From f38e098ff3a315bb74abbb4a35cba11bbea8e2fa Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Thu, 19 Aug 2010 22:07:20 -1000 Subject: KVM: x86: TSC reset compensation Attempt to synchronize TSCs which are reset to the same value. In the case of a reliable hardware TSC, we can just re-use the same offset, but on non-reliable hardware, we can get closer by adjusting the offset to match the elapsed time. Signed-off-by: Zachary Amsden Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 3 +++ arch/x86/kvm/x86.c | 31 ++++++++++++++++++++++++++++++- 2 files changed, 33 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index a215153f1ff..57b4394491e 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -396,6 +396,9 @@ struct kvm_arch { unsigned long irq_sources_bitmap; s64 kvmclock_offset; spinlock_t tsc_write_lock; + u64 last_tsc_nsec; + u64 last_tsc_offset; + u64 last_tsc_write; struct kvm_xen_hvm_config xen_hvm_config; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 886132b6ef1..e7da14c317e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -898,11 +898,40 @@ static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) { struct kvm *kvm = vcpu->kvm; - u64 offset; + u64 offset, ns, elapsed; unsigned long flags; + struct timespec ts; spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); offset = data - native_read_tsc(); + ktime_get_ts(&ts); + monotonic_to_bootbased(&ts); + ns = timespec_to_ns(&ts); + elapsed = ns - kvm->arch.last_tsc_nsec; + + /* + * Special case: identical write to TSC within 5 seconds of + * another CPU is interpreted as an attempt to synchronize + * (the 5 seconds is to accomodate host load / swapping). + * + * In that case, for a reliable TSC, we can match TSC offsets, + * or make a best guest using kernel_ns value. 
+ */ + if (data == kvm->arch.last_tsc_write && elapsed < 5ULL * NSEC_PER_SEC) { + if (!check_tsc_unstable()) { + offset = kvm->arch.last_tsc_offset; + pr_debug("kvm: matched tsc offset for %llu\n", data); + } else { + u64 tsc_delta = elapsed * __get_cpu_var(cpu_tsc_khz); + tsc_delta = tsc_delta / USEC_PER_SEC; + offset += tsc_delta; + pr_debug("kvm: adjusted tsc offset by %llu\n", tsc_delta); + } + ns = kvm->arch.last_tsc_nsec; + } + kvm->arch.last_tsc_nsec = ns; + kvm->arch.last_tsc_write = data; + kvm->arch.last_tsc_offset = offset; kvm_x86_ops->write_tsc_offset(vcpu, offset); spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); -- cgit v1.2.3-18-g5258 From 8cfdc0008542b57caadbfe013da163131a8293f4 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Thu, 19 Aug 2010 22:07:21 -1000 Subject: KVM: x86: Make cpu_tsc_khz updates use local CPU This simplifies much of the init code; we can now simply always call tsc_khz_changed, optionally passing it a new value, or letting it figure out the existing value (while interrupts are disabled, and thus, by inference from the rule, not raceful against CPU hotplug or frequency updates, which will issue IPIs to the local CPU to perform this very same task). Signed-off-by: Zachary Amsden Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 157 ++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 114 insertions(+), 43 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e7da14c317e..699c6b89c1b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -895,6 +895,15 @@ static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info * static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); +static inline int kvm_tsc_changes_freq(void) +{ + int cpu = get_cpu(); + int ret = !boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && + cpufreq_quick_get(cpu) != 0; + put_cpu(); + return ret; +} + void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) { struct kvm *kvm = vcpu->kvm; @@ -940,7 +949,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) } EXPORT_SYMBOL_GPL(kvm_write_tsc); -static void kvm_write_guest_time(struct kvm_vcpu *v) +static int kvm_write_guest_time(struct kvm_vcpu *v) { struct timespec ts; unsigned long flags; @@ -949,24 +958,27 @@ static void kvm_write_guest_time(struct kvm_vcpu *v) unsigned long this_tsc_khz; if ((!vcpu->time_page)) - return; - - this_tsc_khz = get_cpu_var(cpu_tsc_khz); - if (unlikely(vcpu->hv_clock_tsc_khz != this_tsc_khz)) { - kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock); - vcpu->hv_clock_tsc_khz = this_tsc_khz; - } - put_cpu_var(cpu_tsc_khz); + return 0; /* Keep irq disabled to prevent changes to the clock */ local_irq_save(flags); kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp); ktime_get_ts(&ts); monotonic_to_bootbased(&ts); + this_tsc_khz = __get_cpu_var(cpu_tsc_khz); local_irq_restore(flags); - /* With all the info we got, fill in the values */ + if (unlikely(this_tsc_khz == 0)) { + kvm_make_request(KVM_REQ_KVMCLOCK_UPDATE, v); + return 1; + } + if (unlikely(vcpu->hv_clock_tsc_khz != this_tsc_khz)) { + kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock); + vcpu->hv_clock_tsc_khz = this_tsc_khz; + } + + /* With all the info we got, fill in the values */ vcpu->hv_clock.system_time = ts.tv_nsec + (NSEC_PER_SEC * (u64)ts.tv_sec) + v->kvm->arch.kvmclock_offset; @@ -987,6 +999,7 @@ static void kvm_write_guest_time(struct kvm_vcpu *v) kunmap_atomic(shared_kaddr, KM_USER0); mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT); + return 0; } static 
int kvm_request_guest_time_update(struct kvm_vcpu *v) @@ -1853,12 +1866,6 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) } kvm_x86_ops->vcpu_load(vcpu, cpu); - if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) { - unsigned long khz = cpufreq_quick_get(cpu); - if (!khz) - khz = tsc_khz; - per_cpu(cpu_tsc_khz, cpu) = khz; - } kvm_request_guest_time_update(vcpu); } @@ -4152,9 +4159,23 @@ int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port) } EXPORT_SYMBOL_GPL(kvm_fast_pio_out); -static void bounce_off(void *info) +static void tsc_bad(void *info) +{ + __get_cpu_var(cpu_tsc_khz) = 0; +} + +static void tsc_khz_changed(void *data) { - /* nothing */ + struct cpufreq_freqs *freq = data; + unsigned long khz = 0; + + if (data) + khz = freq->new; + else if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) + khz = cpufreq_quick_get(raw_smp_processor_id()); + if (!khz) + khz = tsc_khz; + __get_cpu_var(cpu_tsc_khz) = khz; } static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val, @@ -4165,11 +4186,51 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va struct kvm_vcpu *vcpu; int i, send_ipi = 0; + /* + * We allow guests to temporarily run on slowing clocks, + * provided we notify them after, or to run on accelerating + * clocks, provided we notify them before. Thus time never + * goes backwards. + * + * However, we have a problem. We can't atomically update + * the frequency of a given CPU from this function; it is + * merely a notifier, which can be called from any CPU. + * Changing the TSC frequency at arbitrary points in time + * requires a recomputation of local variables related to + * the TSC for each VCPU. We must flag these local variables + * to be updated and be sure the update takes place with the + * new frequency before any guests proceed. + * + * Unfortunately, the combination of hotplug CPU and frequency + * change creates an intractable locking scenario; the order + * of when these callouts happen is undefined with respect to + * CPU hotplug, and they can race with each other. As such, + * merely setting per_cpu(cpu_tsc_khz) = X during a hotadd is + * undefined; you can actually have a CPU frequency change take + * place in between the computation of X and the setting of the + * variable. To protect against this problem, all updates of + * the per_cpu tsc_khz variable are done in an interrupt + * protected IPI, and all callers wishing to update the value + * must wait for a synchronous IPI to complete (which is trivial + * if the caller is on the CPU already). This establishes the + * necessary total order on variable updates. + * + * Note that because a guest time update may take place + * anytime after the setting of the VCPU's request bit, the + * correct TSC value must be set before the request. However, + * to ensure the update actually makes it to any guest which + * starts running in hardware virtualization between the set + * and the acquisition of the spinlock, we must also ping the + * CPU after setting the request bit. 
+ * + */ + if (val == CPUFREQ_PRECHANGE && freq->old > freq->new) return 0; if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new) return 0; - per_cpu(cpu_tsc_khz, freq->cpu) = freq->new; + + smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1); spin_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { @@ -4179,7 +4240,7 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va if (!kvm_request_guest_time_update(vcpu)) continue; if (vcpu->cpu != smp_processor_id()) - send_ipi++; + send_ipi = 1; } } spin_unlock(&kvm_lock); @@ -4197,32 +4258,48 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va * guest context is entered kvmclock will be updated, * so the guest will not see stale values. */ - smp_call_function_single(freq->cpu, bounce_off, NULL, 1); + smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1); } return 0; } static struct notifier_block kvmclock_cpufreq_notifier_block = { - .notifier_call = kvmclock_cpufreq_notifier + .notifier_call = kvmclock_cpufreq_notifier +}; + +static int kvmclock_cpu_notifier(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + + switch (action) { + case CPU_ONLINE: + case CPU_DOWN_FAILED: + smp_call_function_single(cpu, tsc_khz_changed, NULL, 1); + break; + case CPU_DOWN_PREPARE: + smp_call_function_single(cpu, tsc_bad, NULL, 1); + break; + } + return NOTIFY_OK; +} + +static struct notifier_block kvmclock_cpu_notifier_block = { + .notifier_call = kvmclock_cpu_notifier, + .priority = -INT_MAX }; static void kvm_timer_init(void) { int cpu; + register_hotcpu_notifier(&kvmclock_cpu_notifier_block); if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block, CPUFREQ_TRANSITION_NOTIFIER); - for_each_online_cpu(cpu) { - unsigned long khz = cpufreq_get(cpu); - if (!khz) - khz = tsc_khz; - per_cpu(cpu_tsc_khz, cpu) = khz; - } - } else { - for_each_possible_cpu(cpu) - per_cpu(cpu_tsc_khz, cpu) = tsc_khz; } + for_each_online_cpu(cpu) + smp_call_function_single(cpu, tsc_khz_changed, NULL, 1); } static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu); @@ -4324,6 +4401,7 @@ void kvm_arch_exit(void) if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block, CPUFREQ_TRANSITION_NOTIFIER); + unregister_hotcpu_notifier(&kvmclock_cpu_notifier_block); kvm_x86_ops = NULL; kvm_mmu_module_exit(); } @@ -4739,8 +4817,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_mmu_unload(vcpu); if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) __kvm_migrate_timers(vcpu); - if (kvm_check_request(KVM_REQ_KVMCLOCK_UPDATE, vcpu)) - kvm_write_guest_time(vcpu); + if (kvm_check_request(KVM_REQ_KVMCLOCK_UPDATE, vcpu)) { + r = kvm_write_guest_time(vcpu); + if (unlikely(r)) + goto out; + } if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu)) kvm_mmu_sync_roots(vcpu); if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) @@ -5423,17 +5504,7 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) int kvm_arch_hardware_enable(void *garbage) { - /* - * Since this may be called from a hotplug notifcation, - * we can't get the CPU frequency directly. 
- */ - if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { - int cpu = raw_smp_processor_id(); - per_cpu(cpu_tsc_khz, cpu) = 0; - } - kvm_shared_msr_cpu_online(); - return kvm_x86_ops->hardware_enable(garbage); } -- cgit v1.2.3-18-g5258 From 6755bae8e69093b2994b6f29cd3eaecdf610374e Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Thu, 19 Aug 2010 22:07:22 -1000 Subject: KVM: x86: Warn about unstable TSC If creating an SMP guest with unstable host TSC, issue a warning Signed-off-by: Zachary Amsden Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 699c6b89c1b..a8dee58e871 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5457,6 +5457,10 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) { + if (check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0) + printk_once(KERN_WARNING + "kvm: SMP vm created on host with unstable TSC; " + "guest TSC will not be reliable\n"); return kvm_x86_ops->vcpu_create(kvm, id); } -- cgit v1.2.3-18-g5258 From e48672fa25e879f7ae21785c7efd187738139593 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Thu, 19 Aug 2010 22:07:23 -1000 Subject: KVM: x86: Unify TSC logic Move the TSC control logic from the vendor backends into x86.c by adding adjust_tsc_offset to x86 ops. Now all TSC decisions can be done in one place. Signed-off-by: Zachary Amsden Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 5 +++-- arch/x86/kvm/svm.c | 26 ++++++++++---------------- arch/x86/kvm/vmx.c | 22 ++++++++-------------- arch/x86/kvm/x86.c | 17 ++++++++++++++--- 4 files changed, 35 insertions(+), 35 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 57b4394491e..5ab1c3fb34e 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -255,7 +255,6 @@ struct kvm_mmu { }; struct kvm_vcpu_arch { - u64 host_tsc; /* * rip and regs accesses must go through * kvm_{register,rip}_{read,write} functions. 
@@ -336,9 +335,10 @@ struct kvm_vcpu_arch { gpa_t time; struct pvclock_vcpu_time_info hv_clock; - unsigned int hv_clock_tsc_khz; + unsigned int hw_tsc_khz; unsigned int time_offset; struct page *time_page; + u64 last_host_tsc; bool nmi_pending; bool nmi_injected; @@ -520,6 +520,7 @@ struct kvm_x86_ops { u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); int (*get_lpage_level)(void); bool (*rdtscp_supported)(void); + void (*adjust_tsc_offset)(struct kvm_vcpu *vcpu, s64 adjustment); void (*set_supported_cpuid)(u32 func, struct kvm_cpuid_entry2 *entry); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index ea41c551fa4..ff28f652106 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -715,6 +715,15 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) svm->vmcb->control.tsc_offset = offset + g_tsc_offset; } +static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + svm->vmcb->control.tsc_offset += adjustment; + if (is_nested(svm)) + svm->nested.hsave->control.tsc_offset += adjustment; +} + static void init_vmcb(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; @@ -961,20 +970,6 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) int i; if (unlikely(cpu != vcpu->cpu)) { - u64 delta; - - if (check_tsc_unstable()) { - /* - * Make sure that the guest sees a monotonically - * increasing TSC. - */ - delta = vcpu->arch.host_tsc - native_read_tsc(); - svm->vmcb->control.tsc_offset += delta; - if (is_nested(svm)) - svm->nested.hsave->control.tsc_offset += delta; - } - vcpu->cpu = cpu; - kvm_migrate_timers(vcpu); svm->asid_generation = 0; } @@ -990,8 +985,6 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu) ++vcpu->stat.host_state_reload; for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); - - vcpu->arch.host_tsc = native_read_tsc(); } static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) @@ -3553,6 +3546,7 @@ static struct kvm_x86_ops svm_x86_ops = { .has_wbinvd_exit = svm_has_wbinvd_exit, .write_tsc_offset = svm_write_tsc_offset, + .adjust_tsc_offset = svm_adjust_tsc_offset, }; static int __init svm_init(void) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 138746d3afe..275a81d571c 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -505,7 +505,6 @@ static void __vcpu_clear(void *arg) vmcs_clear(vmx->vmcs); if (per_cpu(current_vmcs, cpu) == vmx->vmcs) per_cpu(current_vmcs, cpu) = NULL; - rdtscll(vmx->vcpu.arch.host_tsc); list_del(&vmx->local_vcpus_link); vmx->vcpu.cpu = -1; vmx->launched = 0; @@ -881,7 +880,6 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx) static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - u64 tsc_this, delta, new_offset; u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); if (!vmm_exclusive) @@ -898,14 +896,12 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) struct desc_ptr *gdt = &__get_cpu_var(host_gdt); unsigned long sysenter_esp; - kvm_migrate_timers(vcpu); kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); local_irq_disable(); list_add(&vmx->local_vcpus_link, &per_cpu(vcpus_on_cpu, cpu)); local_irq_enable(); - vcpu->cpu = cpu; /* * Linux uses per-cpu TSS and GDT, so set these when switching * processors. 
@@ -915,16 +911,6 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ - - /* - * Make sure the time stamp counter is monotonous. - */ - rdtscll(tsc_this); - if (tsc_this < vcpu->arch.host_tsc) { - delta = vcpu->arch.host_tsc - tsc_this; - new_offset = vmcs_read64(TSC_OFFSET) + delta; - vmcs_write64(TSC_OFFSET, new_offset); - } } } @@ -1153,6 +1139,12 @@ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) vmcs_write64(TSC_OFFSET, offset); } +static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment) +{ + u64 offset = vmcs_read64(TSC_OFFSET); + vmcs_write64(TSC_OFFSET, offset + adjustment); +} + /* * Reads an msr value (of 'msr_index') into 'pdata'. * Returns 0 on success, non-0 otherwise. @@ -4108,6 +4100,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) cpu = get_cpu(); vmx_vcpu_load(&vmx->vcpu, cpu); + vmx->vcpu.cpu = cpu; err = vmx_vcpu_setup(vmx); vmx_vcpu_put(&vmx->vcpu); put_cpu(); @@ -4347,6 +4340,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, .write_tsc_offset = vmx_write_tsc_offset, + .adjust_tsc_offset = vmx_adjust_tsc_offset, }; static int __init vmx_init(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a8dee58e871..468fafaed1a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -973,9 +973,9 @@ static int kvm_write_guest_time(struct kvm_vcpu *v) return 1; } - if (unlikely(vcpu->hv_clock_tsc_khz != this_tsc_khz)) { + if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) { kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock); - vcpu->hv_clock_tsc_khz = this_tsc_khz; + vcpu->hw_tsc_khz = this_tsc_khz; } /* With all the info we got, fill in the values */ @@ -1866,13 +1866,24 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) } kvm_x86_ops->vcpu_load(vcpu, cpu); - kvm_request_guest_time_update(vcpu); + if (unlikely(vcpu->cpu != cpu)) { + /* Make sure TSC doesn't go backwards */ + s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 : + native_read_tsc() - vcpu->arch.last_host_tsc; + if (tsc_delta < 0) + mark_tsc_unstable("KVM discovered backwards TSC"); + if (check_tsc_unstable()) + kvm_x86_ops->adjust_tsc_offset(vcpu, -tsc_delta); + kvm_migrate_timers(vcpu); + vcpu->cpu = cpu; + } } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { kvm_x86_ops->vcpu_put(vcpu); kvm_put_guest_fpu(vcpu); + vcpu->arch.last_host_tsc = native_read_tsc(); } static int is_efer_nx(void) -- cgit v1.2.3-18-g5258 From 48434c20e18d59001469699fcaaf9cf30b815a20 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Thu, 19 Aug 2010 22:07:24 -1000 Subject: KVM: x86: Fix deep C-state TSC desynchronization When CPUs with unstable TSCs enter deep C-state, TSC may stop running. This causes us to require resynchronization. Since we can't tell when this may potentially happen, we assume the worst by forcing re-compensation for it at every point the VCPU task is descheduled. 
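Condensed, the kvm_arch_vcpu_load() path now looks roughly like this (taken from the preceding "Unify TSC logic" hunks plus the one-line condition change below), so the compensation runs on every schedule-in when the TSC is unstable rather than only on an actual CPU migration:

	if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) {
		/* Make sure TSC doesn't go backwards */
		s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
				native_read_tsc() - vcpu->arch.last_host_tsc;

		if (tsc_delta < 0)
			mark_tsc_unstable("KVM discovered backwards TSC");
		if (check_tsc_unstable())
			kvm_x86_ops->adjust_tsc_offset(vcpu, -tsc_delta);
		kvm_migrate_timers(vcpu);
		vcpu->cpu = cpu;
	}
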
Signed-off-by: Zachary Amsden Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 468fafaed1a..9396b3f2c59 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1866,7 +1866,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) } kvm_x86_ops->vcpu_load(vcpu, cpu); - if (unlikely(vcpu->cpu != cpu)) { + if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) { /* Make sure TSC doesn't go backwards */ s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 : native_read_tsc() - vcpu->arch.last_host_tsc; -- cgit v1.2.3-18-g5258 From 759379dd68c2885d1fafa433083d4487e710a685 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Thu, 19 Aug 2010 22:07:25 -1000 Subject: KVM: x86: Add helper functions for time computation Add a helper function to compute the kernel time and convert nanoseconds back to CPU specific cycles. Note that these must not be called in preemptible context, as that would mean the kernel could enter software suspend state, which would cause non-atomic operation. Also, convert the KVM_SET_CLOCK / KVM_GET_CLOCK ioctls to use the kernel time helper, these should be bootbased as well. Signed-off-by: Zachary Amsden Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 48 ++++++++++++++++++++++++++++-------------------- 1 file changed, 28 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9396b3f2c59..4bcb120cc76 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -893,6 +893,16 @@ static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info * hv_clock->tsc_to_system_mul); } +static inline u64 get_kernel_ns(void) +{ + struct timespec ts; + + WARN_ON(preemptible()); + ktime_get_ts(&ts); + monotonic_to_bootbased(&ts); + return timespec_to_ns(&ts); +} + static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); static inline int kvm_tsc_changes_freq(void) @@ -904,18 +914,24 @@ static inline int kvm_tsc_changes_freq(void) return ret; } +static inline u64 nsec_to_cycles(u64 nsec) +{ + WARN_ON(preemptible()); + if (kvm_tsc_changes_freq()) + printk_once(KERN_WARNING + "kvm: unreliable cycle conversion on adjustable rate TSC\n"); + return (nsec * __get_cpu_var(cpu_tsc_khz)) / USEC_PER_SEC; +} + void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) { struct kvm *kvm = vcpu->kvm; u64 offset, ns, elapsed; unsigned long flags; - struct timespec ts; spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); offset = data - native_read_tsc(); - ktime_get_ts(&ts); - monotonic_to_bootbased(&ts); - ns = timespec_to_ns(&ts); + ns = get_kernel_ns(); elapsed = ns - kvm->arch.last_tsc_nsec; /* @@ -931,10 +947,9 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) offset = kvm->arch.last_tsc_offset; pr_debug("kvm: matched tsc offset for %llu\n", data); } else { - u64 tsc_delta = elapsed * __get_cpu_var(cpu_tsc_khz); - tsc_delta = tsc_delta / USEC_PER_SEC; - offset += tsc_delta; - pr_debug("kvm: adjusted tsc offset by %llu\n", tsc_delta); + u64 delta = nsec_to_cycles(elapsed); + offset += delta; + pr_debug("kvm: adjusted tsc offset by %llu\n", delta); } ns = kvm->arch.last_tsc_nsec; } @@ -951,11 +966,11 @@ EXPORT_SYMBOL_GPL(kvm_write_tsc); static int kvm_write_guest_time(struct kvm_vcpu *v) { - struct timespec ts; unsigned long flags; struct kvm_vcpu_arch *vcpu = &v->arch; void *shared_kaddr; unsigned long this_tsc_khz; + s64 kernel_ns; if ((!vcpu->time_page)) return 0; @@ -963,8 +978,7 @@ 
static int kvm_write_guest_time(struct kvm_vcpu *v) /* Keep irq disabled to prevent changes to the clock */ local_irq_save(flags); kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp); - ktime_get_ts(&ts); - monotonic_to_bootbased(&ts); + kernel_ns = get_kernel_ns(); this_tsc_khz = __get_cpu_var(cpu_tsc_khz); local_irq_restore(flags); @@ -979,9 +993,7 @@ static int kvm_write_guest_time(struct kvm_vcpu *v) } /* With all the info we got, fill in the values */ - vcpu->hv_clock.system_time = ts.tv_nsec + - (NSEC_PER_SEC * (u64)ts.tv_sec) + v->kvm->arch.kvmclock_offset; - + vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; vcpu->hv_clock.flags = 0; /* @@ -3263,7 +3275,6 @@ long kvm_arch_vm_ioctl(struct file *filp, break; } case KVM_SET_CLOCK: { - struct timespec now; struct kvm_clock_data user_ns; u64 now_ns; s64 delta; @@ -3277,19 +3288,16 @@ long kvm_arch_vm_ioctl(struct file *filp, goto out; r = 0; - ktime_get_ts(&now); - now_ns = timespec_to_ns(&now); + now_ns = get_kernel_ns(); delta = user_ns.clock - now_ns; kvm->arch.kvmclock_offset = delta; break; } case KVM_GET_CLOCK: { - struct timespec now; struct kvm_clock_data user_ns; u64 now_ns; - ktime_get_ts(&now); - now_ns = timespec_to_ns(&now); + now_ns = get_kernel_ns(); user_ns.clock = kvm->arch.kvmclock_offset + now_ns; user_ns.flags = 0; -- cgit v1.2.3-18-g5258 From 46543ba45fc4b64ca32655efdc8d9c599b4164e2 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Thu, 19 Aug 2010 22:07:26 -1000 Subject: KVM: x86: Robust TSC compensation Make the match of TSC find TSC writes that are close to each other instead of perfectly identical; this allows the compensator to also work in migration / suspend scenarios. Signed-off-by: Zachary Amsden Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4bcb120cc76..4ff0c271f12 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -928,21 +928,27 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) struct kvm *kvm = vcpu->kvm; u64 offset, ns, elapsed; unsigned long flags; + s64 sdiff; spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); offset = data - native_read_tsc(); ns = get_kernel_ns(); elapsed = ns - kvm->arch.last_tsc_nsec; + sdiff = data - kvm->arch.last_tsc_write; + if (sdiff < 0) + sdiff = -sdiff; /* - * Special case: identical write to TSC within 5 seconds of + * Special case: close write to TSC within 5 seconds of * another CPU is interpreted as an attempt to synchronize - * (the 5 seconds is to accomodate host load / swapping). + * The 5 seconds is to accomodate host load / swapping as + * well as any reset of TSC during the boot process. * * In that case, for a reliable TSC, we can match TSC offsets, - * or make a best guest using kernel_ns value. + * or make a best guest using elapsed value. 
*/ - if (data == kvm->arch.last_tsc_write && elapsed < 5ULL * NSEC_PER_SEC) { + if (sdiff < nsec_to_cycles(5ULL * NSEC_PER_SEC) && + elapsed < 5ULL * NSEC_PER_SEC) { if (!check_tsc_unstable()) { offset = kvm->arch.last_tsc_offset; pr_debug("kvm: matched tsc offset for %llu\n", data); -- cgit v1.2.3-18-g5258 From ca84d1a24c376e0841f35db08dab7b829c8c0b1e Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Thu, 19 Aug 2010 22:07:28 -1000 Subject: KVM: x86: Add clock sync request to hardware enable If there are active VCPUs which are marked as belonging to a particular hardware CPU, request a clock sync for them when enabling hardware; the TSC could be desynchronized on a newly arriving CPU, and we need to recompute guests system time relative to boot after a suspend event. This covers both cases. Note that it is acceptable to take the spinlock, as either no other tasks will be running and no locks held (BSP after resume), or other tasks will be guaranteed to drop the lock relatively quickly (AP on CPU_STARTING). Noting we now get clock synchronization requests for VCPUs which are starting up (or restarting), it is tempting to attempt to remove the arch/x86/kvm/x86.c CPU hot-notifiers at this time, however it is not correct to do so; they are required for systems with non-constant TSC as the frequency may not be known immediately after the processor has started until the cpufreq driver has had a chance to run and query the chipset. Updated: implement better locking semantics for hardware_enable Removed the hack of dropping and retaking the lock by adding the semantic that we always hold kvm_lock when hardware_enable is called. The one place that doesn't need to worry about it is resume, as resuming a frozen CPU, the spinlock won't be taken. Signed-off-by: Zachary Amsden Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4ff0c271f12..d0764a25804 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5533,7 +5533,15 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) int kvm_arch_hardware_enable(void *garbage) { + struct kvm *kvm; + struct kvm_vcpu *vcpu; + int i; + kvm_shared_msr_cpu_online(); + list_for_each_entry(kvm, &vm_list, vm_list) + kvm_for_each_vcpu(i, vcpu, kvm) + if (vcpu->cpu == smp_processor_id()) + kvm_request_guest_time_update(vcpu); return kvm_x86_ops->hardware_enable(garbage); } -- cgit v1.2.3-18-g5258 From 347bb4448c2155eb2310923ccaa4be5677649003 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Thu, 19 Aug 2010 22:07:29 -1000 Subject: x86: pvclock: Move scale_delta into common header The scale_delta function for shift / multiply with 31-bit precision moves to a common header so it can be used by both kernel and kvm module. Signed-off-by: Zachary Amsden Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/pvclock.h | 38 ++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/pvclock.c | 3 ++- 2 files changed, 40 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h index cd02f324aa6..7f7e577a0e3 100644 --- a/arch/x86/include/asm/pvclock.h +++ b/arch/x86/include/asm/pvclock.h @@ -12,4 +12,42 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall, struct pvclock_vcpu_time_info *vcpu, struct timespec *ts); +/* + * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, + * yielding a 64-bit result. 
+ */ +static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift) +{ + u64 product; +#ifdef __i386__ + u32 tmp1, tmp2; +#endif + + if (shift < 0) + delta >>= -shift; + else + delta <<= shift; + +#ifdef __i386__ + __asm__ ( + "mul %5 ; " + "mov %4,%%eax ; " + "mov %%edx,%4 ; " + "mul %5 ; " + "xor %5,%5 ; " + "add %4,%%eax ; " + "adc %5,%%edx ; " + : "=A" (product), "=r" (tmp1), "=r" (tmp2) + : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) ); +#elif defined(__x86_64__) + __asm__ ( + "mul %%rdx ; shrd $32,%%rdx,%%rax" + : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) ); +#else +#error implement me! +#endif + + return product; +} + #endif /* _ASM_X86_PVCLOCK_H */ diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 239427ca02a..bab3b9e6f66 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -82,7 +82,8 @@ static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift) static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow) { u64 delta = native_read_tsc() - shadow->tsc_timestamp; - return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift); + return pvclock_scale_delta(delta, shadow->tsc_to_nsec_mul, + shadow->tsc_shift); } /* -- cgit v1.2.3-18-g5258 From 1d5f066e0b63271b67eac6d3752f8aa96adcbddb Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Thu, 19 Aug 2010 22:07:30 -1000 Subject: KVM: x86: Fix a possible backwards warp of kvmclock Kernel time, which advances in discrete steps may progress much slower than TSC. As a result, when kvmclock is adjusted to a new base, the apparent time to the guest, which runs at a much higher, nsec scaled rate based on the current TSC, may have already been observed to have a larger value (kernel_ns + scaled tsc) than the value to which we are setting it (kernel_ns + 0). We must instead compute the clock as potentially observed by the guest for kernel_ns to make sure it does not go backwards. 
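Concretely, the fix clamps the newly published base time to the largest value the guest could already have computed from the old parameters; a condensed, illustrative sketch of the logic added in the kvm_write_guest_time() hunk below (not a verbatim copy of the kernel source):

    s64 max_kernel_ns = 0;
    if (vcpu->hv_clock.tsc_timestamp && vcpu->last_guest_tsc) {
            /*
             * Time the guest could have observed at its last TSC read:
             * old base + scaled (last_guest_tsc - old tsc_timestamp).
             */
            max_kernel_ns = pvclock_scale_delta(
                            vcpu->last_guest_tsc - vcpu->hv_clock.tsc_timestamp,
                            vcpu->hv_clock.tsc_to_system_mul,
                            vcpu->hv_clock.tsc_shift) +
                            vcpu->last_kernel_ns;
    }
    /* Never let the published system_time base fall below that bound. */
    if (max_kernel_ns > kernel_ns)
            kernel_ns = max_kernel_ns;
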
Signed-off-by: Zachary Amsden Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/x86.c | 44 +++++++++++++++++++++++++++++++++++++++-- 2 files changed, 44 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 5ab1c3fb34e..789e9462668 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -339,6 +339,8 @@ struct kvm_vcpu_arch { unsigned int time_offset; struct page *time_page; u64 last_host_tsc; + u64 last_guest_tsc; + u64 last_kernel_ns; bool nmi_pending; bool nmi_injected; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d0764a25804..d4d33f943d9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -55,6 +55,7 @@ #include #include #include +#include #define MAX_IO_MSRS 256 #define CR0_RESERVED_BITS \ @@ -976,14 +977,15 @@ static int kvm_write_guest_time(struct kvm_vcpu *v) struct kvm_vcpu_arch *vcpu = &v->arch; void *shared_kaddr; unsigned long this_tsc_khz; - s64 kernel_ns; + s64 kernel_ns, max_kernel_ns; + u64 tsc_timestamp; if ((!vcpu->time_page)) return 0; /* Keep irq disabled to prevent changes to the clock */ local_irq_save(flags); - kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp); + kvm_get_msr(v, MSR_IA32_TSC, &tsc_timestamp); kernel_ns = get_kernel_ns(); this_tsc_khz = __get_cpu_var(cpu_tsc_khz); local_irq_restore(flags); @@ -993,13 +995,49 @@ static int kvm_write_guest_time(struct kvm_vcpu *v) return 1; } + /* + * Time as measured by the TSC may go backwards when resetting the base + * tsc_timestamp. The reason for this is that the TSC resolution is + * higher than the resolution of the other clock scales. Thus, many + * possible measurments of the TSC correspond to one measurement of any + * other clock, and so a spread of values is possible. This is not a + * problem for the computation of the nanosecond clock; with TSC rates + * around 1GHZ, there can only be a few cycles which correspond to one + * nanosecond value, and any path through this code will inevitably + * take longer than that. However, with the kernel_ns value itself, + * the precision may be much lower, down to HZ granularity. If the + * first sampling of TSC against kernel_ns ends in the low part of the + * range, and the second in the high end of the range, we can get: + * + * (TSC - offset_low) * S + kns_old > (TSC - offset_high) * S + kns_new + * + * As the sampling errors potentially range in the thousands of cycles, + * it is possible such a time value has already been observed by the + * guest. To protect against this, we must compute the system time as + * observed by the guest and ensure the new system time is greater. 
+ */ + max_kernel_ns = 0; + if (vcpu->hv_clock.tsc_timestamp && vcpu->last_guest_tsc) { + max_kernel_ns = vcpu->last_guest_tsc - + vcpu->hv_clock.tsc_timestamp; + max_kernel_ns = pvclock_scale_delta(max_kernel_ns, + vcpu->hv_clock.tsc_to_system_mul, + vcpu->hv_clock.tsc_shift); + max_kernel_ns += vcpu->last_kernel_ns; + } + if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) { kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock); vcpu->hw_tsc_khz = this_tsc_khz; } + if (max_kernel_ns > kernel_ns) + kernel_ns = max_kernel_ns; + /* With all the info we got, fill in the values */ + vcpu->hv_clock.tsc_timestamp = tsc_timestamp; vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; + vcpu->last_kernel_ns = kernel_ns; vcpu->hv_clock.flags = 0; /* @@ -4931,6 +4969,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (hw_breakpoint_active()) hw_breakpoint_restore(); + kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc); + atomic_set(&vcpu->guest_mode, 0); smp_wmb(); local_irq_enable(); -- cgit v1.2.3-18-g5258 From 957ed9effd80b04482cbdce8c95bdf803a656b94 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Sun, 22 Aug 2010 19:12:48 +0800 Subject: KVM: MMU: prefetch ptes when intercepted guest #PF Support prefetch ptes when intercept guest #PF, avoid to #PF by later access If we meet any failure in the prefetch path, we will exit it and not try other ptes to avoid become heavy path Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 104 ++++++++++++++++++++++++++++++++++++++++++++- arch/x86/kvm/paging_tmpl.h | 72 +++++++++++++++++++++++++++++++ 2 files changed, 175 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 54a50268ceb..b0037a77e56 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -89,6 +89,8 @@ module_param(oos_shadow, bool, 0644); } #endif +#define PTE_PREFETCH_NUM 8 + #define PT_FIRST_AVAIL_BITS_SHIFT 9 #define PT64_SECOND_AVAIL_BITS_SHIFT 52 @@ -400,7 +402,7 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu) if (r) goto out; r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, - rmap_desc_cache, 4); + rmap_desc_cache, 4 + PTE_PREFETCH_NUM); if (r) goto out; r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8); @@ -2089,6 +2091,105 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) { } +static struct kvm_memory_slot * +pte_prefetch_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log) +{ + struct kvm_memory_slot *slot; + + slot = gfn_to_memslot(vcpu->kvm, gfn); + if (!slot || slot->flags & KVM_MEMSLOT_INVALID || + (no_dirty_log && slot->dirty_bitmap)) + slot = NULL; + + return slot; +} + +static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, + bool no_dirty_log) +{ + struct kvm_memory_slot *slot; + unsigned long hva; + + slot = pte_prefetch_gfn_to_memslot(vcpu, gfn, no_dirty_log); + if (!slot) { + get_page(bad_page); + return page_to_pfn(bad_page); + } + + hva = gfn_to_hva_memslot(slot, gfn); + + return hva_to_pfn_atomic(vcpu->kvm, hva); +} + +static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp, + u64 *start, u64 *end) +{ + struct page *pages[PTE_PREFETCH_NUM]; + unsigned access = sp->role.access; + int i, ret; + gfn_t gfn; + + gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt); + if (!pte_prefetch_gfn_to_memslot(vcpu, gfn, access & ACC_WRITE_MASK)) + return -1; + + ret = gfn_to_page_many_atomic(vcpu->kvm, gfn, pages, end - start); + if (ret <= 0) + return -1; + + for (i 
= 0; i < ret; i++, gfn++, start++) + mmu_set_spte(vcpu, start, ACC_ALL, + access, 0, 0, 1, NULL, + sp->role.level, gfn, + page_to_pfn(pages[i]), true, true); + + return 0; +} + +static void __direct_pte_prefetch(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp, u64 *sptep) +{ + u64 *spte, *start = NULL; + int i; + + WARN_ON(!sp->role.direct); + + i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1); + spte = sp->spt + i; + + for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) { + if (*spte != shadow_trap_nonpresent_pte || spte == sptep) { + if (!start) + continue; + if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0) + break; + start = NULL; + } else if (!start) + start = spte; + } +} + +static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) +{ + struct kvm_mmu_page *sp; + + /* + * Since it's no accessed bit on EPT, it's no way to + * distinguish between actually accessed translations + * and prefetched, so disable pte prefetch if EPT is + * enabled. + */ + if (!shadow_accessed_mask) + return; + + sp = page_header(__pa(sptep)); + if (sp->role.level > PT_PAGE_TABLE_LEVEL) + return; + + __direct_pte_prefetch(vcpu, sp, sptep); +} + static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, int level, gfn_t gfn, pfn_t pfn) { @@ -2102,6 +2203,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL, 0, write, 1, &pt_write, level, gfn, pfn, false, true); + direct_pte_prefetch(vcpu, iterator.sptep); ++vcpu->stat.pf_fixed; break; } diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 51ef9097960..872ff265c91 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -310,6 +310,77 @@ static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu, return r || curr_pte != gw->ptes[level - 1]; } +static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, u64 *sptep) +{ + struct kvm_mmu_page *sp; + pt_element_t gptep[PTE_PREFETCH_NUM]; + gpa_t first_pte_gpa; + int offset = 0, i; + u64 *spte; + + sp = page_header(__pa(sptep)); + + if (sp->role.level > PT_PAGE_TABLE_LEVEL) + return; + + if (sp->role.direct) + return __direct_pte_prefetch(vcpu, sp, sptep); + + i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1); + + if (PTTYPE == 32) + offset = sp->role.quadrant << PT64_LEVEL_BITS; + + first_pte_gpa = gfn_to_gpa(sp->gfn) + + (offset + i) * sizeof(pt_element_t); + + if (kvm_read_guest_atomic(vcpu->kvm, first_pte_gpa, gptep, + sizeof(gptep)) < 0) + return; + + spte = sp->spt + i; + + for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) { + pt_element_t gpte; + unsigned pte_access; + gfn_t gfn; + pfn_t pfn; + bool dirty; + + if (spte == sptep) + continue; + + if (*spte != shadow_trap_nonpresent_pte) + continue; + + gpte = gptep[i]; + + if (!is_present_gpte(gpte) || + is_rsvd_bits_set(vcpu, gpte, PT_PAGE_TABLE_LEVEL)) { + if (!sp->unsync) + __set_spte(spte, shadow_notrap_nonpresent_pte); + continue; + } + + if (!(gpte & PT_ACCESSED_MASK)) + continue; + + pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); + gfn = gpte_to_gfn(gpte); + dirty = is_dirty_gpte(gpte); + pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, + (pte_access & ACC_WRITE_MASK) && dirty); + if (is_error_pfn(pfn)) { + kvm_release_pfn_clean(pfn); + break; + } + + mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, + dirty, NULL, PT_PAGE_TABLE_LEVEL, gfn, + pfn, true, true); + } +} + /* * Fetch a shadow pte for a specific level in the paging hierarchy. 
*/ @@ -391,6 +462,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access, user_fault, write_fault, dirty, ptwrite, it.level, gw->gfn, pfn, false, true); + FNAME(pte_prefetch)(vcpu, it.sptep); return it.sptep; -- cgit v1.2.3-18-g5258 From 189be38db3dde12699a8b9dc22d33e8c95efe110 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Sun, 22 Aug 2010 19:13:33 +0800 Subject: KVM: MMU: combine guest pte read between fetch and pte prefetch Combine guest pte read between guest pte check in the fetch path and pte prefetch Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/paging_tmpl.h | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 872ff265c91..a4e8389df2a 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -67,6 +67,7 @@ struct guest_walker { int level; gfn_t table_gfn[PT_MAX_FULL_LEVELS]; pt_element_t ptes[PT_MAX_FULL_LEVELS]; + pt_element_t prefetch_ptes[PTE_PREFETCH_NUM]; gpa_t pte_gpa[PT_MAX_FULL_LEVELS]; unsigned pt_access; unsigned pte_access; @@ -302,21 +303,33 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu, struct guest_walker *gw, int level) { - int r; pt_element_t curr_pte; - - r = kvm_read_guest_atomic(vcpu->kvm, gw->pte_gpa[level - 1], + gpa_t base_gpa, pte_gpa = gw->pte_gpa[level - 1]; + u64 mask; + int r, index; + + if (level == PT_PAGE_TABLE_LEVEL) { + mask = PTE_PREFETCH_NUM * sizeof(pt_element_t) - 1; + base_gpa = pte_gpa & ~mask; + index = (pte_gpa - base_gpa) / sizeof(pt_element_t); + + r = kvm_read_guest_atomic(vcpu->kvm, base_gpa, + gw->prefetch_ptes, sizeof(gw->prefetch_ptes)); + curr_pte = gw->prefetch_ptes[index]; + } else + r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &curr_pte, sizeof(curr_pte)); + return r || curr_pte != gw->ptes[level - 1]; } -static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, u64 *sptep) +static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, + u64 *sptep) { struct kvm_mmu_page *sp; - pt_element_t gptep[PTE_PREFETCH_NUM]; - gpa_t first_pte_gpa; - int offset = 0, i; + pt_element_t *gptep = gw->prefetch_ptes; u64 *spte; + int i; sp = page_header(__pa(sptep)); @@ -327,17 +340,6 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, u64 *sptep) return __direct_pte_prefetch(vcpu, sp, sptep); i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1); - - if (PTTYPE == 32) - offset = sp->role.quadrant << PT64_LEVEL_BITS; - - first_pte_gpa = gfn_to_gpa(sp->gfn) + - (offset + i) * sizeof(pt_element_t); - - if (kvm_read_guest_atomic(vcpu->kvm, first_pte_gpa, gptep, - sizeof(gptep)) < 0) - return; - spte = sp->spt + i; for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) { @@ -462,7 +464,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access, user_fault, write_fault, dirty, ptwrite, it.level, gw->gfn, pfn, false, true); - FNAME(pte_prefetch)(vcpu, it.sptep); + FNAME(pte_prefetch)(vcpu, gw, it.sptep); return it.sptep; -- cgit v1.2.3-18-g5258 From cc4feed57fcd4934b89aaac51d64dbff921e2f2b Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Wed, 25 Aug 2010 14:10:53 +0800 Subject: KVM: x86 emulator: add CALL FAR instruction emulation (opcode 9a) Signed-off-by: Wei Yongjun Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 2 
+- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 312e798d542..1702ea8a28c 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2455,7 +2455,7 @@ static struct opcode opcode_table[256] = { X8(D(SrcAcc | DstReg)), /* 0x98 - 0x9F */ D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd), - D(SrcImmFAddr | No64), N, + I(SrcImmFAddr | No64, em_call_far), N, D(ImplicitOps | Stack), D(ImplicitOps | Stack), N, N, /* 0xA0 - 0xA7 */ D(ByteOp | DstAcc | SrcMem | Mov | MemAbs), D(DstAcc | SrcMem | Mov | MemAbs), -- cgit v1.2.3-18-g5258 From 6e2fb2cadd9a523ff5494d4c4d53c0d3e0024691 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 25 Aug 2010 12:47:41 +0300 Subject: KVM: x86 emulator: Rename variable that shadows another local variable. Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 1702ea8a28c..42d42ca2c37 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3421,7 +3421,7 @@ writeback: &c->dst); if (c->rep_prefix && (c->d & String)) { - struct read_cache *rc = &ctxt->decode.io_read; + struct read_cache *r = &ctxt->decode.io_read; register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1); /* The second termination condition only applies for REPE * and REPNE. Test if the repeat string operation prefix is @@ -3441,8 +3441,8 @@ writeback: * Re-enter guest when pio read ahead buffer is empty or, * if it is not used, after each 1024 iteration. */ - else if ((rc->end == 0 && !(c->regs[VCPU_REGS_RCX] & 0x3ff)) || - (rc->end != 0 && rc->end == rc->pos)) { + else if ((r->end == 0 && !(c->regs[VCPU_REGS_RCX] & 0x3ff)) || + (r->end != 0 && r->end == r->pos)) { ctxt->restart = false; c->eip = ctxt->eip; } -- cgit v1.2.3-18-g5258 From 3e2f65d57a0c1897fcc3287eeb41f117f4d021e5 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 25 Aug 2010 12:47:42 +0300 Subject: KVM: x86 emulator: move string instruction completion check into separate function Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 42d42ca2c37..3dcbc1d0a59 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2933,6 +2933,28 @@ done: return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; } +static bool string_insn_completed(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + /* The second termination condition only applies for REPE + * and REPNE. 
Test if the repeat string operation prefix is + * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the + * corresponding termination condition according to: + * - if REPE/REPZ and ZF = 0 then done + * - if REPNE/REPNZ and ZF = 1 then done + */ + if (((c->b == 0xa6) || (c->b == 0xa7) || + (c->b == 0xae) || (c->b == 0xaf)) + && (((c->rep_prefix == REPE_PREFIX) && + ((ctxt->eflags & EFLG_ZF) == 0)) + || ((c->rep_prefix == REPNE_PREFIX) && + ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)))) + return true; + + return false; +} + int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) { @@ -3423,19 +3445,8 @@ writeback: if (c->rep_prefix && (c->d & String)) { struct read_cache *r = &ctxt->decode.io_read; register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1); - /* The second termination condition only applies for REPE - * and REPNE. Test if the repeat string operation prefix is - * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the - * corresponding termination condition according to: - * - if REPE/REPZ and ZF = 0 then done - * - if REPNE/REPNZ and ZF = 1 then done - */ - if (((c->b == 0xa6) || (c->b == 0xa7) || - (c->b == 0xae) || (c->b == 0xaf)) - && (((c->rep_prefix == REPE_PREFIX) && - ((ctxt->eflags & EFLG_ZF) == 0)) - || ((c->rep_prefix == REPNE_PREFIX) && - ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)))) + + if (string_insn_completed(ctxt)) ctxt->restart = false; /* * Re-enter guest when pio read ahead buffer is empty or, -- cgit v1.2.3-18-g5258 From d2ddd1c48364e4161052d6089f06b2cf3c50496b Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 25 Aug 2010 12:47:43 +0300 Subject: KVM: x86 emulator: get rid of "restart" in emulation context. x86_emulate_insn() will return 1 if instruction can be restarted without re-entering a guest. Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_emulate.h | 4 +++- arch/x86/kvm/emulate.c | 43 +++++++++++++++++--------------------- arch/x86/kvm/x86.c | 16 +++++++------- 3 files changed, 30 insertions(+), 33 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 1bbf2b6f2a7..1bf11400ae9 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -224,7 +224,6 @@ struct x86_emulate_ctxt { /* interruptibility state, as a result of execution of STI or MOV SS */ int interruptibility; - bool restart; /* restart string instruction after writeback */ bool perm_ok; /* do not check permissions if true */ int exception; /* exception that happens during emulation or -1 */ @@ -255,6 +254,9 @@ struct x86_emulate_ctxt { #endif int x86_decode_insn(struct x86_emulate_ctxt *ctxt); +#define EMULATION_FAILED -1 +#define EMULATION_OK 0 +#define EMULATION_RESTART 1 int x86_emulate_insn(struct x86_emulate_ctxt *ctxt); int emulator_task_switch(struct x86_emulate_ctxt *ctxt, u16 tss_selector, int reason, diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 3dcbc1d0a59..ec35a71d8b5 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -437,7 +437,6 @@ static void emulate_exception(struct x86_emulate_ctxt *ctxt, int vec, ctxt->exception = vec; ctxt->error_code = error; ctxt->error_code_valid = valid; - ctxt->restart = false; } static void emulate_gp(struct x86_emulate_ctxt *ctxt, int err) @@ -2633,9 +2632,6 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt) struct opcode opcode, *g_mod012, *g_mod3; struct operand memop = { .type = OP_NONE }; - /* we cannot decode insn before we complete previous rep insn */ - 
WARN_ON(ctxt->restart); - c->eip = ctxt->eip; c->fetch.start = c->fetch.end = c->eip; ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS); @@ -2985,10 +2981,8 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt) } if (c->rep_prefix && (c->d & String)) { - ctxt->restart = true; /* All REP prefixes have the same first termination condition */ if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) { - ctxt->restart = false; ctxt->eip = c->eip; goto done; } @@ -3446,28 +3440,29 @@ writeback: struct read_cache *r = &ctxt->decode.io_read; register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1); - if (string_insn_completed(ctxt)) - ctxt->restart = false; - /* - * Re-enter guest when pio read ahead buffer is empty or, - * if it is not used, after each 1024 iteration. - */ - else if ((r->end == 0 && !(c->regs[VCPU_REGS_RCX] & 0x3ff)) || - (r->end != 0 && r->end == r->pos)) { - ctxt->restart = false; - c->eip = ctxt->eip; + if (!string_insn_completed(ctxt)) { + /* + * Re-enter guest when pio read ahead buffer is empty + * or, if it is not used, after each 1024 iteration. + */ + if ((r->end != 0 || c->regs[VCPU_REGS_RCX] & 0x3ff) && + (r->end == 0 || r->end != r->pos)) { + /* + * Reset read cache. Usually happens before + * decode, but since instruction is restarted + * we have to do it here. + */ + ctxt->decode.mem_read.end = 0; + return EMULATION_RESTART; + } + goto done; /* skip rip writeback */ } } - /* - * reset read cache here in case string instruction is restared - * without decoding - */ - ctxt->decode.mem_read.end = 0; - if (!ctxt->restart) - ctxt->eip = c->eip; + + ctxt->eip = c->eip; done: - return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; + return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; twobyte_insn: switch (c->b) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d4d33f943d9..bc96ac9ed91 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4181,18 +4181,17 @@ int emulate_instruction(struct kvm_vcpu *vcpu, restart: r = x86_emulate_insn(&vcpu->arch.emulate_ctxt); - if (r) { /* emulation failed */ + if (r == EMULATION_FAILED) { if (reexecute_instruction(vcpu, cr2)) return EMULATE_DONE; return handle_emulation_failure(vcpu); } - r = EMULATE_DONE; - - if (vcpu->arch.emulate_ctxt.exception >= 0) + if (vcpu->arch.emulate_ctxt.exception >= 0) { inject_emulated_exception(vcpu); - else if (vcpu->arch.pio.count) { + r = EMULATE_DONE; + } else if (vcpu->arch.pio.count) { if (!vcpu->arch.pio.in) vcpu->arch.pio.count = 0; r = EMULATE_DO_MMIO; @@ -4200,8 +4199,10 @@ restart: if (vcpu->mmio_is_write) vcpu->mmio_needed = 0; r = EMULATE_DO_MMIO; - } else if (vcpu->arch.emulate_ctxt.restart) + } else if (r == EMULATION_RESTART) goto restart; + else + r = EMULATE_DONE; toggle_interruptibility(vcpu, vcpu->arch.emulate_ctxt.interruptibility); kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); @@ -5100,8 +5101,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (!irqchip_in_kernel(vcpu->kvm)) kvm_set_cr8(vcpu, kvm_run->cr8); - if (vcpu->arch.pio.count || vcpu->mmio_needed || - vcpu->arch.emulate_ctxt.restart) { + if (vcpu->arch.pio.count || vcpu->mmio_needed) { if (vcpu->mmio_needed) { memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); vcpu->mmio_read_completed = 1; -- cgit v1.2.3-18-g5258 From 081bca0e6b87d0c7b9ade7ffee1f44aca336a8fa Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 26 Aug 2010 11:06:15 +0300 Subject: KVM: x86 emulator: refuse SrcMemFAddr (e.g. 
LDS) with register operand SrcMemFAddr is not defined with the modrm operand designating a register instead of a memory address. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index ec35a71d8b5..2b9b0feabdb 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2974,6 +2974,11 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt) goto done; } + if ((c->d & SrcMask) == SrcMemFAddr && c->src.type != OP_MEM) { + emulate_ud(ctxt); + goto done; + } + /* Privileged instruction can be executed only in CPL=0 */ if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) { emulate_gp(ctxt, 0); -- cgit v1.2.3-18-g5258 From 8d8f4e9f66ab36e4fcc75eca1e828af8466309f1 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 26 Aug 2010 11:56:06 +0300 Subject: KVM: x86 emulator: support byte/word opcode pairs Many x86 instructions come in byte and word variants distinguished with bit 0 of the opcode. Add macros to aid in defining them. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 2b9b0feabdb..1a230b5495e 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2330,6 +2330,9 @@ static int em_rdtsc(struct x86_emulate_ctxt *ctxt) #define GD(_f, _g) { .flags = ((_f) | Group | GroupDual), .u.gdual = (_g) } #define I(_f, _e) { .flags = (_f), .u.execute = (_e) } +#define D2bv(_f) D((_f) | ByteOp), D(_f) +#define I2bv(_f, _e) I((_f) | ByteOp, _e), I(_f, _e) + static struct opcode group1[] = { X7(D(Lock)), N }; @@ -2572,6 +2575,9 @@ static struct opcode twobyte_table[256] = { #undef GD #undef I +#undef D2bv +#undef I2bv + static unsigned imm_size(struct decode_cache *c) { unsigned size; -- cgit v1.2.3-18-g5258 From 5315fbb223086c078c979d16734844ccff12f087 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 26 Aug 2010 11:56:07 +0300 Subject: KVM: x86 emulator: simplify ALU block (opcodes 00-3F) decode flags Use the new byte/word dual opcode decode. 
Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 40 ++++++++++++++++------------------------ 1 file changed, 16 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 1a230b5495e..277e667a382 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2385,42 +2385,34 @@ static struct group_dual group9 = { { static struct opcode opcode_table[256] = { /* 0x00 - 0x07 */ - D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), - D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), - D(ByteOp | DstAcc | SrcImm), D(DstAcc | SrcImm), + D2bv(DstMem | SrcReg | ModRM | Lock), D2bv(DstReg | SrcMem | ModRM), + D2bv(DstAcc | SrcImm), D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), /* 0x08 - 0x0F */ - D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), - D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), - D(ByteOp | DstAcc | SrcImm), D(DstAcc | SrcImm), + D2bv(DstMem | SrcReg | ModRM | Lock), D2bv(DstReg | SrcMem | ModRM), + D2bv(DstAcc | SrcImm), D(ImplicitOps | Stack | No64), N, /* 0x10 - 0x17 */ - D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), - D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), - D(ByteOp | DstAcc | SrcImm), D(DstAcc | SrcImm), + D2bv(DstMem | SrcReg | ModRM | Lock), D2bv(DstReg | SrcMem | ModRM), + D2bv(DstAcc | SrcImm), D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), /* 0x18 - 0x1F */ - D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), - D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), - D(ByteOp | DstAcc | SrcImm), D(DstAcc | SrcImm), + D2bv(DstMem | SrcReg | ModRM | Lock), D2bv(DstReg | SrcMem | ModRM), + D2bv(DstAcc | SrcImm), D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), /* 0x20 - 0x27 */ - D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), - D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), - D(ByteOp | DstAcc | SrcImmByte), D(DstAcc | SrcImm), N, N, + D2bv(DstMem | SrcReg | ModRM | Lock), D2bv(DstReg | SrcMem | ModRM), + D2bv(DstAcc | SrcImm), N, N, /* 0x28 - 0x2F */ - D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), - D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), - D(ByteOp | DstAcc | SrcImmByte), D(DstAcc | SrcImm), + D2bv(DstMem | SrcReg | ModRM | Lock), D2bv(DstReg | SrcMem | ModRM), + D2bv(DstAcc | SrcImm), N, I(ByteOp | DstAcc | No64, em_das), /* 0x30 - 0x37 */ - D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), - D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), - D(ByteOp | DstAcc | SrcImmByte), D(DstAcc | SrcImm), N, N, + D2bv(DstMem | SrcReg | ModRM | Lock), D2bv(DstReg | SrcMem | ModRM), + D2bv(DstAcc | SrcImm), N, N, /* 0x38 - 0x3F */ - D(ByteOp | DstMem | SrcReg | ModRM), D(DstMem | SrcReg | ModRM), - D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), - D(ByteOp | DstAcc | SrcImm), D(DstAcc | SrcImm), + D2bv(DstMem | SrcReg | ModRM), D2bv(DstReg | SrcMem | ModRM), + D2bv(DstAcc | SrcImm), N, N, /* 0x40 - 0x4F */ X16(D(DstReg)), -- cgit v1.2.3-18-g5258 From 48fe67b5f7f71bb954dc97b18096cef12f6618b4 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 26 Aug 2010 11:56:08 +0300 Subject: KVM: x86 emulator: simplify string instruction decode flags Use the new byte/word dual opcode decode. 
Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 277e667a382..749322e1d95 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2429,8 +2429,8 @@ static struct opcode opcode_table[256] = { I(DstReg | SrcMem | ModRM | Src2Imm, em_imul_3op), I(SrcImmByte | Mov | Stack, em_push), I(DstReg | SrcMem | ModRM | Src2ImmByte, em_imul_3op), - D(DstDI | ByteOp | Mov | String), D(DstDI | Mov | String), /* insb, insw/insd */ - D(SrcSI | ByteOp | ImplicitOps | String), D(SrcSI | ImplicitOps | String), /* outsb, outsw/outsd */ + D2bv(DstDI | Mov | String), /* insb, insw/insd */ + D2bv(SrcSI | ImplicitOps | String), /* outsb, outsw/outsd */ /* 0x70 - 0x7F */ X16(D(SrcImmByte)), /* 0x80 - 0x87 */ @@ -2454,13 +2454,12 @@ static struct opcode opcode_table[256] = { /* 0xA0 - 0xA7 */ D(ByteOp | DstAcc | SrcMem | Mov | MemAbs), D(DstAcc | SrcMem | Mov | MemAbs), D(ByteOp | DstMem | SrcAcc | Mov | MemAbs), D(DstMem | SrcAcc | Mov | MemAbs), - D(ByteOp | SrcSI | DstDI | Mov | String), D(SrcSI | DstDI | Mov | String), - D(ByteOp | SrcSI | DstDI | String), D(SrcSI | DstDI | String), + D2bv(SrcSI | DstDI | Mov | String), D2bv(SrcSI | DstDI | String), /* 0xA8 - 0xAF */ D(DstAcc | SrcImmByte | ByteOp), D(DstAcc | SrcImm), - D(ByteOp | SrcAcc | DstDI | Mov | String), D(SrcAcc | DstDI | Mov | String), - D(ByteOp | SrcSI | DstAcc | Mov | String), D(SrcSI | DstAcc | Mov | String), - D(ByteOp | SrcAcc | DstDI | String), D(SrcAcc | DstDI | String), + D2bv(SrcAcc | DstDI | Mov | String), + D2bv(SrcSI | DstAcc | Mov | String), + D2bv(SrcAcc | DstDI | String), /* 0xB0 - 0xB7 */ X8(D(ByteOp | DstReg | SrcImm | Mov)), /* 0xB8 - 0xBF */ -- cgit v1.2.3-18-g5258 From 76e8e68d4435bb894a1a03be853a55a4a2b45247 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 26 Aug 2010 11:56:09 +0300 Subject: KVM: x86 emulator: simplify instruction decode flags for opcodes 80-8F Use the new byte/word dual opcode decode. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 749322e1d95..661013fdb3b 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2438,11 +2438,10 @@ static struct opcode opcode_table[256] = { G(DstMem | SrcImm | ModRM | Group, group1), G(ByteOp | DstMem | SrcImm | ModRM | No64 | Group, group1), G(DstMem | SrcImmByte | ModRM | Group, group1), - D(ByteOp | DstMem | SrcReg | ModRM), D(DstMem | SrcReg | ModRM), - D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), + D2bv(DstMem | SrcReg | ModRM), D2bv(DstMem | SrcReg | ModRM | Lock), /* 0x88 - 0x8F */ - D(ByteOp | DstMem | SrcReg | ModRM | Mov), D(DstMem | SrcReg | ModRM | Mov), - D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem | ModRM | Mov), + D2bv(DstMem | SrcReg | ModRM | Mov), + D2bv(DstReg | SrcMem | ModRM | Mov), D(DstMem | SrcNone | ModRM | Mov), D(ModRM | SrcMem | NoAccess | DstReg), D(ImplicitOps | SrcMem16 | ModRM), G(0, group1A), /* 0x90 - 0x97 */ -- cgit v1.2.3-18-g5258 From 50748613d16f55cbf7da14bc6e92b7cb1cd4fa7d Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 26 Aug 2010 11:56:10 +0300 Subject: KVM: x86 emulator: simplify instruction decode flags for opcodes A0-AF Use the new byte/word dual opcode decode. 
Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 661013fdb3b..d59e54bb589 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2451,11 +2451,11 @@ static struct opcode opcode_table[256] = { I(SrcImmFAddr | No64, em_call_far), N, D(ImplicitOps | Stack), D(ImplicitOps | Stack), N, N, /* 0xA0 - 0xA7 */ - D(ByteOp | DstAcc | SrcMem | Mov | MemAbs), D(DstAcc | SrcMem | Mov | MemAbs), - D(ByteOp | DstMem | SrcAcc | Mov | MemAbs), D(DstMem | SrcAcc | Mov | MemAbs), + D2bv(DstAcc | SrcMem | Mov | MemAbs), + D2bv(DstMem | SrcAcc | Mov | MemAbs), D2bv(SrcSI | DstDI | Mov | String), D2bv(SrcSI | DstDI | String), /* 0xA8 - 0xAF */ - D(DstAcc | SrcImmByte | ByteOp), D(DstAcc | SrcImm), + D2bv(DstAcc | SrcImm), D2bv(SrcAcc | DstDI | Mov | String), D2bv(SrcSI | DstAcc | Mov | String), D2bv(SrcAcc | DstDI | String), -- cgit v1.2.3-18-g5258 From d2c6c7adb181eac5b18dbefdf24c0e6745470939 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 26 Aug 2010 11:56:11 +0300 Subject: KVM: x86 emulator: simplify instruction decode flags for opcodes C0-DF Use the new byte/word dual opcode decode. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index d59e54bb589..02566c1283f 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2464,17 +2464,16 @@ static struct opcode opcode_table[256] = { /* 0xB8 - 0xBF */ X8(D(DstReg | SrcImm | Mov)), /* 0xC0 - 0xC7 */ - D(ByteOp | DstMem | SrcImm | ModRM), D(DstMem | SrcImmByte | ModRM), + D2bv(DstMem | SrcImmByte | ModRM), I(ImplicitOps | Stack | SrcImmU16, em_ret_near_imm), D(ImplicitOps | Stack), D(DstReg | SrcMemFAddr | ModRM | No64), D(DstReg | SrcMemFAddr | ModRM | No64), - D(ByteOp | DstMem | SrcImm | ModRM | Mov), D(DstMem | SrcImm | ModRM | Mov), + D2bv(DstMem | SrcImm | ModRM | Mov), /* 0xC8 - 0xCF */ N, N, N, D(ImplicitOps | Stack), D(ImplicitOps), D(SrcImmByte), D(ImplicitOps | No64), D(ImplicitOps), /* 0xD0 - 0xD7 */ - D(ByteOp | DstMem | SrcOne | ModRM), D(DstMem | SrcOne | ModRM), - D(ByteOp | DstMem | ModRM), D(DstMem | ModRM), + D2bv(DstMem | SrcOne | ModRM), D2bv(DstMem | ModRM), N, N, N, N, /* 0xD8 - 0xDF */ N, N, N, N, N, N, N, N, -- cgit v1.2.3-18-g5258 From d269e3961a65bbf6a76a8dc37b70cb578216e2c0 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 26 Aug 2010 11:56:12 +0300 Subject: KVM: x86 emulator: simplify instruction decode flags for opcodes E0-FF Use the new byte/word dual opcode decode. 
Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 02566c1283f..b43572afce3 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2479,13 +2479,11 @@ static struct opcode opcode_table[256] = { N, N, N, N, N, N, N, N, /* 0xE0 - 0xE7 */ X4(D(SrcImmByte)), - D(ByteOp | SrcImmUByte | DstAcc), D(SrcImmUByte | DstAcc), - D(ByteOp | SrcAcc | DstImmUByte), D(SrcAcc | DstImmUByte), + D2bv(SrcImmUByte | DstAcc), D2bv(SrcAcc | DstImmUByte), /* 0xE8 - 0xEF */ D(SrcImm | Stack), D(SrcImm | ImplicitOps), D(SrcImmFAddr | No64), D(SrcImmByte | ImplicitOps), - D(SrcNone | ByteOp | DstAcc), D(SrcNone | DstAcc), - D(ByteOp | SrcAcc | ImplicitOps), D(SrcAcc | ImplicitOps), + D2bv(SrcNone | DstAcc), D2bv(SrcAcc | ImplicitOps), /* 0xF0 - 0xF7 */ N, N, N, N, D(ImplicitOps | Priv), D(ImplicitOps), G(ByteOp, group3), G(0, group3), -- cgit v1.2.3-18-g5258 From 739ae406068211b235b488f247aab349e486c382 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 26 Aug 2010 11:56:13 +0300 Subject: KVM: x86 emulator: simplify instruction decode flags for opcodes 0F 00-FF Use the new byte/word dual opcode decode. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index b43572afce3..58e715cb517 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2534,7 +2534,7 @@ static struct opcode twobyte_table[256] = { D(DstMem | SrcReg | Src2CL | ModRM), D(ModRM), I(DstReg | SrcMem | ModRM, em_imul), /* 0xB0 - 0xB7 */ - D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), + D2bv(DstMem | SrcReg | ModRM | Lock), D(DstReg | SrcMemFAddr | ModRM), D(DstMem | SrcReg | ModRM | BitOp | Lock), D(DstReg | SrcMemFAddr | ModRM), D(DstReg | SrcMemFAddr | ModRM), D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), @@ -2544,7 +2544,7 @@ static struct opcode twobyte_table[256] = { D(DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), /* 0xC0 - 0xCF */ - D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), + D2bv(DstMem | SrcReg | ModRM | Lock), N, D(DstMem | SrcReg | ModRM | Mov), N, N, N, GD(0, &group9), N, N, N, N, N, N, N, N, -- cgit v1.2.3-18-g5258 From f6b3597bded9ed261b42fdcb5e741489cb5ccbfe Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 26 Aug 2010 11:59:00 +0300 Subject: KVM: x86 emulator: add macros for executing instructions that may trap Like DIV and IDIV. 
Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 58e715cb517..e96cce17022 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -331,6 +331,27 @@ struct group_dual { "a" (_rax), "d" (_rdx)); \ } while (0) +#define __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, _eflags, _suffix, _ex) \ + do { \ + unsigned long _tmp; \ + \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0", "5", "1") \ + "1: \n\t" \ + _op _suffix " %6; " \ + "2: \n\t" \ + _POST_EFLAGS("0", "5", "1") \ + ".pushsection .fixup,\"ax\" \n\t" \ + "3: movb $1, %4 \n\t" \ + "jmp 2b \n\t" \ + ".popsection \n\t" \ + _ASM_EXTABLE(1b, 3b) \ + : "=m" (_eflags), "=&r" (_tmp), \ + "+a" (_rax), "+d" (_rdx), "+qm"(_ex) \ + : "i" (EFLAGS_MASK), "m" ((_src).val), \ + "a" (_rax), "d" (_rdx)); \ + } while (0) + /* instruction has only one source operand, destination is implicit (e.g. mul, div, imul, idiv) */ #define emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags) \ do { \ @@ -342,6 +363,28 @@ struct group_dual { } \ } while (0) +#define emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, _eflags, _ex) \ + do { \ + switch((_src).bytes) { \ + case 1: \ + __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \ + _eflags, "b", _ex); \ + break; \ + case 2: \ + __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \ + _eflags, "w", _ex); \ + break; \ + case 4: \ + __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \ + _eflags, "l", _ex); \ + break; \ + case 8: ON64( \ + __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \ + _eflags, "q", _ex)); \ + break; \ + } \ + } while (0) + /* Fetch next part of the instruction being emulated. */ #define insn_fetch(_type, _size, _eip) \ ({ unsigned long _x; \ -- cgit v1.2.3-18-g5258 From 34d1f4905eb66478a890ea808ec58bc842e6e589 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 26 Aug 2010 11:59:01 +0300 Subject: KVM: x86 emulator: trap and propagate #DE from DIV and IDIV Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index e96cce17022..917b9b50fab 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -504,6 +504,12 @@ static void emulate_ts(struct x86_emulate_ctxt *ctxt, int err) emulate_exception(ctxt, TS_VECTOR, err, true); } +static int emulate_de(struct x86_emulate_ctxt *ctxt) +{ + emulate_exception(ctxt, DE_VECTOR, 0, false); + return X86EMUL_PROPAGATE_FAULT; +} + static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, unsigned long eip, u8 *dest) @@ -1458,6 +1464,7 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt, struct decode_cache *c = &ctxt->decode; unsigned long *rax = &c->regs[VCPU_REGS_RAX]; unsigned long *rdx = &c->regs[VCPU_REGS_RDX]; + u8 de = 0; switch (c->modrm_reg) { case 0 ... 
1: /* test */ @@ -1476,14 +1483,18 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt, emulate_1op_rax_rdx("imul", c->src, *rax, *rdx, ctxt->eflags); break; case 6: /* div */ - emulate_1op_rax_rdx("div", c->src, *rax, *rdx, ctxt->eflags); + emulate_1op_rax_rdx_ex("div", c->src, *rax, *rdx, + ctxt->eflags, de); break; case 7: /* idiv */ - emulate_1op_rax_rdx("idiv", c->src, *rax, *rdx, ctxt->eflags); + emulate_1op_rax_rdx_ex("idiv", c->src, *rax, *rdx, + ctxt->eflags, de); break; default: return X86EMUL_UNHANDLEABLE; } + if (de) + return emulate_de(ctxt); return X86EMUL_CONTINUE; } @@ -3413,8 +3424,9 @@ special_insn: ctxt->eflags ^= EFLG_CF; break; case 0xf6 ... 0xf7: /* Grp3 */ - if (emulate_grp3(ctxt, ops) != X86EMUL_CONTINUE) - goto cannot_emulate; + rc = emulate_grp3(ctxt, ops); + if (rc != X86EMUL_CONTINUE) + goto done; break; case 0xf8: /* clc */ ctxt->eflags &= ~EFLG_CF; -- cgit v1.2.3-18-g5258 From 217fc9cfca21a0bc2f4246183ebd8ee9863b019d Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 26 Aug 2010 13:38:03 +0300 Subject: KVM: Fix build error due to 64-bit division in nsec_to_cycles() Use do_div() instead. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bc96ac9ed91..bdba1d09a97 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -56,6 +56,7 @@ #include #include #include +#include #define MAX_IO_MSRS 256 #define CR0_RESERVED_BITS \ @@ -917,11 +918,15 @@ static inline int kvm_tsc_changes_freq(void) static inline u64 nsec_to_cycles(u64 nsec) { + u64 ret; + WARN_ON(preemptible()); if (kvm_tsc_changes_freq()) printk_once(KERN_WARNING "kvm: unreliable cycle conversion on adjustable rate TSC\n"); - return (nsec * __get_cpu_var(cpu_tsc_khz)) / USEC_PER_SEC; + ret = nsec * __get_cpu_var(cpu_tsc_khz); + do_div(ret, USEC_PER_SEC); + return ret; } void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) -- cgit v1.2.3-18-g5258 From 6230f7fc0453c5bc5daa8e053773021e1c4a2f16 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 26 Aug 2010 18:34:55 +0300 Subject: KVM: x86 emulator: simplify ALU opcode block decode further The ALU opcode block is very regular; introduce D6ALU() to define decode flags for 6 instructions at a time. Suggested by Paolo Bonzini. 
Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 917b9b50fab..8bfa3e3aa71 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2387,6 +2387,11 @@ static int em_rdtsc(struct x86_emulate_ctxt *ctxt) #define D2bv(_f) D((_f) | ByteOp), D(_f) #define I2bv(_f, _e) I((_f) | ByteOp, _e), I(_f, _e) +#define D6ALU(_f) D2bv((_f) | DstMem | SrcReg | ModRM), \ + D2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock), \ + D2bv(((_f) & ~Lock) | DstAcc | SrcImm) + + static struct opcode group1[] = { X7(D(Lock)), N }; @@ -2439,35 +2444,25 @@ static struct group_dual group9 = { { static struct opcode opcode_table[256] = { /* 0x00 - 0x07 */ - D2bv(DstMem | SrcReg | ModRM | Lock), D2bv(DstReg | SrcMem | ModRM), - D2bv(DstAcc | SrcImm), + D6ALU(Lock), D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), /* 0x08 - 0x0F */ - D2bv(DstMem | SrcReg | ModRM | Lock), D2bv(DstReg | SrcMem | ModRM), - D2bv(DstAcc | SrcImm), + D6ALU(Lock), D(ImplicitOps | Stack | No64), N, /* 0x10 - 0x17 */ - D2bv(DstMem | SrcReg | ModRM | Lock), D2bv(DstReg | SrcMem | ModRM), - D2bv(DstAcc | SrcImm), + D6ALU(Lock), D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), /* 0x18 - 0x1F */ - D2bv(DstMem | SrcReg | ModRM | Lock), D2bv(DstReg | SrcMem | ModRM), - D2bv(DstAcc | SrcImm), + D6ALU(Lock), D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), /* 0x20 - 0x27 */ - D2bv(DstMem | SrcReg | ModRM | Lock), D2bv(DstReg | SrcMem | ModRM), - D2bv(DstAcc | SrcImm), N, N, + D6ALU(Lock), N, N, /* 0x28 - 0x2F */ - D2bv(DstMem | SrcReg | ModRM | Lock), D2bv(DstReg | SrcMem | ModRM), - D2bv(DstAcc | SrcImm), - N, I(ByteOp | DstAcc | No64, em_das), + D6ALU(Lock), N, I(ByteOp | DstAcc | No64, em_das), /* 0x30 - 0x37 */ - D2bv(DstMem | SrcReg | ModRM | Lock), D2bv(DstReg | SrcMem | ModRM), - D2bv(DstAcc | SrcImm), N, N, + D6ALU(Lock), N, N, /* 0x38 - 0x3F */ - D2bv(DstMem | SrcReg | ModRM), D2bv(DstReg | SrcMem | ModRM), - D2bv(DstAcc | SrcImm), - N, N, + D6ALU(0), N, N, /* 0x40 - 0x4F */ X16(D(DstReg)), /* 0x50 - 0x57 */ @@ -2618,6 +2613,7 @@ static struct opcode twobyte_table[256] = { #undef D2bv #undef I2bv +#undef D6ALU static unsigned imm_size(struct decode_cache *c) { -- cgit v1.2.3-18-g5258 From 23e7a7944f3779155e2f6bbc831b544eb925f387 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Fri, 27 Aug 2010 17:15:06 +0800 Subject: KVM: pit: Do not check pending pit timer in vcpu thread Pit interrupt injection was done by workqueue, so no need to check pending pit timer in vcpu thread which could lead unnecessary unblocking of vcpu. 
Signed-off-by: Jason Wang Signed-off-by: Avi Kivity --- arch/x86/kvm/i8254.c | 9 --------- arch/x86/kvm/irq.c | 7 +------ 2 files changed, 1 insertion(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index ddeb2314b52..2ad40a4ddc3 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -232,15 +232,6 @@ static void pit_latch_status(struct kvm *kvm, int channel) } } -int pit_has_pending_timer(struct kvm_vcpu *vcpu) -{ - struct kvm_pit *pit = vcpu->kvm->arch.vpit; - - if (pit && kvm_vcpu_is_bsp(vcpu) && pit->pit_state.irq_ack) - return atomic_read(&pit->pit_state.pit_timer.pending); - return 0; -} - static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian) { struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state, diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 2095a049835..f994da40ad9 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -33,12 +33,7 @@ */ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) { - int ret; - - ret = pit_has_pending_timer(vcpu); - ret |= apic_has_pending_timer(vcpu); - - return ret; + return apic_has_pending_timer(vcpu); } EXPORT_SYMBOL(kvm_cpu_has_pending_timer); -- cgit v1.2.3-18-g5258 From 9ad17b10011702cb56c5e32e41ecd5fe281c3574 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Sat, 28 Aug 2010 19:19:42 +0800 Subject: KVM: MMU: fix compile warning in audit code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fix: arch/x86/kvm/mmu.c: In function ‘kvm_mmu_unprotect_page’: arch/x86/kvm/mmu.c:1741: warning: format ‘%lx’ expects type ‘long unsigned int’, but argument 3 has type ‘gfn_t’ arch/x86/kvm/mmu.c:1745: warning: format ‘%lx’ expects type ‘long unsigned int’, but argument 3 has type ‘gfn_t’ arch/x86/kvm/mmu.c: In function ‘mmu_unshadow’: arch/x86/kvm/mmu.c:1761: warning: format ‘%lx’ expects type ‘long unsigned int’, but argument 3 has type ‘gfn_t’ arch/x86/kvm/mmu.c: In function ‘set_spte’: arch/x86/kvm/mmu.c:2005: warning: format ‘%lx’ expects type ‘long unsigned int’, but argument 3 has type ‘gfn_t’ arch/x86/kvm/mmu.c: In function ‘mmu_set_spte’: arch/x86/kvm/mmu.c:2033: warning: format ‘%lx’ expects type ‘long unsigned int’, but argument 7 has type ‘gfn_t’ Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index b0037a77e56..59bf1d9553a 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1738,11 +1738,11 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) LIST_HEAD(invalid_list); int r; - pgprintk("%s: looking for gfn %lx\n", __func__, gfn); + pgprintk("%s: looking for gfn %llx\n", __func__, gfn); r = 0; for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) { - pgprintk("%s: gfn %lx role %x\n", __func__, gfn, + pgprintk("%s: gfn %llx role %x\n", __func__, gfn, sp->role.word); r = 1; kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); @@ -1758,7 +1758,7 @@ static void mmu_unshadow(struct kvm *kvm, gfn_t gfn) LIST_HEAD(invalid_list); for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) { - pgprintk("%s: zap %lx %x\n", + pgprintk("%s: zap %llx %x\n", __func__, gfn, sp->role.word); kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); } @@ -2002,7 +2002,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, goto set_pte; if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { - pgprintk("%s: found shadow page for %lx, marking 
ro\n", + pgprintk("%s: found shadow page for %llx, marking ro\n", __func__, gfn); ret = 1; pte_access &= ~ACC_WRITE_MASK; @@ -2031,7 +2031,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, int rmap_count; pgprintk("%s: spte %llx access %x write_fault %d" - " user_fault %d gfn %lx\n", + " user_fault %d gfn %llx\n", __func__, *sptep, pt_access, write_fault, user_fault, gfn); @@ -2050,7 +2050,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, __set_spte(sptep, shadow_trap_nonpresent_pte); kvm_flush_remote_tlbs(vcpu->kvm); } else if (pfn != spte_to_pfn(*sptep)) { - pgprintk("hfn old %lx new %lx\n", + pgprintk("hfn old %llx new %llx\n", spte_to_pfn(*sptep), pfn); drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); kvm_flush_remote_tlbs(vcpu->kvm); @@ -2067,7 +2067,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, } pgprintk("%s: setting spte %llx\n", __func__, *sptep); - pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n", + pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n", is_large_pte(*sptep)? "2MB" : "4kB", *sptep & PT_PRESENT_MASK ?"RW":"R", gfn, *sptep, sptep); @@ -3651,9 +3651,9 @@ void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) if (!gfn_to_memslot(kvm, gfn)) { if (!printk_ratelimit()) return; - printk(KERN_ERR "%s: no memslot for gfn %ld\n", + printk(KERN_ERR "%s: no memslot for gfn %llx\n", audit_msg, gfn); - printk(KERN_ERR "%s: index %ld of sp (gfn=%lx)\n", + printk(KERN_ERR "%s: index %ld of sp (gfn=%llx)\n", audit_msg, (long int)(sptep - rev_sp->spt), rev_sp->gfn); dump_stack(); @@ -3728,7 +3728,7 @@ static void audit_write_protection(struct kvm_vcpu *vcpu) while (spte) { if (is_writable_pte(*spte)) printk(KERN_ERR "%s: (%s) shadow page has " - "writable mappings: gfn %lx role %x\n", + "writable mappings: gfn %llx role %x\n", __func__, audit_msg, sp->gfn, sp->role.word); spte = rmap_next(vcpu->kvm, rmapp, spte); -- cgit v1.2.3-18-g5258 From 0beb8d660425aab339ff68e6f4d4528739e8fc4f Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Sat, 28 Aug 2010 19:20:47 +0800 Subject: KVM: MMU: check rmap for every spte The read-only spte also has reverse mapping, so fix the code to check them, also modify the function name to fit its doing Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 57 +++++++++++++++++++++++++----------------------------- 1 file changed, 26 insertions(+), 31 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 59bf1d9553a..1c784b96dac 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3644,40 +3644,38 @@ void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) struct kvm_mmu_page *rev_sp; gfn_t gfn; - if (is_writable_pte(*sptep)) { - rev_sp = page_header(__pa(sptep)); - gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt); - if (!gfn_to_memslot(kvm, gfn)) { - if (!printk_ratelimit()) - return; - printk(KERN_ERR "%s: no memslot for gfn %llx\n", - audit_msg, gfn); - printk(KERN_ERR "%s: index %ld of sp (gfn=%llx)\n", - audit_msg, (long int)(sptep - rev_sp->spt), - rev_sp->gfn); - dump_stack(); - return; - } + rev_sp = page_header(__pa(sptep)); + gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt); - rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level); - if (!*rmapp) { - if (!printk_ratelimit()) - return; - printk(KERN_ERR "%s: no rmap for writable spte %llx\n", - audit_msg, *sptep); - dump_stack(); - } + if (!gfn_to_memslot(kvm, gfn)) { + if (!printk_ratelimit()) + return; + printk(KERN_ERR "%s: no memslot 
for gfn %llx\n", + audit_msg, gfn); + printk(KERN_ERR "%s: index %ld of sp (gfn=%llx)\n", + audit_msg, (long int)(sptep - rev_sp->spt), + rev_sp->gfn); + dump_stack(); + return; } + rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level); + if (!*rmapp) { + if (!printk_ratelimit()) + return; + printk(KERN_ERR "%s: no rmap for writable spte %llx\n", + audit_msg, *sptep); + dump_stack(); + } } -void audit_writable_sptes_have_rmaps(struct kvm_vcpu *vcpu) +void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu) { mmu_spte_walk(vcpu, inspect_spte_has_rmap); } -static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu) +static void check_mappings_rmap(struct kvm_vcpu *vcpu) { struct kvm_mmu_page *sp; int i; @@ -3689,12 +3687,9 @@ static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu) continue; for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { - u64 ent = pt[i]; - - if (!(ent & PT_PRESENT_MASK)) - continue; - if (!is_writable_pte(ent)) + if (!is_rmap_spte(pt[i])) continue; + inspect_spte_has_rmap(vcpu->kvm, &pt[i]); } } @@ -3703,7 +3698,7 @@ static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu) static void audit_rmap(struct kvm_vcpu *vcpu) { - check_writable_mappings_rmap(vcpu); + check_mappings_rmap(vcpu); count_rmaps(vcpu); } @@ -3746,7 +3741,7 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) audit_write_protection(vcpu); if (strcmp("pre pte write", audit_msg) != 0) audit_mappings(vcpu); - audit_writable_sptes_have_rmaps(vcpu); + audit_sptes_have_rmaps(vcpu); dbg = olddbg; } -- cgit v1.2.3-18-g5258 From bc32ce2152406431acf4daf4a81dc1664bb7b91b Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Sat, 28 Aug 2010 19:22:46 +0800 Subject: KVM: MMU: fix wrong not write protected sp report The audit code reports some sp not write protected in current code, it's just the bug in audit_write_protection(), since: - the invalid sp not need write protected - using uninitialize local variable('gfn') - call kvm_mmu_audit() out of mmu_lock's protection Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 5 +++-- arch/x86/kvm/paging_tmpl.h | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 1c784b96dac..68575dc32ec 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3708,16 +3708,17 @@ static void audit_write_protection(struct kvm_vcpu *vcpu) struct kvm_memory_slot *slot; unsigned long *rmapp; u64 *spte; - gfn_t gfn; list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) { if (sp->role.direct) continue; if (sp->unsync) continue; + if (sp->role.invalid) + continue; slot = gfn_to_memslot(vcpu->kvm, sp->gfn); - rmapp = &slot->rmap[gfn - slot->base_gfn]; + rmapp = &slot->rmap[sp->gfn - slot->base_gfn]; spte = rmap_next(vcpu->kvm, rmapp, NULL); while (spte) { diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index a4e8389df2a..a0f2febf569 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -504,7 +504,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, unsigned long mmu_seq; pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); - kvm_mmu_audit(vcpu, "pre page fault"); r = mmu_topup_memory_caches(vcpu); if (r) @@ -542,6 +541,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, spin_lock(&vcpu->kvm->mmu_lock); if (mmu_notifier_retry(vcpu, mmu_seq)) goto out_unlock; + + kvm_mmu_audit(vcpu, "pre page fault"); kvm_mmu_free_some_pages(vcpu); sptep = FNAME(fetch)(vcpu, addr, 
&walker, user_fault, write_fault, level, &write_pt, pfn); -- cgit v1.2.3-18-g5258 From 365fb3fdf6769d3553999d8eb6cc2a8c56c747c1 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Sat, 28 Aug 2010 19:24:13 +0800 Subject: KVM: MMU: rewrite audit_mappings_page() function There is a bugs in this function, we call gfn_to_pfn() and kvm_mmu_gva_to_gpa_read() in atomic context(kvm_mmu_audit() is called under the spinlock(mmu_lock)'s protection). This patch fix it by: - introduce gfn_to_pfn_atomic instead of gfn_to_pfn - get the mapping gfn from kvm_mmu_page_get_gfn() And it adds 'notrap' ptes check in unsync/direct sps Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 75 +++++++++++++++++++++++++++++------------------------- 1 file changed, 40 insertions(+), 35 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 68575dc32ec..0d91f60af1a 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3487,15 +3487,6 @@ EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy); static const char *audit_msg; -static gva_t canonicalize(gva_t gva) -{ -#ifdef CONFIG_X86_64 - gva = (long long)(gva << 16) >> 16; -#endif - return gva; -} - - typedef void (*inspect_spte_fn) (struct kvm *kvm, u64 *sptep); static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp, @@ -3550,39 +3541,53 @@ static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte, gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1)); for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) { - u64 ent = pt[i]; + u64 *sptep = pt + i; + struct kvm_mmu_page *sp; + gfn_t gfn; + pfn_t pfn; + hpa_t hpa; - if (ent == shadow_trap_nonpresent_pte) - continue; + sp = page_header(__pa(sptep)); - va = canonicalize(va); - if (is_shadow_present_pte(ent) && !is_last_spte(ent, level)) - audit_mappings_page(vcpu, ent, va, level - 1); - else { - gpa_t gpa = kvm_mmu_gva_to_gpa_read(vcpu, va, NULL); - gfn_t gfn = gpa >> PAGE_SHIFT; - pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn); - hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT; + if (sp->unsync) { + if (level != PT_PAGE_TABLE_LEVEL) { + printk(KERN_ERR "audit: (%s) error: unsync sp: %p level = %d\n", + audit_msg, sp, level); + return; + } - if (is_error_pfn(pfn)) { - kvm_release_pfn_clean(pfn); - continue; + if (*sptep == shadow_notrap_nonpresent_pte) { + printk(KERN_ERR "audit: (%s) error: notrap spte in unsync sp: %p\n", + audit_msg, sp); + return; } + } - if (is_shadow_present_pte(ent) - && (ent & PT64_BASE_ADDR_MASK) != hpa) - printk(KERN_ERR "xx audit error: (%s) levels %d" - " gva %lx gpa %llx hpa %llx ent %llx %d\n", - audit_msg, vcpu->arch.mmu.root_level, - va, gpa, hpa, ent, - is_shadow_present_pte(ent)); - else if (ent == shadow_notrap_nonpresent_pte - && !is_error_hpa(hpa)) - printk(KERN_ERR "audit: (%s) notrap shadow," - " valid guest gva %lx\n", audit_msg, va); - kvm_release_pfn_clean(pfn); + if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) { + printk(KERN_ERR "audit: (%s) error: notrap spte in direct sp: %p\n", + audit_msg, sp); + return; + } + + if (!is_shadow_present_pte(*sptep) || + !is_last_spte(*sptep, level)) + return; + + gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); + pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn); + if (is_error_pfn(pfn)) { + kvm_release_pfn_clean(pfn); + return; } + + hpa = pfn << PAGE_SHIFT; + + if ((*sptep & PT64_BASE_ADDR_MASK) != hpa) + printk(KERN_ERR "xx audit error: (%s) levels %d" + " gva %lx pfn %llx hpa %llx ent %llxn", + audit_msg, vcpu->arch.mmu.root_level, + va, pfn, hpa, *sptep); } } 
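One detail worth noting in the hunk above: the lookup switches to gfn_to_pfn_atomic() because kvm_mmu_audit() runs under the mmu_lock spinlock, where the sleeping gfn_to_pfn() must not be called. A hypothetical caller (not part of the patch) illustrating the rule:

    /* Hypothetical helper: resolving a pfn while kvm->mmu_lock is held. */
    static pfn_t audit_resolve_pfn(struct kvm *kvm, gfn_t gfn)
    {
            /* caller holds kvm->mmu_lock, so the atomic variant is required */
            return gfn_to_pfn_atomic(kvm, gfn);
    }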
-- cgit v1.2.3-18-g5258 From 8e0e8afa82018a3c751ea474eb47dfc65f00f4c3 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Sat, 28 Aug 2010 19:25:09 +0800 Subject: KVM: MMU: remove count_rmaps() Nothing is checked in count_rmaps(), so remove it Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 38 -------------------------------------- 1 file changed, 38 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 0d91f60af1a..0bff4d54817 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3606,43 +3606,6 @@ static void audit_mappings(struct kvm_vcpu *vcpu) 2); } -static int count_rmaps(struct kvm_vcpu *vcpu) -{ - struct kvm *kvm = vcpu->kvm; - struct kvm_memslots *slots; - int nmaps = 0; - int i, j, k, idx; - - idx = srcu_read_lock(&kvm->srcu); - slots = kvm_memslots(kvm); - for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { - struct kvm_memory_slot *m = &slots->memslots[i]; - struct kvm_rmap_desc *d; - - for (j = 0; j < m->npages; ++j) { - unsigned long *rmapp = &m->rmap[j]; - - if (!*rmapp) - continue; - if (!(*rmapp & 1)) { - ++nmaps; - continue; - } - d = (struct kvm_rmap_desc *)(*rmapp & ~1ul); - while (d) { - for (k = 0; k < RMAP_EXT; ++k) - if (d->sptes[k]) - ++nmaps; - else - break; - d = d->more; - } - } - } - srcu_read_unlock(&kvm->srcu, idx); - return nmaps; -} - void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) { unsigned long *rmapp; @@ -3704,7 +3667,6 @@ static void check_mappings_rmap(struct kvm_vcpu *vcpu) static void audit_rmap(struct kvm_vcpu *vcpu) { check_mappings_rmap(vcpu); - count_rmaps(vcpu); } static void audit_write_protection(struct kvm_vcpu *vcpu) -- cgit v1.2.3-18-g5258 From c41a15dd4632499b9c1a00871e160276999767d9 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 30 Aug 2010 10:46:56 +0300 Subject: KVM: Fix pio trace direction out = write, in = read, not the other way round. Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bdba1d09a97..d0ba857cd7c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3743,7 +3743,7 @@ static int emulator_pio_in_emulated(int size, unsigned short port, void *val, if (vcpu->arch.pio.count) goto data_avail; - trace_kvm_pio(1, port, size, 1); + trace_kvm_pio(0, port, size, 1); vcpu->arch.pio.port = port; vcpu->arch.pio.in = 1; @@ -3771,7 +3771,7 @@ static int emulator_pio_out_emulated(int size, unsigned short port, const void *val, unsigned int count, struct kvm_vcpu *vcpu) { - trace_kvm_pio(0, port, size, 1); + trace_kvm_pio(1, port, size, 1); vcpu->arch.pio.port = port; vcpu->arch.pio.in = 0; -- cgit v1.2.3-18-g5258 From 678041ad9dc82eedc598f709e8a3d620139d4105 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 31 Aug 2010 19:13:13 -0300 Subject: KVM: SVM: reset mmu context in init_vmcb Since commit aad827034e419fa no mmu reinitialization is performed via init_vmcb. Zero vcpu->arch.cr0 and pass the reset value as a parameter to kvm_set_cr0. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index ff28f652106..60bc1e53d23 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -827,8 +827,8 @@ static void init_vmcb(struct vcpu_svm *svm) * This is the guest-visible cr0 value. * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0. 
*/ - svm->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; - (void)kvm_set_cr0(&svm->vcpu, svm->vcpu.arch.cr0); + svm->vcpu.arch.cr0 = 0; + (void)kvm_set_cr0(&svm->vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET); save->cr4 = X86_CR4_PAE; /* rdx = ?? */ -- cgit v1.2.3-18-g5258 From eaa48512ba9df32aab8be5fceec10f3f80369379 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 31 Aug 2010 19:13:14 -0300 Subject: KVM: SVM: init_vmcb should reset vcpu->efer Otherwise EFER_LMA bit is retained across a SIPI reset. Fixes guest cpu onlining. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 60bc1e53d23..a1a83b955ed 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -816,7 +816,7 @@ static void init_vmcb(struct vcpu_svm *svm) init_sys_seg(&save->ldtr, SEG_TYPE_LDT); init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16); - save->efer = EFER_SVME; + svm_set_efer(&svm->vcpu, 0); save->dr6 = 0xffff0ff0; save->dr7 = 0x400; save->rflags = 2; -- cgit v1.2.3-18-g5258 From e90aa41e6ca76cd7be021d4d5560e64954cd4585 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 1 Sep 2010 10:23:35 +0300 Subject: KVM: Don't save/restore MSR_IA32_PERF_STATUS It is read/only; restoring it only results in annoying messages. Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d0ba857cd7c..1c972382e5d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -739,7 +739,7 @@ static u32 msrs_to_save[] = { #ifdef CONFIG_X86_64 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, #endif - MSR_IA32_TSC, MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA + MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA }; static unsigned num_msrs_to_save; -- cgit v1.2.3-18-g5258 From b9eac5f4d146dc6cb88c8e6d891f8abe60493338 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 3 Aug 2010 14:46:56 +0300 Subject: KVM: x86 emulator: use single stage decoding for mov instructions Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 42 ++++++++++++++++++------------------------ 1 file changed, 18 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 8bfa3e3aa71..c0715ae05a5 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2378,6 +2378,13 @@ static int em_rdtsc(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } +static int em_mov(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + c->dst.val = c->src.val; + return X86EMUL_CONTINUE; +} + #define D(_y) { .flags = (_y) } #define N D(0) #define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) } @@ -2489,8 +2496,8 @@ static struct opcode opcode_table[256] = { G(DstMem | SrcImmByte | ModRM | Group, group1), D2bv(DstMem | SrcReg | ModRM), D2bv(DstMem | SrcReg | ModRM | Lock), /* 0x88 - 0x8F */ - D2bv(DstMem | SrcReg | ModRM | Mov), - D2bv(DstReg | SrcMem | ModRM | Mov), + I2bv(DstMem | SrcReg | ModRM | Mov, em_mov), + I2bv(DstReg | SrcMem | ModRM | Mov, em_mov), D(DstMem | SrcNone | ModRM | Mov), D(ModRM | SrcMem | NoAccess | DstReg), D(ImplicitOps | SrcMem16 | ModRM), G(0, group1A), /* 0x90 - 0x97 */ @@ -2500,24 +2507,25 @@ static struct opcode opcode_table[256] = { I(SrcImmFAddr | No64, em_call_far), N, D(ImplicitOps | Stack), D(ImplicitOps | Stack), N, N, /* 0xA0 - 0xA7 */ 
- D2bv(DstAcc | SrcMem | Mov | MemAbs), - D2bv(DstMem | SrcAcc | Mov | MemAbs), - D2bv(SrcSI | DstDI | Mov | String), D2bv(SrcSI | DstDI | String), + I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov), + I2bv(DstMem | SrcAcc | Mov | MemAbs, em_mov), + I2bv(SrcSI | DstDI | Mov | String, em_mov), + D2bv(SrcSI | DstDI | String), /* 0xA8 - 0xAF */ D2bv(DstAcc | SrcImm), - D2bv(SrcAcc | DstDI | Mov | String), - D2bv(SrcSI | DstAcc | Mov | String), + I2bv(SrcAcc | DstDI | Mov | String, em_mov), + I2bv(SrcSI | DstAcc | Mov | String, em_mov), D2bv(SrcAcc | DstDI | String), /* 0xB0 - 0xB7 */ - X8(D(ByteOp | DstReg | SrcImm | Mov)), + X8(I(ByteOp | DstReg | SrcImm | Mov, em_mov)), /* 0xB8 - 0xBF */ - X8(D(DstReg | SrcImm | Mov)), + X8(I(DstReg | SrcImm | Mov, em_mov)), /* 0xC0 - 0xC7 */ D2bv(DstMem | SrcImmByte | ModRM), I(ImplicitOps | Stack | SrcImmU16, em_ret_near_imm), D(ImplicitOps | Stack), D(DstReg | SrcMemFAddr | ModRM | No64), D(DstReg | SrcMemFAddr | ModRM | No64), - D2bv(DstMem | SrcImm | ModRM | Mov), + I2bv(DstMem | SrcImm | ModRM | Mov, em_mov), /* 0xC8 - 0xCF */ N, N, N, D(ImplicitOps | Stack), D(ImplicitOps), D(SrcImmByte), D(ImplicitOps | No64), D(ImplicitOps), @@ -3212,8 +3220,6 @@ special_insn: c->dst.val = c->src.orig_val; c->lock_prefix = 1; break; - case 0x88 ... 0x8b: /* mov */ - goto mov; case 0x8c: /* mov r/m, sreg */ if (c->modrm_reg > VCPU_SREG_GS) { emulate_ud(ctxt); @@ -3271,22 +3277,14 @@ special_insn: if (rc != X86EMUL_CONTINUE) goto done; break; - case 0xa0 ... 0xa3: /* mov */ - case 0xa4 ... 0xa5: /* movs */ - goto mov; case 0xa6 ... 0xa7: /* cmps */ c->dst.type = OP_NONE; /* Disable writeback. */ DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.addr.mem, c->dst.addr.mem); goto cmp; case 0xa8 ... 0xa9: /* test ax, imm */ goto test; - case 0xaa ... 0xab: /* stos */ - case 0xac ... 0xad: /* lods */ - goto mov; case 0xae ... 0xaf: /* scas */ goto cmp; - case 0xb0 ... 0xbf: /* mov r, imm */ - goto mov; case 0xc0 ... 0xc1: emulate_grp2(ctxt); break; @@ -3305,10 +3303,6 @@ special_insn: if (rc != X86EMUL_CONTINUE) goto done; break; - case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */ - mov: - c->dst.val = c->src.val; - break; case 0xcb: /* ret far */ rc = emulate_ret_far(ctxt, ops); if (rc != X86EMUL_CONTINUE) -- cgit v1.2.3-18-g5258 From a4d4a7c1880db98a521bc27c15348185fa30c256 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 3 Aug 2010 15:05:46 +0300 Subject: KVM: x86 emulator: fix group 11 decoding for reg != 0 These are all undefined. 
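Assuming X7() expands to seven copies of its argument (as the other Xn helpers in this table are used), the group11 table added below is equivalent to one MOV entry for /0 followed by seven undefined slots:

    /* Expanded form of the new table, assuming X7(x) == seven copies of x. */
    static struct opcode group11[] = {
            I(DstMem | SrcImm | ModRM | Mov, em_mov),  /* C6,C7 /0: MOV r/m, imm */
            D(Undefined), D(Undefined), D(Undefined),  /* /1 ... /7: #UD */
            D(Undefined), D(Undefined), D(Undefined),
            D(Undefined),
    };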
Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index c0715ae05a5..9940d166154 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2449,6 +2449,10 @@ static struct group_dual group9 = { { N, N, N, N, N, N, N, N, } }; +static struct opcode group11[] = { + I(DstMem | SrcImm | ModRM | Mov, em_mov), X7(D(Undefined)), +}; + static struct opcode opcode_table[256] = { /* 0x00 - 0x07 */ D6ALU(Lock), @@ -2525,7 +2529,7 @@ static struct opcode opcode_table[256] = { I(ImplicitOps | Stack | SrcImmU16, em_ret_near_imm), D(ImplicitOps | Stack), D(DstReg | SrcMemFAddr | ModRM | No64), D(DstReg | SrcMemFAddr | ModRM | No64), - I2bv(DstMem | SrcImm | ModRM | Mov, em_mov), + G(ByteOp, group11), G(0, group11), /* 0xC8 - 0xCF */ N, N, N, D(ImplicitOps | Stack), D(ImplicitOps), D(SrcImmByte), D(ImplicitOps | No64), D(ImplicitOps), -- cgit v1.2.3-18-g5258 From 7d9ddaedd8a9d0442fda5b5a90f22a33becbd235 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 30 Aug 2010 17:12:28 +0300 Subject: KVM: x86 emulator: clean up control flow in x86_emulate_insn() x86_emulate_insn() is full of things like if (rc != X86EMUL_CONTINUE) goto done; break; consolidate all of those at the end of the switch statement. Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 64 ++++++-------------------------------------------- 1 file changed, 7 insertions(+), 57 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 9940d166154..27d2c22b114 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3098,8 +3098,6 @@ special_insn: break; case 0x07: /* pop es */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES); - if (rc != X86EMUL_CONTINUE) - goto done; break; case 0x08 ... 0x0d: or: /* or */ @@ -3117,8 +3115,6 @@ special_insn: break; case 0x17: /* pop ss */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS); - if (rc != X86EMUL_CONTINUE) - goto done; break; case 0x18 ... 0x1d: sbb: /* sbb */ @@ -3129,8 +3125,6 @@ special_insn: break; case 0x1f: /* pop ds */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS); - if (rc != X86EMUL_CONTINUE) - goto done; break; case 0x20 ... 0x25: and: /* and */ @@ -3157,18 +3151,12 @@ special_insn: case 0x58 ... 0x5f: /* pop reg */ pop_instruction: rc = emulate_pop(ctxt, ops, &c->dst.val, c->op_bytes); - if (rc != X86EMUL_CONTINUE) - goto done; break; case 0x60: /* pusha */ rc = emulate_pusha(ctxt, ops); - if (rc != X86EMUL_CONTINUE) - goto done; break; case 0x61: /* popa */ rc = emulate_popa(ctxt, ops); - if (rc != X86EMUL_CONTINUE) - goto done; break; case 0x63: /* movsxd */ if (ctxt->mode != X86EMUL_MODE_PROT64) @@ -3255,8 +3243,6 @@ special_insn: } case 0x8f: /* pop (sole member of Grp1a) */ rc = emulate_grp1a(ctxt, ops); - if (rc != X86EMUL_CONTINUE) - goto done; break; case 0x90 ... 0x97: /* nop / xchg reg, rax */ if (c->dst.addr.reg == &c->regs[VCPU_REGS_RAX]) @@ -3278,8 +3264,6 @@ special_insn: c->dst.addr.reg = &ctxt->eflags; c->dst.bytes = c->op_bytes; rc = emulate_popf(ctxt, ops, &c->dst.val, c->op_bytes); - if (rc != X86EMUL_CONTINUE) - goto done; break; case 0xa6 ... 0xa7: /* cmps */ c->dst.type = OP_NONE; /* Disable writeback. 
*/ @@ -3299,18 +3283,12 @@ special_insn: goto pop_instruction; case 0xc4: /* les */ rc = emulate_load_segment(ctxt, ops, VCPU_SREG_ES); - if (rc != X86EMUL_CONTINUE) - goto done; break; case 0xc5: /* lds */ rc = emulate_load_segment(ctxt, ops, VCPU_SREG_DS); - if (rc != X86EMUL_CONTINUE) - goto done; break; case 0xcb: /* ret far */ rc = emulate_ret_far(ctxt, ops); - if (rc != X86EMUL_CONTINUE) - goto done; break; case 0xcc: /* int3 */ irq = 3; @@ -3319,8 +3297,6 @@ special_insn: irq = c->src.val; do_interrupt: rc = emulate_int(ctxt, ops, irq); - if (rc != X86EMUL_CONTINUE) - goto done; break; case 0xce: /* into */ if (ctxt->eflags & EFLG_OF) { @@ -3330,9 +3306,6 @@ special_insn: break; case 0xcf: /* iret */ rc = emulate_iret(ctxt, ops); - - if (rc != X86EMUL_CONTINUE) - goto done; break; case 0xd0 ... 0xd1: /* Grp2 */ emulate_grp2(ctxt); @@ -3419,8 +3392,6 @@ special_insn: break; case 0xf6 ... 0xf7: /* Grp3 */ rc = emulate_grp3(ctxt, ops); - if (rc != X86EMUL_CONTINUE) - goto done; break; case 0xf8: /* clc */ ctxt->eflags &= ~EFLG_CF; @@ -3453,8 +3424,6 @@ special_insn: case 0xfe: /* Grp4 */ grp45: rc = emulate_grp45(ctxt, ops); - if (rc != X86EMUL_CONTINUE) - goto done; break; case 0xff: /* Grp5 */ if (c->modrm_reg == 5) @@ -3464,6 +3433,9 @@ special_insn: goto cannot_emulate; } + if (rc != X86EMUL_CONTINUE) + goto done; + writeback: rc = writeback(ctxt, ops); if (rc != X86EMUL_CONTINUE) @@ -3545,8 +3517,6 @@ twobyte_insn: switch (c->modrm_rm) { case 1: rc = kvm_fix_hypercall(ctxt->vcpu); - if (rc != X86EMUL_CONTINUE) - goto done; break; default: goto cannot_emulate; @@ -3585,10 +3555,6 @@ twobyte_insn: break; case 0x05: /* syscall */ rc = emulate_syscall(ctxt, ops); - if (rc != X86EMUL_CONTINUE) - goto done; - else - goto writeback; break; case 0x06: emulate_clts(ctxt->vcpu); @@ -3665,17 +3631,9 @@ twobyte_insn: break; case 0x34: /* sysenter */ rc = emulate_sysenter(ctxt, ops); - if (rc != X86EMUL_CONTINUE) - goto done; - else - goto writeback; break; case 0x35: /* sysexit */ rc = emulate_sysexit(ctxt, ops); - if (rc != X86EMUL_CONTINUE) - goto done; - else - goto writeback; break; case 0x40 ... 0x4f: /* cmov */ c->dst.val = c->dst.orig_val = c->src.val; @@ -3694,8 +3652,6 @@ twobyte_insn: break; case 0xa1: /* pop fs */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS); - if (rc != X86EMUL_CONTINUE) - goto done; break; case 0xa3: bt: /* bt */ @@ -3713,8 +3669,6 @@ twobyte_insn: break; case 0xa9: /* pop gs */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS); - if (rc != X86EMUL_CONTINUE) - goto done; break; case 0xab: bts: /* bts */ @@ -3745,8 +3699,6 @@ twobyte_insn: break; case 0xb2: /* lss */ rc = emulate_load_segment(ctxt, ops, VCPU_SREG_SS); - if (rc != X86EMUL_CONTINUE) - goto done; break; case 0xb3: btr: /* btr */ @@ -3754,13 +3706,9 @@ twobyte_insn: break; case 0xb4: /* lfs */ rc = emulate_load_segment(ctxt, ops, VCPU_SREG_FS); - if (rc != X86EMUL_CONTINUE) - goto done; break; case 0xb5: /* lgs */ rc = emulate_load_segment(ctxt, ops, VCPU_SREG_GS); - if (rc != X86EMUL_CONTINUE) - goto done; break; case 0xb6 ... 
0xb7: /* movzx */ c->dst.bytes = c->op_bytes; @@ -3825,12 +3773,14 @@ twobyte_insn: break; case 0xc7: /* Grp9 (cmpxchg8b) */ rc = emulate_grp9(ctxt, ops); - if (rc != X86EMUL_CONTINUE) - goto done; break; default: goto cannot_emulate; } + + if (rc != X86EMUL_CONTINUE) + goto done; + goto writeback; cannot_emulate: -- cgit v1.2.3-18-g5258 From 9ed049c3b6230b68985da31f8243d4bec95e0b3a Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 30 Aug 2010 12:18:24 +0300 Subject: KVM: i8259: Make ICW1 conform to spec ICW is not a full reset, instead it resets a limited number of registers in the PIC. Change ICW1 emulation to only reset those registers. Signed-off-by: Avi Kivity --- arch/x86/kvm/i8259.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 4b7b73ce209..6e77471951e 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -308,13 +308,17 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val) addr &= 1; if (addr == 0) { if (val & 0x10) { - kvm_pic_reset(s); /* init */ - /* - * deassert a pending interrupt - */ - pic_irq_request(s->pics_state->kvm, 0); - s->init_state = 1; s->init4 = val & 1; + s->last_irr = 0; + s->imr = 0; + s->priority_add = 0; + s->special_mask = 0; + s->read_reg_select = 0; + if (!s->init4) { + s->special_fully_nested_mode = 0; + s->auto_eoi = 0; + } + s->init_state = 1; if (val & 0x02) printk(KERN_ERR "single mode not supported"); if (val & 0x08) -- cgit v1.2.3-18-g5258 From 84e0cefa8ddd5d5018d3b582e1e90585ed551757 Mon Sep 17 00:00:00 2001 From: Jes Sorensen Date: Wed, 1 Sep 2010 11:42:04 +0200 Subject: KVM: Fix guest kernel crash on MSR_K7_CLK_CTL MSR_K7_CLK_CTL is a no longer documented MSR, which is only relevant on said old AMD CPU models. This change returns the expected value, which the Linux kernel is expecting to avoid writing back the MSR, plus it ignores all writes to the MSR. Signed-off-by: Jes Sorensen Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1c972382e5d..f47db2588a4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1449,6 +1449,16 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " "0x%x data 0x%llx\n", msr, data); break; + case MSR_K7_CLK_CTL: + /* + * Ignore all writes to this no longer documented MSR. + * Writes are only relevant for old K7 processors, + * all pre-dating SVM, but a recommended workaround from + * AMD for these chips. It is possible to speicify the + * affected processor models on the command line, hence + * the need to ignore the workaround. + */ + break; case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: if (kvm_hv_msr_partition_wide(msr)) { int r; @@ -1674,6 +1684,18 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_IA32_MCG_STATUS: case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: return get_msr_mce(vcpu, msr, pdata); + case MSR_K7_CLK_CTL: + /* + * Provide expected ramp-up count for K7. All other + * are set to zero, indicating minimum divisors for + * every field. + * + * This prevents guest kernels on AMD host with CPU + * type 6, model 8 and higher from exploding due to + * the rdmsr failing. + */ + data = 0x20000000; + break; case HV_X64_MSR_GUEST_OS_ID ... 
HV_X64_MSR_SINT15: if (kvm_hv_msr_partition_wide(msr)) { int r; -- cgit v1.2.3-18-g5258 From 8b1fe17cc7a8b2c62b400dcbfaebd96da6b4f58e Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 30 Aug 2010 18:22:53 +0800 Subject: KVM: MMU: support disable/enable mmu audit dynamicly Add a r/w module parameter named 'mmu_audit', it can control audit enable/disable: enable: echo 1 > /sys/module/kvm/parameters/mmu_audit disable: echo 0 > /sys/module/kvm/parameters/mmu_audit This patch not change the logic Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/Kconfig | 7 ++++ arch/x86/kvm/mmu.c | 91 +++++++++++++++++++++++++++++++++++++--------- arch/x86/kvm/mmutrace.h | 19 ++++++++++ arch/x86/kvm/paging_tmpl.h | 4 +- 4 files changed, 101 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 970bbd47951..ddc131ff438 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -64,6 +64,13 @@ config KVM_AMD To compile this as a module, choose M here: the module will be called kvm-amd. +config KVM_MMU_AUDIT + bool "Audit KVM MMU" + depends on KVM && TRACEPOINTS + ---help--- + This option adds a R/W kVM module parameter 'mmu_audit', which allows + audit KVM MMU at runtime. + # OK, it's a little counter-intuitive to do this, but it puts it neatly under # the virtualization menu. source drivers/vhost/Kconfig diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 0bff4d54817..8b750ff6911 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -49,15 +49,21 @@ */ bool tdp_enabled = false; -#undef MMU_DEBUG +enum { + AUDIT_PRE_PAGE_FAULT, + AUDIT_POST_PAGE_FAULT, + AUDIT_PRE_PTE_WRITE, + AUDIT_POST_PTE_WRITE +}; -#undef AUDIT +char *audit_point_name[] = { + "pre page fault", + "post page fault", + "pre pte write", + "post pte write" +}; -#ifdef AUDIT -static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg); -#else -static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {} -#endif +#undef MMU_DEBUG #ifdef MMU_DEBUG @@ -71,7 +77,7 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {} #endif -#if defined(MMU_DEBUG) || defined(AUDIT) +#ifdef MMU_DEBUG static int dbg = 0; module_param(dbg, bool, 0644); #endif @@ -2964,7 +2970,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, kvm_mmu_access_page(vcpu, gfn); kvm_mmu_free_some_pages(vcpu); ++vcpu->kvm->stat.mmu_pte_write; - kvm_mmu_audit(vcpu, "pre pte write"); + trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE); if (guest_initiated) { if (gfn == vcpu->arch.last_pt_write_gfn && !last_updated_pte_accessed(vcpu)) { @@ -3037,7 +3043,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, } mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush); kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); - kvm_mmu_audit(vcpu, "post pte write"); + trace_kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE); spin_unlock(&vcpu->kvm->mmu_lock); if (!is_error_pfn(vcpu->arch.update_pte.pfn)) { kvm_release_pfn_clean(vcpu->arch.update_pte.pfn); @@ -3483,8 +3489,7 @@ int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]) } EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy); -#ifdef AUDIT - +#ifdef CONFIG_KVM_MMU_AUDIT static const char *audit_msg; typedef void (*inspect_spte_fn) (struct kvm *kvm, u64 *sptep); @@ -3699,18 +3704,68 @@ static void audit_write_protection(struct kvm_vcpu *vcpu) } } -static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) +static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, 
int audit_point) { - int olddbg = dbg; - - dbg = 0; - audit_msg = msg; + audit_msg = audit_point_name[audit_point]; audit_rmap(vcpu); audit_write_protection(vcpu); if (strcmp("pre pte write", audit_msg) != 0) audit_mappings(vcpu); audit_sptes_have_rmaps(vcpu); - dbg = olddbg; } +static bool mmu_audit; + +static void mmu_audit_enable(void) +{ + int ret; + + if (mmu_audit) + return; + + ret = register_trace_kvm_mmu_audit(kvm_mmu_audit, NULL); + WARN_ON(ret); + + mmu_audit = true; +} + +static void mmu_audit_disable(void) +{ + if (!mmu_audit) + return; + + unregister_trace_kvm_mmu_audit(kvm_mmu_audit, NULL); + tracepoint_synchronize_unregister(); + mmu_audit = false; +} + +static int mmu_audit_set(const char *val, const struct kernel_param *kp) +{ + int ret; + unsigned long enable; + + ret = strict_strtoul(val, 10, &enable); + if (ret < 0) + return -EINVAL; + + switch (enable) { + case 0: + mmu_audit_disable(); + break; + case 1: + mmu_audit_enable(); + break; + default: + return -EINVAL; + } + + return 0; +} + +static struct kernel_param_ops audit_param_ops = { + .set = mmu_audit_set, + .get = param_get_bool, +}; + +module_param_cb(mmu_audit, &audit_param_ops, &mmu_audit, 0644); #endif diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h index 3aab0f0930e..b60b4fdb3ed 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmutrace.h @@ -195,6 +195,25 @@ DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page, TP_ARGS(sp) ); + +TRACE_EVENT( + kvm_mmu_audit, + TP_PROTO(struct kvm_vcpu *vcpu, int audit_point), + TP_ARGS(vcpu, audit_point), + + TP_STRUCT__entry( + __field(struct kvm_vcpu *, vcpu) + __field(int, audit_point) + ), + + TP_fast_assign( + __entry->vcpu = vcpu; + __entry->audit_point = audit_point; + ), + + TP_printk("vcpu:%d %s", __entry->vcpu->cpu, + audit_point_name[__entry->audit_point]) +); #endif /* _TRACE_KVMMMU_H */ #undef TRACE_INCLUDE_PATH diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index a0f2febf569..debe7703536 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -542,7 +542,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, if (mmu_notifier_retry(vcpu, mmu_seq)) goto out_unlock; - kvm_mmu_audit(vcpu, "pre page fault"); + trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); kvm_mmu_free_some_pages(vcpu); sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, level, &write_pt, pfn); @@ -554,7 +554,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ ++vcpu->stat.pf_fixed; - kvm_mmu_audit(vcpu, "post page fault (fixed)"); + trace_kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); spin_unlock(&vcpu->kvm->mmu_lock); return write_pt; -- cgit v1.2.3-18-g5258 From 2f4f337248cd5660040b7e09b7287a7a0a861f3f Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 30 Aug 2010 18:24:10 +0800 Subject: KVM: MMU: move audit to a separate file Move the audit code from arch/x86/kvm/mmu.c to arch/x86/kvm/mmu_audit.c Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 279 +------------------------------------------- arch/x86/kvm/mmu_audit.c | 297 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 298 insertions(+), 278 deletions(-) create mode 100644 arch/x86/kvm/mmu_audit.c (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 8b750ff6911..d2dad65a45f 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3490,282 +3490,5 @@ int 
kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]) EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy); #ifdef CONFIG_KVM_MMU_AUDIT -static const char *audit_msg; - -typedef void (*inspect_spte_fn) (struct kvm *kvm, u64 *sptep); - -static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp, - inspect_spte_fn fn) -{ - int i; - - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { - u64 ent = sp->spt[i]; - - if (is_shadow_present_pte(ent)) { - if (!is_last_spte(ent, sp->role.level)) { - struct kvm_mmu_page *child; - child = page_header(ent & PT64_BASE_ADDR_MASK); - __mmu_spte_walk(kvm, child, fn); - } else - fn(kvm, &sp->spt[i]); - } - } -} - -static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn) -{ - int i; - struct kvm_mmu_page *sp; - - if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) - return; - if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { - hpa_t root = vcpu->arch.mmu.root_hpa; - sp = page_header(root); - __mmu_spte_walk(vcpu->kvm, sp, fn); - return; - } - for (i = 0; i < 4; ++i) { - hpa_t root = vcpu->arch.mmu.pae_root[i]; - - if (root && VALID_PAGE(root)) { - root &= PT64_BASE_ADDR_MASK; - sp = page_header(root); - __mmu_spte_walk(vcpu->kvm, sp, fn); - } - } - return; -} - -static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte, - gva_t va, int level) -{ - u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK); - int i; - gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1)); - - for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) { - u64 *sptep = pt + i; - struct kvm_mmu_page *sp; - gfn_t gfn; - pfn_t pfn; - hpa_t hpa; - - sp = page_header(__pa(sptep)); - - if (sp->unsync) { - if (level != PT_PAGE_TABLE_LEVEL) { - printk(KERN_ERR "audit: (%s) error: unsync sp: %p level = %d\n", - audit_msg, sp, level); - return; - } - - if (*sptep == shadow_notrap_nonpresent_pte) { - printk(KERN_ERR "audit: (%s) error: notrap spte in unsync sp: %p\n", - audit_msg, sp); - return; - } - } - - if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) { - printk(KERN_ERR "audit: (%s) error: notrap spte in direct sp: %p\n", - audit_msg, sp); - return; - } - - if (!is_shadow_present_pte(*sptep) || - !is_last_spte(*sptep, level)) - return; - - gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); - pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn); - - if (is_error_pfn(pfn)) { - kvm_release_pfn_clean(pfn); - return; - } - - hpa = pfn << PAGE_SHIFT; - - if ((*sptep & PT64_BASE_ADDR_MASK) != hpa) - printk(KERN_ERR "xx audit error: (%s) levels %d" - " gva %lx pfn %llx hpa %llx ent %llxn", - audit_msg, vcpu->arch.mmu.root_level, - va, pfn, hpa, *sptep); - } -} - -static void audit_mappings(struct kvm_vcpu *vcpu) -{ - unsigned i; - - if (vcpu->arch.mmu.root_level == 4) - audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4); - else - for (i = 0; i < 4; ++i) - if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK) - audit_mappings_page(vcpu, - vcpu->arch.mmu.pae_root[i], - i << 30, - 2); -} - -void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) -{ - unsigned long *rmapp; - struct kvm_mmu_page *rev_sp; - gfn_t gfn; - - - rev_sp = page_header(__pa(sptep)); - gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt); - - if (!gfn_to_memslot(kvm, gfn)) { - if (!printk_ratelimit()) - return; - printk(KERN_ERR "%s: no memslot for gfn %llx\n", - audit_msg, gfn); - printk(KERN_ERR "%s: index %ld of sp (gfn=%llx)\n", - audit_msg, (long int)(sptep - rev_sp->spt), - rev_sp->gfn); - dump_stack(); - return; - } - - rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level); - if (!*rmapp) { - if 
(!printk_ratelimit()) - return; - printk(KERN_ERR "%s: no rmap for writable spte %llx\n", - audit_msg, *sptep); - dump_stack(); - } -} - -void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu) -{ - mmu_spte_walk(vcpu, inspect_spte_has_rmap); -} - -static void check_mappings_rmap(struct kvm_vcpu *vcpu) -{ - struct kvm_mmu_page *sp; - int i; - - list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) { - u64 *pt = sp->spt; - - if (sp->role.level != PT_PAGE_TABLE_LEVEL) - continue; - - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { - if (!is_rmap_spte(pt[i])) - continue; - - inspect_spte_has_rmap(vcpu->kvm, &pt[i]); - } - } - return; -} - -static void audit_rmap(struct kvm_vcpu *vcpu) -{ - check_mappings_rmap(vcpu); -} - -static void audit_write_protection(struct kvm_vcpu *vcpu) -{ - struct kvm_mmu_page *sp; - struct kvm_memory_slot *slot; - unsigned long *rmapp; - u64 *spte; - - list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) { - if (sp->role.direct) - continue; - if (sp->unsync) - continue; - if (sp->role.invalid) - continue; - - slot = gfn_to_memslot(vcpu->kvm, sp->gfn); - rmapp = &slot->rmap[sp->gfn - slot->base_gfn]; - - spte = rmap_next(vcpu->kvm, rmapp, NULL); - while (spte) { - if (is_writable_pte(*spte)) - printk(KERN_ERR "%s: (%s) shadow page has " - "writable mappings: gfn %llx role %x\n", - __func__, audit_msg, sp->gfn, - sp->role.word); - spte = rmap_next(vcpu->kvm, rmapp, spte); - } - } -} - -static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int audit_point) -{ - audit_msg = audit_point_name[audit_point]; - audit_rmap(vcpu); - audit_write_protection(vcpu); - if (strcmp("pre pte write", audit_msg) != 0) - audit_mappings(vcpu); - audit_sptes_have_rmaps(vcpu); -} - -static bool mmu_audit; - -static void mmu_audit_enable(void) -{ - int ret; - - if (mmu_audit) - return; - - ret = register_trace_kvm_mmu_audit(kvm_mmu_audit, NULL); - WARN_ON(ret); - - mmu_audit = true; -} - -static void mmu_audit_disable(void) -{ - if (!mmu_audit) - return; - - unregister_trace_kvm_mmu_audit(kvm_mmu_audit, NULL); - tracepoint_synchronize_unregister(); - mmu_audit = false; -} - -static int mmu_audit_set(const char *val, const struct kernel_param *kp) -{ - int ret; - unsigned long enable; - - ret = strict_strtoul(val, 10, &enable); - if (ret < 0) - return -EINVAL; - - switch (enable) { - case 0: - mmu_audit_disable(); - break; - case 1: - mmu_audit_enable(); - break; - default: - return -EINVAL; - } - - return 0; -} - -static struct kernel_param_ops audit_param_ops = { - .set = mmu_audit_set, - .get = param_get_bool, -}; - -module_param_cb(mmu_audit, &audit_param_ops, &mmu_audit, 0644); +#include "mmu_audit.c" #endif diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c new file mode 100644 index 00000000000..fb8a461333c --- /dev/null +++ b/arch/x86/kvm/mmu_audit.c @@ -0,0 +1,297 @@ +/* + * mmu_audit.c: + * + * Audit code for KVM MMU + * + * Copyright (C) 2006 Qumranet, Inc. + * Copyright 2010 Red Hat, Inc. and/or its affilates. + * + * Authors: + * Yaniv Kamay + * Avi Kivity + * Marcelo Tosatti + * Xiao Guangrong + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. 
+ * + */ + +static const char *audit_msg; + +typedef void (*inspect_spte_fn) (struct kvm *kvm, u64 *sptep); + +static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp, + inspect_spte_fn fn) +{ + int i; + + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { + u64 ent = sp->spt[i]; + + if (is_shadow_present_pte(ent)) { + if (!is_last_spte(ent, sp->role.level)) { + struct kvm_mmu_page *child; + child = page_header(ent & PT64_BASE_ADDR_MASK); + __mmu_spte_walk(kvm, child, fn); + } else + fn(kvm, &sp->spt[i]); + } + } +} + +static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn) +{ + int i; + struct kvm_mmu_page *sp; + + if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) + return; + if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { + hpa_t root = vcpu->arch.mmu.root_hpa; + sp = page_header(root); + __mmu_spte_walk(vcpu->kvm, sp, fn); + return; + } + for (i = 0; i < 4; ++i) { + hpa_t root = vcpu->arch.mmu.pae_root[i]; + + if (root && VALID_PAGE(root)) { + root &= PT64_BASE_ADDR_MASK; + sp = page_header(root); + __mmu_spte_walk(vcpu->kvm, sp, fn); + } + } + return; +} + +static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte, + gva_t va, int level) +{ + u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK); + int i; + gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1)); + + for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) { + u64 *sptep = pt + i; + struct kvm_mmu_page *sp; + gfn_t gfn; + pfn_t pfn; + hpa_t hpa; + + sp = page_header(__pa(sptep)); + + if (sp->unsync) { + if (level != PT_PAGE_TABLE_LEVEL) { + printk(KERN_ERR "audit: (%s) error: unsync sp: %p level = %d\n", + audit_msg, sp, level); + return; + } + + if (*sptep == shadow_notrap_nonpresent_pte) { + printk(KERN_ERR "audit: (%s) error: notrap spte in unsync sp: %p\n", + audit_msg, sp); + return; + } + } + + if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) { + printk(KERN_ERR "audit: (%s) error: notrap spte in direct sp: %p\n", + audit_msg, sp); + return; + } + + if (!is_shadow_present_pte(*sptep) || + !is_last_spte(*sptep, level)) + return; + + gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); + pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn); + + if (is_error_pfn(pfn)) { + kvm_release_pfn_clean(pfn); + return; + } + + hpa = pfn << PAGE_SHIFT; + + if ((*sptep & PT64_BASE_ADDR_MASK) != hpa) + printk(KERN_ERR "xx audit error: (%s) levels %d" + " gva %lx pfn %llx hpa %llx ent %llxn", + audit_msg, vcpu->arch.mmu.root_level, + va, pfn, hpa, *sptep); + } +} + +static void audit_mappings(struct kvm_vcpu *vcpu) +{ + unsigned i; + + if (vcpu->arch.mmu.root_level == 4) + audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4); + else + for (i = 0; i < 4; ++i) + if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK) + audit_mappings_page(vcpu, + vcpu->arch.mmu.pae_root[i], + i << 30, + 2); +} + +void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) +{ + unsigned long *rmapp; + struct kvm_mmu_page *rev_sp; + gfn_t gfn; + + + rev_sp = page_header(__pa(sptep)); + gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt); + + if (!gfn_to_memslot(kvm, gfn)) { + if (!printk_ratelimit()) + return; + printk(KERN_ERR "%s: no memslot for gfn %llx\n", + audit_msg, gfn); + printk(KERN_ERR "%s: index %ld of sp (gfn=%llx)\n", + audit_msg, (long int)(sptep - rev_sp->spt), + rev_sp->gfn); + dump_stack(); + return; + } + + rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level); + if (!*rmapp) { + if (!printk_ratelimit()) + return; + printk(KERN_ERR "%s: no rmap for writable spte %llx\n", + audit_msg, *sptep); + dump_stack(); + } +} + 
+void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu) +{ + mmu_spte_walk(vcpu, inspect_spte_has_rmap); +} + +static void check_mappings_rmap(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu_page *sp; + int i; + + list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) { + u64 *pt = sp->spt; + + if (sp->role.level != PT_PAGE_TABLE_LEVEL) + continue; + + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { + if (!is_rmap_spte(pt[i])) + continue; + + inspect_spte_has_rmap(vcpu->kvm, &pt[i]); + } + } + return; +} + +static void audit_rmap(struct kvm_vcpu *vcpu) +{ + check_mappings_rmap(vcpu); +} + +static void audit_write_protection(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu_page *sp; + struct kvm_memory_slot *slot; + unsigned long *rmapp; + u64 *spte; + + list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) { + if (sp->role.direct) + continue; + if (sp->unsync) + continue; + if (sp->role.invalid) + continue; + + slot = gfn_to_memslot(vcpu->kvm, sp->gfn); + rmapp = &slot->rmap[sp->gfn - slot->base_gfn]; + + spte = rmap_next(vcpu->kvm, rmapp, NULL); + while (spte) { + if (is_writable_pte(*spte)) + printk(KERN_ERR "%s: (%s) shadow page has " + "writable mappings: gfn %llx role %x\n", + __func__, audit_msg, sp->gfn, + sp->role.word); + spte = rmap_next(vcpu->kvm, rmapp, spte); + } + } +} + +static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int audit_point) +{ + audit_msg = audit_point_name[audit_point]; + audit_rmap(vcpu); + audit_write_protection(vcpu); + if (strcmp("pre pte write", audit_msg) != 0) + audit_mappings(vcpu); + audit_sptes_have_rmaps(vcpu); +} + +static bool mmu_audit; + +static void mmu_audit_enable(void) +{ + int ret; + + if (mmu_audit) + return; + + ret = register_trace_kvm_mmu_audit(kvm_mmu_audit, NULL); + WARN_ON(ret); + + mmu_audit = true; +} + +static void mmu_audit_disable(void) +{ + if (!mmu_audit) + return; + + unregister_trace_kvm_mmu_audit(kvm_mmu_audit, NULL); + tracepoint_synchronize_unregister(); + mmu_audit = false; +} + +static int mmu_audit_set(const char *val, const struct kernel_param *kp) +{ + int ret; + unsigned long enable; + + ret = strict_strtoul(val, 10, &enable); + if (ret < 0) + return -EINVAL; + + switch (enable) { + case 0: + mmu_audit_disable(); + break; + case 1: + mmu_audit_enable(); + break; + default: + return -EINVAL; + } + + return 0; +} + +static struct kernel_param_ops audit_param_ops = { + .set = mmu_audit_set, + .get = param_get_bool, +}; + +module_param_cb(mmu_audit, &audit_param_ops, &mmu_audit, 0644); -- cgit v1.2.3-18-g5258 From 49edf87806f52a005152beaed9f4731862efc8fe Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 30 Aug 2010 18:25:03 +0800 Subject: KVM: MMU: improve active sp audit Both audit_rmap() and audit_write_protection() need to walk all active sp, so we can do these checking in a sp walking Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu_audit.c | 74 +++++++++++++++++++++++++----------------------- 1 file changed, 38 insertions(+), 36 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c index fb8a461333c..8becb86cd34 100644 --- a/arch/x86/kvm/mmu_audit.c +++ b/arch/x86/kvm/mmu_audit.c @@ -65,6 +65,16 @@ static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn) return; } +typedef void (*sp_handler) (struct kvm *kvm, struct kvm_mmu_page *sp); + +static void walk_all_active_sps(struct kvm *kvm, sp_handler fn) +{ + struct kvm_mmu_page *sp; + + list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) + fn(kvm, sp); +} 
+ static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte, gva_t va, int level) { @@ -175,67 +185,59 @@ void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu) mmu_spte_walk(vcpu, inspect_spte_has_rmap); } -static void check_mappings_rmap(struct kvm_vcpu *vcpu) +static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp) { - struct kvm_mmu_page *sp; int i; - list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) { - u64 *pt = sp->spt; + if (sp->role.level != PT_PAGE_TABLE_LEVEL) + return; - if (sp->role.level != PT_PAGE_TABLE_LEVEL) + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { + if (!is_rmap_spte(sp->spt[i])) continue; - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { - if (!is_rmap_spte(pt[i])) - continue; - - inspect_spte_has_rmap(vcpu->kvm, &pt[i]); - } + inspect_spte_has_rmap(kvm, sp->spt + i); } - return; } -static void audit_rmap(struct kvm_vcpu *vcpu) +void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp) { - check_mappings_rmap(vcpu); -} - -static void audit_write_protection(struct kvm_vcpu *vcpu) -{ - struct kvm_mmu_page *sp; struct kvm_memory_slot *slot; unsigned long *rmapp; u64 *spte; - list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) { - if (sp->role.direct) - continue; - if (sp->unsync) - continue; - if (sp->role.invalid) - continue; + if (sp->role.direct || sp->unsync || sp->role.invalid) + return; - slot = gfn_to_memslot(vcpu->kvm, sp->gfn); - rmapp = &slot->rmap[sp->gfn - slot->base_gfn]; + slot = gfn_to_memslot(kvm, sp->gfn); + rmapp = &slot->rmap[sp->gfn - slot->base_gfn]; - spte = rmap_next(vcpu->kvm, rmapp, NULL); - while (spte) { - if (is_writable_pte(*spte)) - printk(KERN_ERR "%s: (%s) shadow page has " + spte = rmap_next(kvm, rmapp, NULL); + while (spte) { + if (is_writable_pte(*spte)) + printk(KERN_ERR "%s: (%s) shadow page has " "writable mappings: gfn %llx role %x\n", __func__, audit_msg, sp->gfn, sp->role.word); - spte = rmap_next(vcpu->kvm, rmapp, spte); - } + spte = rmap_next(kvm, rmapp, spte); } } +static void audit_sp(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + check_mappings_rmap(kvm, sp); + audit_write_protection(kvm, sp); +} + +static void audit_all_active_sps(struct kvm *kvm) +{ + walk_all_active_sps(kvm, audit_sp); +} + static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int audit_point) { audit_msg = audit_point_name[audit_point]; - audit_rmap(vcpu); - audit_write_protection(vcpu); + audit_all_active_sps(vcpu->kvm); if (strcmp("pre pte write", audit_msg) != 0) audit_mappings(vcpu); audit_sptes_have_rmaps(vcpu); -- cgit v1.2.3-18-g5258 From eb2591865a234c6fb1162085d9b277236fa890b6 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 30 Aug 2010 18:25:51 +0800 Subject: KVM: MMU: improve spte audit Both audit_mappings() and audit_sptes_have_rmaps() need to walk vcpu's page table, so we can do these checking in a spte walking Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu_audit.c | 148 ++++++++++++++++++++++------------------------- 1 file changed, 69 insertions(+), 79 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c index 8becb86cd34..3bde186409b 100644 --- a/arch/x86/kvm/mmu_audit.c +++ b/arch/x86/kvm/mmu_audit.c @@ -19,23 +19,24 @@ static const char *audit_msg; -typedef void (*inspect_spte_fn) (struct kvm *kvm, u64 *sptep); +typedef void (*inspect_spte_fn) (struct kvm_vcpu *vcpu, u64 *sptep, int level); -static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp, - inspect_spte_fn fn) 
+static void __mmu_spte_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, + inspect_spte_fn fn, int level) { int i; for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { - u64 ent = sp->spt[i]; - - if (is_shadow_present_pte(ent)) { - if (!is_last_spte(ent, sp->role.level)) { - struct kvm_mmu_page *child; - child = page_header(ent & PT64_BASE_ADDR_MASK); - __mmu_spte_walk(kvm, child, fn); - } else - fn(kvm, &sp->spt[i]); + u64 *ent = sp->spt; + + fn(vcpu, ent + i, level); + + if (is_shadow_present_pte(ent[i]) && + !is_last_spte(ent[i], level)) { + struct kvm_mmu_page *child; + + child = page_header(ent[i] & PT64_BASE_ADDR_MASK); + __mmu_spte_walk(vcpu, child, fn, level - 1); } } } @@ -47,21 +48,25 @@ static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn) if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) return; + if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { hpa_t root = vcpu->arch.mmu.root_hpa; + sp = page_header(root); - __mmu_spte_walk(vcpu->kvm, sp, fn); + __mmu_spte_walk(vcpu, sp, fn, PT64_ROOT_LEVEL); return; } + for (i = 0; i < 4; ++i) { hpa_t root = vcpu->arch.mmu.pae_root[i]; if (root && VALID_PAGE(root)) { root &= PT64_BASE_ADDR_MASK; sp = page_header(root); - __mmu_spte_walk(vcpu->kvm, sp, fn); + __mmu_spte_walk(vcpu, sp, fn, 2); } } + return; } @@ -75,80 +80,55 @@ static void walk_all_active_sps(struct kvm *kvm, sp_handler fn) fn(kvm, sp); } -static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte, - gva_t va, int level) +static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level) { - u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK); - int i; - gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1)); - - for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) { - u64 *sptep = pt + i; - struct kvm_mmu_page *sp; - gfn_t gfn; - pfn_t pfn; - hpa_t hpa; - - sp = page_header(__pa(sptep)); - - if (sp->unsync) { - if (level != PT_PAGE_TABLE_LEVEL) { - printk(KERN_ERR "audit: (%s) error: unsync sp: %p level = %d\n", - audit_msg, sp, level); - return; - } - - if (*sptep == shadow_notrap_nonpresent_pte) { - printk(KERN_ERR "audit: (%s) error: notrap spte in unsync sp: %p\n", - audit_msg, sp); - return; - } - } + struct kvm_mmu_page *sp; + gfn_t gfn; + pfn_t pfn; + hpa_t hpa; - if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) { - printk(KERN_ERR "audit: (%s) error: notrap spte in direct sp: %p\n", - audit_msg, sp); + sp = page_header(__pa(sptep)); + + if (sp->unsync) { + if (level != PT_PAGE_TABLE_LEVEL) { + printk(KERN_ERR "audit: (%s) error: unsync sp: %p level = %d\n", + audit_msg, sp, level); return; } - if (!is_shadow_present_pte(*sptep) || - !is_last_spte(*sptep, level)) + if (*sptep == shadow_notrap_nonpresent_pte) { + printk(KERN_ERR "audit: (%s) error: notrap spte in unsync sp: %p\n", + audit_msg, sp); return; + } + } - gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); - pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn); + if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) { + printk(KERN_ERR "audit: (%s) error: notrap spte in direct sp: %p\n", + audit_msg, sp); + return; + } - if (is_error_pfn(pfn)) { - kvm_release_pfn_clean(pfn); - return; - } + if (!is_shadow_present_pte(*sptep) || !is_last_spte(*sptep, level)) + return; - hpa = pfn << PAGE_SHIFT; + gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); + pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn); - if ((*sptep & PT64_BASE_ADDR_MASK) != hpa) - printk(KERN_ERR "xx audit error: (%s) levels %d" - " gva %lx pfn %llx hpa %llx ent %llxn", - audit_msg, vcpu->arch.mmu.root_level, - va, pfn, hpa, 
*sptep); + if (is_error_pfn(pfn)) { + kvm_release_pfn_clean(pfn); + return; } -} -static void audit_mappings(struct kvm_vcpu *vcpu) -{ - unsigned i; - - if (vcpu->arch.mmu.root_level == 4) - audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4); - else - for (i = 0; i < 4; ++i) - if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK) - audit_mappings_page(vcpu, - vcpu->arch.mmu.pae_root[i], - i << 30, - 2); + hpa = pfn << PAGE_SHIFT; + if ((*sptep & PT64_BASE_ADDR_MASK) != hpa) + printk(KERN_ERR "xx audit error: (%s) levels %d" + "pfn %llx hpa %llx ent %llxn", + audit_msg, vcpu->arch.mmu.root_level, + pfn, hpa, *sptep); } -void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) +static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) { unsigned long *rmapp; struct kvm_mmu_page *rev_sp; @@ -180,9 +160,10 @@ void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) } } -void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu) +static void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu, u64 *sptep, int level) { - mmu_spte_walk(vcpu, inspect_spte_has_rmap); + if (is_shadow_present_pte(*sptep) && is_last_spte(*sptep, level)) + inspect_spte_has_rmap(vcpu->kvm, sptep); } static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp) @@ -234,13 +215,22 @@ static void audit_all_active_sps(struct kvm *kvm) walk_all_active_sps(kvm, audit_sp); } +static void audit_spte(struct kvm_vcpu *vcpu, u64 *sptep, int level) +{ + audit_sptes_have_rmaps(vcpu, sptep, level); + audit_mappings(vcpu, sptep, level); +} + +static void audit_vcpu_spte(struct kvm_vcpu *vcpu) +{ + mmu_spte_walk(vcpu, audit_spte); +} + static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int audit_point) { audit_msg = audit_point_name[audit_point]; audit_all_active_sps(vcpu->kvm); - if (strcmp("pre pte write", audit_msg) != 0) - audit_mappings(vcpu); - audit_sptes_have_rmaps(vcpu); + audit_vcpu_spte(vcpu); } static bool mmu_audit; -- cgit v1.2.3-18-g5258 From 30644b902c5eef5328d37a2e15f1921aaca2588b Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 30 Aug 2010 18:26:33 +0800 Subject: KVM: MMU: lower the aduit frequency The audit is very high overhead, so we need lower the frequency to assure the guest is running. Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu_audit.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c index 3bde186409b..bd2b1be7066 100644 --- a/arch/x86/kvm/mmu_audit.c +++ b/arch/x86/kvm/mmu_audit.c @@ -17,6 +17,8 @@ * */ +#include + static const char *audit_msg; typedef void (*inspect_spte_fn) (struct kvm_vcpu *vcpu, u64 *sptep, int level); @@ -228,6 +230,11 @@ static void audit_vcpu_spte(struct kvm_vcpu *vcpu) static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int audit_point) { + static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10); + + if (!__ratelimit(&ratelimit_state)) + return; + audit_msg = audit_point_name[audit_point]; audit_all_active_sps(vcpu->kvm); audit_vcpu_spte(vcpu); -- cgit v1.2.3-18-g5258 From f87f928882d080eaec8b0d76aecff003d664697d Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 2 Sep 2010 17:29:45 +0200 Subject: KVM: MMU: Fix 32 bit legacy paging with NPT This patch fixes 32 bit legacy paging with NPT enabled. The mmu_check_root call on the top-level of the loop causes root_gfn to take values (in the tdp_enabled path) which are outside of guest memory. 
So the mmu_check_root call fails at some point in the loop iteration causing the guest to triple-fault. This patch changes the mmu_check_root calls to the places where they are really necessary. As a side-effect it introduces a check for the root of a pae page table too. Signed-off-by: Joerg Roedel Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index d2dad65a45f..b2136f921d7 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2387,6 +2387,10 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu) return 0; } direct = !is_paging(vcpu); + + if (mmu_check_root(vcpu, root_gfn)) + return 1; + for (i = 0; i < 4; ++i) { hpa_t root = vcpu->arch.mmu.pae_root[i]; @@ -2398,10 +2402,10 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu) continue; } root_gfn = pdptr >> PAGE_SHIFT; + if (mmu_check_root(vcpu, root_gfn)) + return 1; } else if (vcpu->arch.mmu.root_level == 0) root_gfn = 0; - if (mmu_check_root(vcpu, root_gfn)) - return 1; if (tdp_enabled) { direct = 1; root_gfn = i << 30; -- cgit v1.2.3-18-g5258 From cda0008299a06f0d7218c6037c3c02d7a865e954 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 2 Sep 2010 17:29:46 +0200 Subject: KVM: SVM: Restore correct registers after sel_cr0 intercept emulation This patch implements restoring of the correct rip, rsp, and rax after the svm emulation in KVM injected a selective_cr0 write intercept into the guest hypervisor. The problem was that the vmexit is emulated in the instruction emulator, which later commits the registers right after the write-cr0 instruction. So the l1 guest will continue to run with the l2 rip, rsp and rax resulting in unpredictable behavior. This patch is not the final word, it is just an easy patch to fix the issue. The real fix will be done when the instruction emulator is made aware of nested virtualization. Until this is done this patch fixes the issue and provides an easy way to fix this in -stable too.
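The ordering problem described above is easier to see in a small model. The sketch below is illustrative only and uses invented names (struct regs, struct nested_state_model, on_sel_cr0_vmexit and so on); it is not the kernel code, but it shows why the rip/rsp/rax values have to be captured when the nested exit is decided and written back only after the emulator has committed its registers.

#include <stdio.h>

struct regs { unsigned long rip, rsp, rax; };

struct nested_state_model {
	struct regs at_vmexit;	/* captured when the nested exit is decided */
	int pending;
};

static void on_sel_cr0_vmexit(struct nested_state_model *n, const struct regs *now)
{
	n->at_vmexit = *now;
	n->pending = 1;
}

/* The emulator finishes the write-cr0 instruction and commits registers,
 * which is exactly the state the L1 hypervisor must not see. */
static void emulator_commit(struct regs *r)
{
	r->rip += 2;
}

static void after_emulation(struct nested_state_model *n, struct regs *r)
{
	if (n->pending) {
		*r = n->at_vmexit;	/* hand L1 its own rip/rsp/rax back */
		n->pending = 0;
	}
}

int main(void)
{
	struct regs r = { .rip = 0x400000, .rsp = 0x7000, .rax = 0x60000010 };
	struct nested_state_model n = { .pending = 0 };

	on_sel_cr0_vmexit(&n, &r);
	emulator_commit(&r);
	after_emulation(&n, &r);
	printf("rip seen by L1 after the exit: %#lx\n", r.rip);
	return 0;
}

In the real patch the snapshot lives in svm->nested.vmexit_rip/rsp/rax and the write-back happens in the new cr0_write_interception() handler.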
Cc: stable@kernel.org Signed-off-by: Joerg Roedel Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/svm.c | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index a1a83b955ed..07655345f50 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -88,6 +88,14 @@ struct nested_state { /* A VMEXIT is required but not yet emulated */ bool exit_required; + /* + * If we vmexit during an instruction emulation we need this to restore + * the l1 guest rip after the emulation + */ + unsigned long vmexit_rip; + unsigned long vmexit_rsp; + unsigned long vmexit_rax; + /* cache for intercepts of the guest */ u16 intercept_cr_read; u16 intercept_cr_write; @@ -1213,8 +1221,12 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) if (old == new) { /* cr0 write with ts and mp unchanged */ svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE; - if (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE) + if (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE) { + svm->nested.vmexit_rip = kvm_rip_read(vcpu); + svm->nested.vmexit_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP); + svm->nested.vmexit_rax = kvm_register_read(vcpu, VCPU_REGS_RAX); return; + } } } @@ -2430,6 +2442,23 @@ static int emulate_on_interception(struct vcpu_svm *svm) return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE; } +static int cr0_write_interception(struct vcpu_svm *svm) +{ + struct kvm_vcpu *vcpu = &svm->vcpu; + int r; + + r = emulate_instruction(&svm->vcpu, 0, 0, 0); + + if (svm->nested.vmexit_rip) { + kvm_register_write(vcpu, VCPU_REGS_RIP, svm->nested.vmexit_rip); + kvm_register_write(vcpu, VCPU_REGS_RSP, svm->nested.vmexit_rsp); + kvm_register_write(vcpu, VCPU_REGS_RAX, svm->nested.vmexit_rax); + svm->nested.vmexit_rip = 0; + } + + return r == EMULATE_DONE; +} + static int cr8_write_interception(struct vcpu_svm *svm) { struct kvm_run *kvm_run = svm->vcpu.run; @@ -2692,7 +2721,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_READ_CR4] = emulate_on_interception, [SVM_EXIT_READ_CR8] = emulate_on_interception, [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, - [SVM_EXIT_WRITE_CR0] = emulate_on_interception, + [SVM_EXIT_WRITE_CR0] = cr0_write_interception, [SVM_EXIT_WRITE_CR3] = emulate_on_interception, [SVM_EXIT_WRITE_CR4] = emulate_on_interception, [SVM_EXIT_WRITE_CR8] = cr8_write_interception, -- cgit v1.2.3-18-g5258 From b75f4eb34122b60ee4f07ec89973d1589002c68a Mon Sep 17 00:00:00 2001 From: "Roedel, Joerg" Date: Fri, 3 Sep 2010 14:21:40 +0200 Subject: KVM: SVM: Clean up rip handling in vmrun emulation This patch changes the rip handling in the vmrun emulation path from using next_rip to the generic kvm register access functions. 
Signed-off-by: Joerg Roedel Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/svm.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 07655345f50..fcbc491e1f8 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2069,7 +2069,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) return false; } - trace_kvm_nested_vmrun(svm->vmcb->save.rip - 3, vmcb_gpa, + trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa, nested_vmcb->save.rip, nested_vmcb->control.int_ctl, nested_vmcb->control.event_inj, @@ -2098,7 +2098,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) hsave->save.cr0 = kvm_read_cr0(&svm->vcpu); hsave->save.cr4 = svm->vcpu.arch.cr4; hsave->save.rflags = vmcb->save.rflags; - hsave->save.rip = svm->next_rip; + hsave->save.rip = kvm_rip_read(&svm->vcpu); hsave->save.rsp = vmcb->save.rsp; hsave->save.rax = vmcb->save.rax; if (npt_enabled) @@ -2270,8 +2270,8 @@ static int vmrun_interception(struct vcpu_svm *svm) if (nested_svm_check_permissions(svm)) return 1; - svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; - skip_emulated_instruction(&svm->vcpu); + /* Save rip after vmrun instruction */ + kvm_rip_write(&svm->vcpu, kvm_rip_read(&svm->vcpu) + 3); if (!nested_svm_vmrun(svm)) return 1; -- cgit v1.2.3-18-g5258 From b9a52c4b78ec254ee00cce47d75efd89b09f13dd Mon Sep 17 00:00:00 2001 From: Jes Sorensen Date: Thu, 9 Sep 2010 12:06:45 +0200 Subject: x86: Define MSR_EBC_FREQUENCY_ID Signed-off-by: Jes Sorensen Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/msr-index.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 986f7790fdb..83c4bb1d917 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -198,6 +198,7 @@ #define MSR_IA32_TSC 0x00000010 #define MSR_IA32_PLATFORM_ID 0x00000017 #define MSR_IA32_EBL_CR_POWERON 0x0000002a +#define MSR_EBC_FREQUENCY_ID 0x0000002c #define MSR_IA32_FEATURE_CONTROL 0x0000003a #define FEATURE_CONTROL_LOCKED (1<<0) -- cgit v1.2.3-18-g5258 From 7b91409822ed37f2a58974e49498bdbe92ddd93c Mon Sep 17 00:00:00 2001 From: Jes Sorensen Date: Thu, 9 Sep 2010 12:06:46 +0200 Subject: KVM: x86: Emulate MSR_EBC_FREQUENCY_ID Some operating systems store data about the host processor at the time of installation, and when booted on a more uptodate cpu tries to read MSR_EBC_FREQUENCY_ID. This has been found with XP. Signed-off-by: Jes Sorensen Reviewed-by: Juan Quintela Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f47db2588a4..9d434777154 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1651,6 +1651,20 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case 0xcd: /* fsb frequency */ data = 3; break; + /* + * MSR_EBC_FREQUENCY_ID + * Conservative value valid for even the basic CPU models. + * Models 0,1: 000 in bits 23:21 indicating a bus speed of + * 100MHz, model 2 000 in bits 18:16 indicating 100MHz, + * and 266MHz for model 3, or 4. Set Core Clock + * Frequency to System Bus Frequency Ratio to 1 (bits + * 31:24) even though these are only valid for CPU + * models > 2, however guests may end up dividing or + * multiplying by zero otherwise. 
+ */ + case MSR_EBC_FREQUENCY_ID: + data = 1 << 24; + break; case MSR_IA32_APICBASE: data = kvm_get_apic_base(vcpu); break; -- cgit v1.2.3-18-g5258 From 957446afce22df9a42b9482fcd55985f4037fe66 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 10 Sep 2010 17:30:38 +0200 Subject: KVM: MMU: Check for root_level instead of long mode The walk_addr function checks for !is_long_mode in its 64 bit version. But what is meant here is a check for pae paging. Change the condition to really check for pae paging so that it also works with nested nested paging. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index debe7703536..e4ad3dc84df 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -132,7 +132,7 @@ walk: walker->level = vcpu->arch.mmu.root_level; pte = vcpu->arch.cr3; #if PTTYPE == 64 - if (!is_long_mode(vcpu)) { + if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { pte = kvm_pdptr_read(vcpu, (addr >> 30) & 3); trace_kvm_mmu_paging_element(pte, walker->level); if (!is_present_gpte(pte)) { @@ -205,7 +205,7 @@ walk: (PTTYPE == 64 || is_pse(vcpu))) || ((walker->level == PT_PDPE_LEVEL) && is_large_pte(pte) && - is_long_mode(vcpu))) { + vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL)) { int lvl = walker->level; walker->gfn = gpte_to_gfn_lvl(pte, lvl); -- cgit v1.2.3-18-g5258 From c5a78f2b649ae75ae788e7622ca5a586af2cb35a Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 10 Sep 2010 17:30:39 +0200 Subject: KVM: MMU: Make tdp_enabled a mmu-context parameter This patch changes the tdp_enabled flag from its global meaning to the mmu-context and renames it to direct_map there. This is necessary for Nested SVM with emulation of Nested Paging where we need an extra MMU context to shadow the Nested Nested Page Table. 
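A small user-space sketch of what moving the flag into the context buys. struct paging_ctx and the helper names are invented here; only the shape mirrors the patch:

#include <stdbool.h>

struct paging_ctx {
	bool direct_map;	/* hardware (NPT/EPT) walks this level's tables */
	int  root_level;
};

static void init_tdp_context(struct paging_ctx *c)
{
	c->direct_map = true;
	c->root_level = 4;
}

static void init_shadow_context(struct paging_ctx *c, int guest_root_level)
{
	c->direct_map = false;
	c->root_level = guest_root_level;
}

/* Code that used to test a global tdp_enabled now asks the context it was
 * handed, so two contexts with different answers can coexist. */
static bool clear_user_bit_for_wp(const struct paging_ctx *c, bool gpte_writable)
{
	/* loosely mirrors the shadow-only special case in set_spte() */
	return !c->direct_map && !gpte_writable;
}

With the flag per context, the later nested-NPT work can keep a direct-map context for L1 and a shadow context for the nested page table at the same time.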
Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/mmu.c | 22 ++++++++++++++-------- 2 files changed, 15 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 789e9462668..80ef28bddcc 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -249,6 +249,7 @@ struct kvm_mmu { int root_level; int shadow_root_level; union kvm_mmu_page_role base_role; + bool direct_map; u64 *pae_root; u64 rsvd_bits_mask[2][4]; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index b2136f921d7..5c28e979d73 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1448,7 +1448,8 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, if (role.direct) role.cr4_pae = 0; role.access = access; - if (!tdp_enabled && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) { + if (!vcpu->arch.mmu.direct_map + && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) { quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; role.quadrant = quadrant; @@ -1973,7 +1974,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, spte |= shadow_user_mask; if (level > PT_PAGE_TABLE_LEVEL) spte |= PT_PAGE_SIZE_MASK; - if (tdp_enabled) + if (vcpu->arch.mmu.direct_map) spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn, kvm_is_mmio_pfn(pfn)); @@ -1983,8 +1984,8 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, spte |= (u64)pfn << PAGE_SHIFT; if ((pte_access & ACC_WRITE_MASK) - || (!tdp_enabled && write_fault && !is_write_protection(vcpu) - && !user_fault)) { + || (!vcpu->arch.mmu.direct_map && write_fault + && !is_write_protection(vcpu) && !user_fault)) { if (level > PT_PAGE_TABLE_LEVEL && has_wrprotected_page(vcpu->kvm, gfn, level)) { @@ -1995,7 +1996,8 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, spte |= PT_WRITABLE_MASK; - if (!tdp_enabled && !(pte_access & ACC_WRITE_MASK)) + if (!vcpu->arch.mmu.direct_map + && !(pte_access & ACC_WRITE_MASK)) spte &= ~PT_USER_MASK; /* @@ -2371,7 +2373,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu) ASSERT(!VALID_PAGE(root)); if (mmu_check_root(vcpu, root_gfn)) return 1; - if (tdp_enabled) { + if (vcpu->arch.mmu.direct_map) { direct = 1; root_gfn = 0; } @@ -2406,7 +2408,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu) return 1; } else if (vcpu->arch.mmu.root_level == 0) root_gfn = 0; - if (tdp_enabled) { + if (vcpu->arch.mmu.direct_map) { direct = 1; root_gfn = i << 30; } @@ -2544,6 +2546,7 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu) context->root_level = 0; context->shadow_root_level = PT32E_ROOT_LEVEL; context->root_hpa = INVALID_PAGE; + context->direct_map = true; return 0; } @@ -2663,6 +2666,7 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) context->root_level = level; context->shadow_root_level = level; context->root_hpa = INVALID_PAGE; + context->direct_map = false; return 0; } @@ -2687,6 +2691,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu) context->root_level = PT32_ROOT_LEVEL; context->shadow_root_level = PT32E_ROOT_LEVEL; context->root_hpa = INVALID_PAGE; + context->direct_map = false; return 0; } @@ -2708,6 +2713,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->invlpg = nonpaging_invlpg; context->shadow_root_level = kvm_x86_ops->get_tdp_level(); context->root_hpa = INVALID_PAGE; + context->direct_map = true; if (!is_paging(vcpu)) { 
context->gva_to_gpa = nonpaging_gva_to_gpa; @@ -3060,7 +3066,7 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) gpa_t gpa; int r; - if (tdp_enabled) + if (vcpu->arch.mmu.direct_map) return 0; gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL); -- cgit v1.2.3-18-g5258 From f43addd46168110d572dcf69100cb215a4e9fd08 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 10 Sep 2010 17:30:40 +0200 Subject: KVM: MMU: Make set_cr3 a function pointer in kvm_mmu This is necessary to implement Nested Nested Paging. As a side effect this allows some cleanups in the SVM nested paging code. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/mmu.c | 6 ++++-- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 80ef28bddcc..53cedede88f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -236,6 +236,7 @@ struct kvm_pio_request { */ struct kvm_mmu { void (*new_cr3)(struct kvm_vcpu *vcpu); + void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root); int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err); void (*free)(struct kvm_vcpu *vcpu); gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access, diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 5c28e979d73..c8acb9609ca 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2714,6 +2714,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->shadow_root_level = kvm_x86_ops->get_tdp_level(); context->root_hpa = INVALID_PAGE; context->direct_map = true; + context->set_cr3 = kvm_x86_ops->set_cr3; if (!is_paging(vcpu)) { context->gva_to_gpa = nonpaging_gva_to_gpa; @@ -2752,7 +2753,8 @@ static int init_kvm_softmmu(struct kvm_vcpu *vcpu) r = paging32_init_context(vcpu); vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu); - vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu); + vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu); + vcpu->arch.mmu.set_cr3 = kvm_x86_ops->set_cr3; return r; } @@ -2796,7 +2798,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) if (r) goto out; /* set_cr3() should ensure TLB has been flushed */ - kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa); + vcpu->arch.mmu.set_cr3(vcpu, vcpu->arch.mmu.root_hpa); out: return r; } -- cgit v1.2.3-18-g5258 From 1c97f0a04c74196880f22a563134c8f6d0b9d752 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 10 Sep 2010 17:30:41 +0200 Subject: KVM: X86: Introduce a tdp_set_cr3 function This patch introduces a special set_tdp_cr3 function pointer in kvm_x86_ops which is only used for tpd enabled mmu contexts. This allows to remove some hacks from svm code. 
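A hedged sketch of the wiring, with made-up type names; it only shows how the extra hook lets each context pick the cr3 loader that matches its mode:

struct vcpu;	/* opaque here */

struct vendor_ops {
	void (*set_cr3)(struct vcpu *v, unsigned long root);	  /* shadow paging */
	void (*set_tdp_cr3)(struct vcpu *v, unsigned long root); /* nested paging */
};

struct cr3_ctx {
	void (*set_cr3)(struct vcpu *v, unsigned long root);
};

static void init_tdp_ctx(struct cr3_ctx *c, const struct vendor_ops *ops)
{
	c->set_cr3 = ops->set_tdp_cr3;	/* programs nested_cr3 on SVM */
}

static void init_shadow_ctx(struct cr3_ctx *c, const struct vendor_ops *ops)
{
	c->set_cr3 = ops->set_cr3;	/* programs the guest-visible cr3 */
}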
Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/mmu.c | 2 +- arch/x86/kvm/svm.c | 23 ++++++++++++++--------- arch/x86/kvm/vmx.c | 2 ++ 4 files changed, 19 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 53cedede88f..81a51473f74 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -526,6 +526,8 @@ struct kvm_x86_ops { bool (*rdtscp_supported)(void); void (*adjust_tsc_offset)(struct kvm_vcpu *vcpu, s64 adjustment); + void (*set_tdp_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); + void (*set_supported_cpuid)(u32 func, struct kvm_cpuid_entry2 *entry); bool (*has_wbinvd_exit)(void); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index c8acb9609ca..a55f8d5a798 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2714,7 +2714,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->shadow_root_level = kvm_x86_ops->get_tdp_level(); context->root_hpa = INVALID_PAGE; context->direct_map = true; - context->set_cr3 = kvm_x86_ops->set_cr3; + context->set_cr3 = kvm_x86_ops->set_tdp_cr3; if (!is_paging(vcpu)) { context->gva_to_gpa = nonpaging_gva_to_gpa; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index fcbc491e1f8..53c9039583f 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3216,9 +3216,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) savesegment(gs, gs_selector); ldt_selector = kvm_read_ldt(); svm->vmcb->save.cr2 = vcpu->arch.cr2; - /* required for live migration with NPT */ - if (npt_enabled) - svm->vmcb->save.cr3 = vcpu->arch.cr3; clgi(); @@ -3340,16 +3337,22 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) { struct vcpu_svm *svm = to_svm(vcpu); - if (npt_enabled) { - svm->vmcb->control.nested_cr3 = root; - force_new_asid(vcpu); - return; - } - svm->vmcb->save.cr3 = root; force_new_asid(vcpu); } +static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + svm->vmcb->control.nested_cr3 = root; + + /* Also sync guest cr3 here in case we live migrate */ + svm->vmcb->save.cr3 = vcpu->arch.cr3; + + force_new_asid(vcpu); +} + static int is_disabled(void) { u64 vm_cr; @@ -3576,6 +3579,8 @@ static struct kvm_x86_ops svm_x86_ops = { .write_tsc_offset = svm_write_tsc_offset, .adjust_tsc_offset = svm_adjust_tsc_offset, + + .set_tdp_cr3 = set_tdp_cr3, }; static int __init svm_init(void) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 275a81d571c..ff7a8d48fd2 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4341,6 +4341,8 @@ static struct kvm_x86_ops vmx_x86_ops = { .write_tsc_offset = vmx_write_tsc_offset, .adjust_tsc_offset = vmx_adjust_tsc_offset, + + .set_tdp_cr3 = vmx_set_cr3, }; static int __init vmx_init(void) -- cgit v1.2.3-18-g5258 From 5777ed340d89cdc6c76a5c552337a3861b40a806 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 10 Sep 2010 17:30:42 +0200 Subject: KVM: MMU: Introduce get_cr3 function pointer This function pointer in the MMU context is required to implement Nested Nested Paging. 
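Again as an illustration only, with invented names: the walker asking its context for the root rather than reading vcpu->arch.cr3 directly is what later allows a nested context to hand back the L2 cr3.

struct vcpu_state {
	unsigned long cr3;		/* L1 guest cr3 */
	unsigned long nested_cr3;	/* L2 guest cr3, e.g. read from the vmcb */
};

struct walk_ctx {
	unsigned long (*get_cr3)(struct vcpu_state *v);
};

static unsigned long get_l1_cr3(struct vcpu_state *v) { return v->cr3; }
static unsigned long get_l2_cr3(struct vcpu_state *v) { return v->nested_cr3; }

/* The first pte of the walk now comes from whichever context is in use. */
static unsigned long walk_root(struct walk_ctx *ctx, struct vcpu_state *v)
{
	return ctx->get_cr3(v);
}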
Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/mmu.c | 9 ++++++++- arch/x86/kvm/paging_tmpl.h | 4 ++-- 3 files changed, 11 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 81a51473f74..6c97b8debfa 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -237,6 +237,7 @@ struct kvm_pio_request { struct kvm_mmu { void (*new_cr3)(struct kvm_vcpu *vcpu); void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root); + unsigned long (*get_cr3)(struct kvm_vcpu *vcpu); int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err); void (*free)(struct kvm_vcpu *vcpu); gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access, diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index a55f8d5a798..e4a7de4c8c7 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2365,7 +2365,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu) int direct = 0; u64 pdptr; - root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT; + root_gfn = vcpu->arch.mmu.get_cr3(vcpu) >> PAGE_SHIFT; if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { hpa_t root = vcpu->arch.mmu.root_hpa; @@ -2562,6 +2562,11 @@ static void paging_new_cr3(struct kvm_vcpu *vcpu) mmu_free_roots(vcpu); } +static unsigned long get_cr3(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.cr3; +} + static void inject_page_fault(struct kvm_vcpu *vcpu, u64 addr, u32 err_code) @@ -2715,6 +2720,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->root_hpa = INVALID_PAGE; context->direct_map = true; context->set_cr3 = kvm_x86_ops->set_tdp_cr3; + context->get_cr3 = get_cr3; if (!is_paging(vcpu)) { context->gva_to_gpa = nonpaging_gva_to_gpa; @@ -2755,6 +2761,7 @@ static int init_kvm_softmmu(struct kvm_vcpu *vcpu) vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu); vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu); vcpu->arch.mmu.set_cr3 = kvm_x86_ops->set_cr3; + vcpu->arch.mmu.get_cr3 = get_cr3; return r; } diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index e4ad3dc84df..13d0c06b1bc 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -130,7 +130,7 @@ walk: present = true; eperm = rsvd_fault = false; walker->level = vcpu->arch.mmu.root_level; - pte = vcpu->arch.cr3; + pte = vcpu->arch.mmu.get_cr3(vcpu); #if PTTYPE == 64 if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { pte = kvm_pdptr_read(vcpu, (addr >> 30) & 3); @@ -143,7 +143,7 @@ walk: } #endif ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) || - (vcpu->arch.cr3 & CR3_NONPAE_RESERVED_BITS) == 0); + (vcpu->arch.mmu.get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0); pt_access = ACC_ALL; -- cgit v1.2.3-18-g5258 From cb659db8a7d1ed558898f533a957dfc342f9499d Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 10 Sep 2010 17:30:43 +0200 Subject: KVM: MMU: Introduce inject_page_fault function pointer This patch introduces an inject_page_fault function pointer into struct kvm_mmu which will be used to inject a page fault. This will be used later when Nested Nested Paging is implemented. 
Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 3 +++ arch/x86/kvm/mmu.c | 4 +++- 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 6c97b8debfa..009a4a1b370 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -239,6 +239,9 @@ struct kvm_mmu { void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root); unsigned long (*get_cr3)(struct kvm_vcpu *vcpu); int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err); + void (*inject_page_fault)(struct kvm_vcpu *vcpu, + unsigned long addr, + u32 error_code); void (*free)(struct kvm_vcpu *vcpu); gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access, u32 *error); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index e4a7de4c8c7..a751dfc8526 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2571,7 +2571,7 @@ static void inject_page_fault(struct kvm_vcpu *vcpu, u64 addr, u32 err_code) { - kvm_inject_page_fault(vcpu, addr, err_code); + vcpu->arch.mmu.inject_page_fault(vcpu, addr, err_code); } static void paging_free(struct kvm_vcpu *vcpu) @@ -2721,6 +2721,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->direct_map = true; context->set_cr3 = kvm_x86_ops->set_tdp_cr3; context->get_cr3 = get_cr3; + context->inject_page_fault = kvm_inject_page_fault; if (!is_paging(vcpu)) { context->gva_to_gpa = nonpaging_gva_to_gpa; @@ -2762,6 +2763,7 @@ static int init_kvm_softmmu(struct kvm_vcpu *vcpu) vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu); vcpu->arch.mmu.set_cr3 = kvm_x86_ops->set_cr3; vcpu->arch.mmu.get_cr3 = get_cr3; + vcpu->arch.mmu.inject_page_fault = kvm_inject_page_fault; return r; } -- cgit v1.2.3-18-g5258 From 52fde8df7dd13d90f5f8dc43157418bff968d90a Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 10 Sep 2010 17:30:44 +0200 Subject: KVM: MMU: Introduce kvm_init_shadow_mmu helper function Some logic of the init_kvm_softmmu function is required to build the Nested Nested Paging context. So factor the required logic into a seperate function and export it. Also make the whole init path suitable for more than one mmu context. 
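A rough sketch of the factoring, with invented names and a simplified notion of paging mode; the point is only that the init logic takes the context it should fill as a parameter instead of assuming a single per-vcpu context:

enum paging_mode { PM_NONE, PM_32BIT, PM_PAE, PM_LONG };

struct shadow_ctx {
	enum paging_mode mode;
	int root_level;
};

/* The reusable part: fills whatever context it is handed. */
static int init_shadow_ctx_common(struct shadow_ctx *c, enum paging_mode mode)
{
	static const int root_level[] = {
		[PM_NONE] = 0, [PM_32BIT] = 2, [PM_PAE] = 3, [PM_LONG] = 4,
	};

	c->mode = mode;
	c->root_level = root_level[mode];
	return 0;
}

/* The old single-context entry point becomes a thin wrapper; a second caller
 * could initialize a separate context for the nested page table. */
static int init_vcpu_mmu(struct shadow_ctx *vcpu_mmu, enum paging_mode mode)
{
	return init_shadow_ctx_common(vcpu_mmu, mode);
}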
Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 60 +++++++++++++++++++++++++++++++----------------------- arch/x86/kvm/mmu.h | 1 + 2 files changed, 36 insertions(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index a751dfc8526..9e48a774fce 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2532,10 +2532,9 @@ static void nonpaging_free(struct kvm_vcpu *vcpu) mmu_free_roots(vcpu); } -static int nonpaging_init_context(struct kvm_vcpu *vcpu) +static int nonpaging_init_context(struct kvm_vcpu *vcpu, + struct kvm_mmu *context) { - struct kvm_mmu *context = &vcpu->arch.mmu; - context->new_cr3 = nonpaging_new_cr3; context->page_fault = nonpaging_page_fault; context->gva_to_gpa = nonpaging_gva_to_gpa; @@ -2595,9 +2594,10 @@ static bool is_rsvd_bits_set(struct kvm_vcpu *vcpu, u64 gpte, int level) #include "paging_tmpl.h" #undef PTTYPE -static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level) +static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, + struct kvm_mmu *context, + int level) { - struct kvm_mmu *context = &vcpu->arch.mmu; int maxphyaddr = cpuid_maxphyaddr(vcpu); u64 exb_bit_rsvd = 0; @@ -2656,9 +2656,11 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level) } } -static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) +static int paging64_init_context_common(struct kvm_vcpu *vcpu, + struct kvm_mmu *context, + int level) { - struct kvm_mmu *context = &vcpu->arch.mmu; + reset_rsvds_bits_mask(vcpu, context, level); ASSERT(is_pae(vcpu)); context->new_cr3 = paging_new_cr3; @@ -2675,17 +2677,17 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) return 0; } -static int paging64_init_context(struct kvm_vcpu *vcpu) +static int paging64_init_context(struct kvm_vcpu *vcpu, + struct kvm_mmu *context) { - reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL); - return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL); + return paging64_init_context_common(vcpu, context, PT64_ROOT_LEVEL); } -static int paging32_init_context(struct kvm_vcpu *vcpu) +static int paging32_init_context(struct kvm_vcpu *vcpu, + struct kvm_mmu *context) { - struct kvm_mmu *context = &vcpu->arch.mmu; + reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL); - reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL); context->new_cr3 = paging_new_cr3; context->page_fault = paging32_page_fault; context->gva_to_gpa = paging32_gva_to_gpa; @@ -2700,10 +2702,10 @@ static int paging32_init_context(struct kvm_vcpu *vcpu) return 0; } -static int paging32E_init_context(struct kvm_vcpu *vcpu) +static int paging32E_init_context(struct kvm_vcpu *vcpu, + struct kvm_mmu *context) { - reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL); - return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL); + return paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL); } static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) @@ -2727,15 +2729,15 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->gva_to_gpa = nonpaging_gva_to_gpa; context->root_level = 0; } else if (is_long_mode(vcpu)) { - reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL); + reset_rsvds_bits_mask(vcpu, context, PT64_ROOT_LEVEL); context->gva_to_gpa = paging64_gva_to_gpa; context->root_level = PT64_ROOT_LEVEL; } else if (is_pae(vcpu)) { - reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL); + reset_rsvds_bits_mask(vcpu, context, PT32E_ROOT_LEVEL); context->gva_to_gpa = paging64_gva_to_gpa; context->root_level = PT32E_ROOT_LEVEL; } else { - 
reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL); + reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL); context->gva_to_gpa = paging32_gva_to_gpa; context->root_level = PT32_ROOT_LEVEL; } @@ -2743,24 +2745,32 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) return 0; } -static int init_kvm_softmmu(struct kvm_vcpu *vcpu) +int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context) { int r; - ASSERT(vcpu); ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); if (!is_paging(vcpu)) - r = nonpaging_init_context(vcpu); + r = nonpaging_init_context(vcpu, context); else if (is_long_mode(vcpu)) - r = paging64_init_context(vcpu); + r = paging64_init_context(vcpu, context); else if (is_pae(vcpu)) - r = paging32E_init_context(vcpu); + r = paging32E_init_context(vcpu, context); else - r = paging32_init_context(vcpu); + r = paging32_init_context(vcpu, context); vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu); vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu); + + return r; +} +EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu); + +static int init_kvm_softmmu(struct kvm_vcpu *vcpu) +{ + int r = kvm_init_shadow_mmu(vcpu, &vcpu->arch.mmu); + vcpu->arch.mmu.set_cr3 = kvm_x86_ops->set_cr3; vcpu->arch.mmu.get_cr3 = get_cr3; vcpu->arch.mmu.inject_page_fault = kvm_inject_page_fault; diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index f05a03dfba4..7086ca85d3e 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -49,6 +49,7 @@ #define PFERR_FETCH_MASK (1U << 4) int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]); +int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context); static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) { -- cgit v1.2.3-18-g5258 From 3241f22da85d26505b39f525a88f52ebd1235975 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 10 Sep 2010 17:30:45 +0200 Subject: KVM: MMU: Let is_rsvd_bits_set take mmu context instead of vcpu This patch changes is_rsvd_bits_set() function prototype to take only a kvm_mmu context instead of a full vcpu. 
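The new shape of the check is easy to show in isolation. This is a stand-alone restatement, not the kernel file itself; struct mmu_masks stands in for struct kvm_mmu:

#include <stdbool.h>
#include <stdint.h>

struct mmu_masks {
	/* indexed [bit 7 of the gpte][level - 1], like rsvd_bits_mask */
	uint64_t rsvd_bits_mask[2][4];
};

static bool is_rsvd_bits_set(const struct mmu_masks *mmu, uint64_t gpte, int level)
{
	int bit7 = (gpte >> 7) & 1;

	return (gpte & mmu->rsvd_bits_mask[bit7][level - 1]) != 0;
}

Callers now name the context explicitly, &vcpu->arch.mmu today, and could just as well pass a nested context later.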
Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 6 +++--- arch/x86/kvm/paging_tmpl.h | 7 ++++--- 2 files changed, 7 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 9e48a774fce..86f7557cf3f 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2578,12 +2578,12 @@ static void paging_free(struct kvm_vcpu *vcpu) nonpaging_free(vcpu); } -static bool is_rsvd_bits_set(struct kvm_vcpu *vcpu, u64 gpte, int level) +static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level) { int bit7; bit7 = (gpte >> 7) & 1; - return (gpte & vcpu->arch.mmu.rsvd_bits_mask[bit7][level-1]) != 0; + return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0; } #define PTTYPE 64 @@ -2859,7 +2859,7 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, return; } - if (is_rsvd_bits_set(vcpu, *(u64 *)new, PT_PAGE_TABLE_LEVEL)) + if (is_rsvd_bits_set(&vcpu->arch.mmu, *(u64 *)new, PT_PAGE_TABLE_LEVEL)) return; ++vcpu->kvm->stat.mmu_pte_updated; diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 13d0c06b1bc..68ee1b7fa89 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -168,7 +168,7 @@ walk: break; } - if (is_rsvd_bits_set(vcpu, pte, walker->level)) { + if (is_rsvd_bits_set(&vcpu->arch.mmu, pte, walker->level)) { rsvd_fault = true; break; } @@ -327,6 +327,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, u64 *sptep) { struct kvm_mmu_page *sp; + struct kvm_mmu *mmu = &vcpu->arch.mmu; pt_element_t *gptep = gw->prefetch_ptes; u64 *spte; int i; @@ -358,7 +359,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, gpte = gptep[i]; if (!is_present_gpte(gpte) || - is_rsvd_bits_set(vcpu, gpte, PT_PAGE_TABLE_LEVEL)) { + is_rsvd_bits_set(mmu, gpte, PT_PAGE_TABLE_LEVEL)) { if (!sp->unsync) __set_spte(spte, shadow_notrap_nonpresent_pte); continue; @@ -713,7 +714,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, return -EINVAL; gfn = gpte_to_gfn(gpte); - if (is_rsvd_bits_set(vcpu, gpte, PT_PAGE_TABLE_LEVEL) + if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL) || gfn != sp->gfns[i] || !is_present_gpte(gpte) || !(gpte & PT_ACCESSED_MASK)) { u64 nonpresent; -- cgit v1.2.3-18-g5258 From 8df25a328a6ca3bd0f048278f4d5ae0a1f6fadc1 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 10 Sep 2010 17:30:46 +0200 Subject: KVM: MMU: Track page fault data in struct vcpu This patch introduces a struct with two new fields in vcpu_arch for x86: * fault.address * fault.error_code This will be used to correctly propagate page faults back into the guest when we could have either an ordinary page fault or a nested page fault. In the case of a nested page fault the fault-address is different from the original address that should be walked. So we need to keep track about the real fault-address. 
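A minimal model of the record-then-inject flow, with invented names: the walker stores what faulted, the injector reads it back, so the same path can later serve both ordinary and nested page faults.

#include <stdint.h>

struct fault_info {
	uint64_t address;
	uint32_t error_code;
};

struct toy_vcpu {
	struct fault_info fault;
	uint64_t cr2;
};

/* Walker side: remember what faulted... */
static void record_fault(struct toy_vcpu *v, uint64_t addr, uint32_t ec)
{
	v->fault.address = addr;
	v->fault.error_code = ec;
}

/* ...injector side: deliver it without the address being passed around. */
static void inject_recorded_fault(struct toy_vcpu *v)
{
	v->cr2 = v->fault.address;
	/* a #PF with v->fault.error_code would be queued here */
}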
Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 1 - arch/x86/include/asm/kvm_host.h | 17 ++++++++++++----- arch/x86/kvm/emulate.c | 30 ++++++++++++++---------------- arch/x86/kvm/mmu.c | 6 ++---- arch/x86/kvm/paging_tmpl.h | 6 +++++- arch/x86/kvm/x86.c | 9 +++++---- 6 files changed, 38 insertions(+), 31 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 1bf11400ae9..5187dd88019 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -229,7 +229,6 @@ struct x86_emulate_ctxt { int exception; /* exception that happens during emulation or -1 */ u32 error_code; /* error code for exception */ bool error_code_valid; - unsigned long cr2; /* faulted address in case of #PF */ /* decode cache */ struct decode_cache decode; diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 009a4a1b370..3fde5b32253 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -239,9 +239,7 @@ struct kvm_mmu { void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root); unsigned long (*get_cr3)(struct kvm_vcpu *vcpu); int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err); - void (*inject_page_fault)(struct kvm_vcpu *vcpu, - unsigned long addr, - u32 error_code); + void (*inject_page_fault)(struct kvm_vcpu *vcpu); void (*free)(struct kvm_vcpu *vcpu); gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access, u32 *error); @@ -288,6 +286,16 @@ struct kvm_vcpu_arch { bool tpr_access_reporting; struct kvm_mmu mmu; + + /* + * This struct is filled with the necessary information to propagate a + * page fault into the guest + */ + struct { + u64 address; + unsigned error_code; + } fault; + /* only needed in kvm_pv_mmu_op() path, but it's hot so * put it here to avoid allocation */ struct kvm_pv_mmu_op_buffer mmu_op_buffer; @@ -624,8 +632,7 @@ void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr); void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr); void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); -void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2, - u32 error_code); +void kvm_inject_page_fault(struct kvm_vcpu *vcpu); bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl); int kvm_pic_set_irq(void *opaque, int irq, int level); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 27d2c22b114..2b08b78b6ca 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -487,11 +487,9 @@ static void emulate_gp(struct x86_emulate_ctxt *ctxt, int err) emulate_exception(ctxt, GP_VECTOR, err, true); } -static void emulate_pf(struct x86_emulate_ctxt *ctxt, unsigned long addr, - int err) +static void emulate_pf(struct x86_emulate_ctxt *ctxt) { - ctxt->cr2 = addr; - emulate_exception(ctxt, PF_VECTOR, err, true); + emulate_exception(ctxt, PF_VECTOR, 0, true); } static void emulate_ud(struct x86_emulate_ctxt *ctxt) @@ -834,7 +832,7 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt, rc = ops->read_emulated(addr, mc->data + mc->end, n, &err, ctxt->vcpu); if (rc == X86EMUL_PROPAGATE_FAULT) - emulate_pf(ctxt, addr, err); + emulate_pf(ctxt); if (rc != X86EMUL_CONTINUE) return rc; mc->end += n; @@ -921,7 +919,7 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, addr = dt.address + index * 8; ret = ops->read_std(addr, desc, 
sizeof *desc, ctxt->vcpu, &err); if (ret == X86EMUL_PROPAGATE_FAULT) - emulate_pf(ctxt, addr, err); + emulate_pf(ctxt); return ret; } @@ -947,7 +945,7 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt, addr = dt.address + index * 8; ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, &err); if (ret == X86EMUL_PROPAGATE_FAULT) - emulate_pf(ctxt, addr, err); + emulate_pf(ctxt); return ret; } @@ -1117,7 +1115,7 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt, &err, ctxt->vcpu); if (rc == X86EMUL_PROPAGATE_FAULT) - emulate_pf(ctxt, c->dst.addr.mem, err); + emulate_pf(ctxt); if (rc != X86EMUL_CONTINUE) return rc; break; @@ -1939,7 +1937,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, &err); if (ret == X86EMUL_PROPAGATE_FAULT) { /* FIXME: need to provide precise fault address */ - emulate_pf(ctxt, old_tss_base, err); + emulate_pf(ctxt); return ret; } @@ -1949,7 +1947,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, &err); if (ret == X86EMUL_PROPAGATE_FAULT) { /* FIXME: need to provide precise fault address */ - emulate_pf(ctxt, old_tss_base, err); + emulate_pf(ctxt); return ret; } @@ -1957,7 +1955,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, &err); if (ret == X86EMUL_PROPAGATE_FAULT) { /* FIXME: need to provide precise fault address */ - emulate_pf(ctxt, new_tss_base, err); + emulate_pf(ctxt); return ret; } @@ -1970,7 +1968,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, ctxt->vcpu, &err); if (ret == X86EMUL_PROPAGATE_FAULT) { /* FIXME: need to provide precise fault address */ - emulate_pf(ctxt, new_tss_base, err); + emulate_pf(ctxt); return ret; } } @@ -2081,7 +2079,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, &err); if (ret == X86EMUL_PROPAGATE_FAULT) { /* FIXME: need to provide precise fault address */ - emulate_pf(ctxt, old_tss_base, err); + emulate_pf(ctxt); return ret; } @@ -2091,7 +2089,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, &err); if (ret == X86EMUL_PROPAGATE_FAULT) { /* FIXME: need to provide precise fault address */ - emulate_pf(ctxt, old_tss_base, err); + emulate_pf(ctxt); return ret; } @@ -2099,7 +2097,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, &err); if (ret == X86EMUL_PROPAGATE_FAULT) { /* FIXME: need to provide precise fault address */ - emulate_pf(ctxt, new_tss_base, err); + emulate_pf(ctxt); return ret; } @@ -2112,7 +2110,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, ctxt->vcpu, &err); if (ret == X86EMUL_PROPAGATE_FAULT) { /* FIXME: need to provide precise fault address */ - emulate_pf(ctxt, new_tss_base, err); + emulate_pf(ctxt); return ret; } } diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 86f7557cf3f..99367274b97 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2566,11 +2566,9 @@ static unsigned long get_cr3(struct kvm_vcpu *vcpu) return vcpu->arch.cr3; } -static void inject_page_fault(struct kvm_vcpu *vcpu, - u64 addr, - u32 err_code) +static void inject_page_fault(struct kvm_vcpu *vcpu) { - vcpu->arch.mmu.inject_page_fault(vcpu, addr, err_code); + vcpu->arch.mmu.inject_page_fault(vcpu); } static void paging_free(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 68ee1b7fa89..d07f48a06f0 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -258,6 +258,10 @@ error: walker->error_code |= PFERR_FETCH_MASK; if (rsvd_fault) walker->error_code |= PFERR_RSVD_MASK; + + vcpu->arch.fault.address = addr; + 
vcpu->arch.fault.error_code = walker->error_code; + trace_kvm_mmu_walker_error(walker->error_code); return 0; } @@ -521,7 +525,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, */ if (!r) { pgprintk("%s: guest page fault\n", __func__); - inject_page_fault(vcpu, addr, walker.error_code); + inject_page_fault(vcpu); vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ return 0; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9d434777154..48b74d2fbfb 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -329,11 +329,12 @@ void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr) } EXPORT_SYMBOL_GPL(kvm_requeue_exception); -void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr, - u32 error_code) +void kvm_inject_page_fault(struct kvm_vcpu *vcpu) { + unsigned error_code = vcpu->arch.fault.error_code; + ++vcpu->stat.pf_guest; - vcpu->arch.cr2 = addr; + vcpu->arch.cr2 = vcpu->arch.fault.address; kvm_queue_exception_e(vcpu, PF_VECTOR, error_code); } @@ -4080,7 +4081,7 @@ static void inject_emulated_exception(struct kvm_vcpu *vcpu) { struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; if (ctxt->exception == PF_VECTOR) - kvm_inject_page_fault(vcpu, ctxt->cr2, ctxt->error_code); + kvm_inject_page_fault(vcpu); else if (ctxt->error_code_valid) kvm_queue_exception_e(vcpu, ctxt->exception, ctxt->error_code); else -- cgit v1.2.3-18-g5258 From 1e301feb079e8ee6091bb75283e960fc33059a68 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 10 Sep 2010 17:30:47 +0200 Subject: KVM: MMU: Introduce generic walk_addr function This is the first patch in the series towards a generic walk_addr implementation which could walk two-dimensional page tables in the end. In this first step the walk_addr function is renamed into walk_addr_generic which takes a mmu context as an additional parameter. 
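Sketched with invented types, the refactoring boils down to a worker that is told which context to walk plus thin wrappers; the nested wrapper added later in the series is just one more caller:

struct pt_walker { int level; };	/* result placeholder */

struct walk_mmu { int root_level; };

struct walk_vcpu {
	struct walk_mmu mmu;		/* L1 paging state */
	struct walk_mmu nested_mmu;	/* L2 paging state (later patches) */
};

static int walk_addr_generic(struct pt_walker *w, struct walk_vcpu *v,
			     struct walk_mmu *mmu, unsigned long addr,
			     int write, int user, int fetch)
{
	(void)v; (void)addr; (void)write; (void)user; (void)fetch;
	w->level = mmu->root_level;	/* placeholder for the real walk */
	return 1;
}

static int walk_addr(struct pt_walker *w, struct walk_vcpu *v,
		     unsigned long addr, int write, int user, int fetch)
{
	return walk_addr_generic(w, v, &v->mmu, addr, write, user, fetch);
}

static int walk_addr_nested(struct pt_walker *w, struct walk_vcpu *v,
			    unsigned long addr, int write, int user, int fetch)
{
	return walk_addr_generic(w, v, &v->nested_mmu, addr, write, user, fetch);
}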
Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index d07f48a06f0..a704a8130e4 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -114,9 +114,10 @@ static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte) /* * Fetch a guest pte for a guest virtual address */ -static int FNAME(walk_addr)(struct guest_walker *walker, - struct kvm_vcpu *vcpu, gva_t addr, - int write_fault, int user_fault, int fetch_fault) +static int FNAME(walk_addr_generic)(struct guest_walker *walker, + struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, + gva_t addr, int write_fault, + int user_fault, int fetch_fault) { pt_element_t pte; gfn_t table_gfn; @@ -129,10 +130,11 @@ static int FNAME(walk_addr)(struct guest_walker *walker, walk: present = true; eperm = rsvd_fault = false; - walker->level = vcpu->arch.mmu.root_level; - pte = vcpu->arch.mmu.get_cr3(vcpu); + walker->level = mmu->root_level; + pte = mmu->get_cr3(vcpu); + #if PTTYPE == 64 - if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { + if (walker->level == PT32E_ROOT_LEVEL) { pte = kvm_pdptr_read(vcpu, (addr >> 30) & 3); trace_kvm_mmu_paging_element(pte, walker->level); if (!is_present_gpte(pte)) { @@ -143,7 +145,7 @@ walk: } #endif ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) || - (vcpu->arch.mmu.get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0); + (mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0); pt_access = ACC_ALL; @@ -205,7 +207,7 @@ walk: (PTTYPE == 64 || is_pse(vcpu))) || ((walker->level == PT_PDPE_LEVEL) && is_large_pte(pte) && - vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL)) { + mmu->root_level == PT64_ROOT_LEVEL)) { int lvl = walker->level; walker->gfn = gpte_to_gfn_lvl(pte, lvl); @@ -266,6 +268,14 @@ error: return 0; } +static int FNAME(walk_addr)(struct guest_walker *walker, + struct kvm_vcpu *vcpu, gva_t addr, + int write_fault, int user_fault, int fetch_fault) +{ + return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.mmu, addr, + write_fault, user_fault, fetch_fault); +} + static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, const void *pte) { -- cgit v1.2.3-18-g5258 From c30a358d33e0e111f06e54a4a4125371e6b6693c Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 10 Sep 2010 17:30:48 +0200 Subject: KVM: MMU: Add infrastructure for two-level page walker This patch introduces a mmu-callback to translate gpa addresses in the walk_addr code. This is later used to translate l2_gpa addresses into l1_gpa addresses. 
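A sketch of the hook with an identity default (names invented): without a second paging level the translation is a no-op, and a nested context can override it to walk the L1 nested page table.

#include <stdint.h>

typedef uint64_t gpa_t;

struct xlate_ctx {
	gpa_t (*translate_gpa)(struct xlate_ctx *c, gpa_t gpa, uint32_t access);
};

/* Default: no second paging level, the address is already system-physical. */
static gpa_t translate_gpa_identity(struct xlate_ctx *c, gpa_t gpa, uint32_t access)
{
	(void)c; (void)access;
	return gpa;
}

static void init_xlate_ctx(struct xlate_ctx *c)
{
	c->translate_gpa = translate_gpa_identity;
}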
Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/x86.c | 6 ++++++ 2 files changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 3fde5b32253..4915b7c8f2e 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -243,6 +243,7 @@ struct kvm_mmu { void (*free)(struct kvm_vcpu *vcpu); gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access, u32 *error); + gpa_t (*translate_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access); void (*prefetch_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page); int (*sync_page)(struct kvm_vcpu *vcpu, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 48b74d2fbfb..2364c2cad89 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3448,6 +3448,11 @@ void kvm_get_segment(struct kvm_vcpu *vcpu, kvm_x86_ops->get_segment(vcpu, var, seg); } +static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access) +{ + return gpa; +} + gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) { u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; @@ -5659,6 +5664,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) vcpu->arch.emulate_ctxt.ops = &emulate_ops; vcpu->arch.mmu.root_hpa = INVALID_PAGE; + vcpu->arch.mmu.translate_gpa = translate_gpa; if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; else -- cgit v1.2.3-18-g5258 From 14dfe855f978181cd611ec018e5ceba860a98545 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 10 Sep 2010 17:30:49 +0200 Subject: KVM: X86: Introduce pointer to mmu context used for gva_to_gpa This patch introduces the walk_mmu pointer which points to the mmu-context currently used for gva_to_gpa translations. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 13 +++++++++++++ arch/x86/kvm/mmu.c | 10 +++++----- arch/x86/kvm/x86.c | 17 ++++++++++------- 3 files changed, 28 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 4915b7c8f2e..1b3eb8a0a1b 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -286,8 +286,21 @@ struct kvm_vcpu_arch { u64 ia32_misc_enable_msr; bool tpr_access_reporting; + /* + * Paging state of the vcpu + * + * If the vcpu runs in guest mode with two level paging this still saves + * the paging mode of the l1 guest. This context is always used to + * handle faults. + */ struct kvm_mmu mmu; + /* + * Pointer to the mmu context currently used for + * gva_to_gpa translations. 
+ */ + struct kvm_mmu *walk_mmu; + /* * This struct is filled with the necessary information to propagate a * page fault into the guest diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 99367274b97..cb06adac92b 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2708,7 +2708,7 @@ static int paging32E_init_context(struct kvm_vcpu *vcpu, static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) { - struct kvm_mmu *context = &vcpu->arch.mmu; + struct kvm_mmu *context = vcpu->arch.walk_mmu; context->new_cr3 = nonpaging_new_cr3; context->page_fault = tdp_page_fault; @@ -2767,11 +2767,11 @@ EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu); static int init_kvm_softmmu(struct kvm_vcpu *vcpu) { - int r = kvm_init_shadow_mmu(vcpu, &vcpu->arch.mmu); + int r = kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu); - vcpu->arch.mmu.set_cr3 = kvm_x86_ops->set_cr3; - vcpu->arch.mmu.get_cr3 = get_cr3; - vcpu->arch.mmu.inject_page_fault = kvm_inject_page_fault; + vcpu->arch.walk_mmu->set_cr3 = kvm_x86_ops->set_cr3; + vcpu->arch.walk_mmu->get_cr3 = get_cr3; + vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault; return r; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2364c2cad89..4196fc71914 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3456,27 +3456,27 @@ static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access) gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) { u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; - return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); + return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error); } gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) { u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; access |= PFERR_FETCH_MASK; - return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); + return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error); } gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) { u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? 
PFERR_USER_MASK : 0; access |= PFERR_WRITE_MASK; - return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); + return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error); } /* uses this to access any guest's mapped memory without checking CPL */ gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) { - return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, 0, error); + return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, error); } static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, @@ -3487,7 +3487,8 @@ static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, int r = X86EMUL_CONTINUE; while (bytes) { - gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr, access, error); + gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access, + error); unsigned offset = addr & (PAGE_SIZE-1); unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset); int ret; @@ -3542,8 +3543,9 @@ static int kvm_write_guest_virt_system(gva_t addr, void *val, int r = X86EMUL_CONTINUE; while (bytes) { - gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr, - PFERR_WRITE_MASK, error); + gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, + PFERR_WRITE_MASK, + error); unsigned offset = addr & (PAGE_SIZE-1); unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); int ret; @@ -5663,6 +5665,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) kvm = vcpu->kvm; vcpu->arch.emulate_ctxt.ops = &emulate_ops; + vcpu->arch.walk_mmu = &vcpu->arch.mmu; vcpu->arch.mmu.root_hpa = INVALID_PAGE; vcpu->arch.mmu.translate_gpa = translate_gpa; if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu)) -- cgit v1.2.3-18-g5258 From 6539e738f65a8f1fc7806295d5d701fba4008343 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 10 Sep 2010 17:30:50 +0200 Subject: KVM: MMU: Implement nested gva_to_gpa functions This patch adds the functions to do a nested l2_gva to l1_gpa page table walk. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 10 ++++++++++ arch/x86/kvm/mmu.c | 8 ++++++++ arch/x86/kvm/paging_tmpl.h | 31 +++++++++++++++++++++++++++++++ arch/x86/kvm/x86.h | 5 +++++ 4 files changed, 54 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 1b3eb8a0a1b..8ec3547c433 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -295,6 +295,16 @@ struct kvm_vcpu_arch { */ struct kvm_mmu mmu; + /* + * Paging state of an L2 guest (used for nested npt) + * + * This context will save all necessary information to walk page tables + * of the an L2 guest. This context is only initialized for page table + * walking and not for faulting since we never handle l2 page faults on + * the host. + */ + struct kvm_mmu nested_mmu; + /* * Pointer to the mmu context currently used for * gva_to_gpa translations. 
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index cb06adac92b..1e215e8b937 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2466,6 +2466,14 @@ static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr, return vaddr; } +static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr, + u32 access, u32 *error) +{ + if (error) + *error = 0; + return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access); +} + static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code) { diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index a704a8130e4..eefe363156b 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -276,6 +276,16 @@ static int FNAME(walk_addr)(struct guest_walker *walker, write_fault, user_fault, fetch_fault); } +static int FNAME(walk_addr_nested)(struct guest_walker *walker, + struct kvm_vcpu *vcpu, gva_t addr, + int write_fault, int user_fault, + int fetch_fault) +{ + return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu, + addr, write_fault, user_fault, + fetch_fault); +} + static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, const void *pte) { @@ -660,6 +670,27 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, return gpa; } +static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr, + u32 access, u32 *error) +{ + struct guest_walker walker; + gpa_t gpa = UNMAPPED_GVA; + int r; + + r = FNAME(walk_addr_nested)(&walker, vcpu, vaddr, + access & PFERR_WRITE_MASK, + access & PFERR_USER_MASK, + access & PFERR_FETCH_MASK); + + if (r) { + gpa = gfn_to_gpa(walker.gfn); + gpa |= vaddr & ~PAGE_MASK; + } else if (error) + *error = walker.error_code; + + return gpa; +} + static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 2d6385e44cc..bf4dc2f40d7 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -50,6 +50,11 @@ static inline int is_long_mode(struct kvm_vcpu *vcpu) #endif } +static inline bool mmu_is_nested(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.walk_mmu == &vcpu->arch.nested_mmu; +} + static inline int is_pae(struct kvm_vcpu *vcpu) { return kvm_read_cr4_bits(vcpu, X86_CR4_PAE); -- cgit v1.2.3-18-g5258 From ec92fe44e7ff94d04d8305e49efcffd8773e1cf6 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 10 Sep 2010 17:30:51 +0200 Subject: KVM: X86: Add kvm_read_guest_page_mmu function This patch adds a function which can read from the guests physical memory or from the guest's guest physical memory. This will be used in the two-dimensional page table walker. 
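The control flow of the helper, restated as a user-space sketch with stand-in names (read_phys_page, UNMAPPED): translate the nested gfn through the context first, then read the resulting system-physical page.

#include <stdint.h>
#include <string.h>

#define UNMAPPED   ((uint64_t)-1)
#define PAGE_SHIFT 12

struct rd_mmu {
	uint64_t (*translate_gpa)(struct rd_mmu *m, uint64_t gpa, uint32_t access);
};

/* Stand-in for reading a system-physical page of the guest. */
static int read_phys_page(uint64_t gfn, void *data, int offset, int len)
{
	(void)gfn; (void)offset;
	memset(data, 0, (size_t)len);
	return 0;
}

static int read_guest_page_mmu(struct rd_mmu *mmu, uint64_t ngfn,
			       void *data, int offset, int len, uint32_t access)
{
	uint64_t real_gpa = mmu->translate_gpa(mmu, ngfn << PAGE_SHIFT, access);

	if (real_gpa == UNMAPPED)
		return -1;			/* -EFAULT in the kernel */
	return read_phys_page(real_gpa >> PAGE_SHIFT, data, offset, len);
}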
Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 3 +++ arch/x86/kvm/x86.c | 23 +++++++++++++++++++++++ 2 files changed, 26 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 8ec3547c433..08bc383083f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -657,6 +657,9 @@ void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr); void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); void kvm_inject_page_fault(struct kvm_vcpu *vcpu); +int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, + gfn_t gfn, void *data, int offset, int len, + u32 access); bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl); int kvm_pic_set_irq(void *opaque, int irq, int level); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4196fc71914..a2efb70f4cc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -369,6 +369,29 @@ bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl) } EXPORT_SYMBOL_GPL(kvm_require_cpl); +/* + * This function will be used to read from the physical memory of the currently + * running guest. The difference to kvm_read_guest_page is that this function + * can read from guest physical or from the guest's guest physical memory. + */ +int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, + gfn_t ngfn, void *data, int offset, int len, + u32 access) +{ + gfn_t real_gfn; + gpa_t ngpa; + + ngpa = gfn_to_gpa(ngfn); + real_gfn = mmu->translate_gpa(vcpu, ngpa, access); + if (real_gfn == UNMAPPED_GVA) + return -EFAULT; + + real_gfn = gpa_to_gfn(real_gfn); + + return kvm_read_guest_page(vcpu->kvm, real_gfn, data, offset, len); +} +EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu); + /* * Load the pae pdptrs. Return true is they are all valid. */ -- cgit v1.2.3-18-g5258 From 2329d46d213d0721dafae18db29f54b196f11468 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 10 Sep 2010 17:30:52 +0200 Subject: KVM: MMU: Make walk_addr_generic capable for two-level walking This patch uses kvm_read_guest_page_tdp to make the walk_addr_generic functions suitable for two-level page table walking. 
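The point of routing the walker's table loads through the mmu-aware read helper added in the previous patch is easiest to see as a nested loop: every frame of the L2 guest's own page table must itself be translated through the nested tables before it can be read, which is what makes the walk two-dimensional. A simplified stand-alone model of that walk, assuming helpers l2_to_l1() and read_l1_phys() are supplied elsewhere (they stand in for the nested translate_gpa hook and the mmu-aware page read, and are not kernel functions):

    #include <stdint.h>

    #define UNMAPPED ((uint64_t)-1)

    uint64_t l2_to_l1(uint64_t l2_gpa);        /* nested (NPT) lookup         */
    uint64_t read_l1_phys(uint64_t l1_gpa);    /* read one 64-bit pte         */

    static uint64_t walk_l2_gva(uint64_t l2_cr3, uint64_t gva, int levels)
    {
        uint64_t table = l2_cr3 & ~0xfffULL;   /* l2 gpa of the root table    */

        for (int level = levels; level > 0; level--) {
            uint64_t l1_table = l2_to_l1(table);        /* second dimension   */
            if (l1_table == UNMAPPED)
                return UNMAPPED;

            unsigned index = (gva >> (12 + 9 * (level - 1))) & 0x1ff;
            uint64_t pte = read_l1_phys(l1_table + index * 8);

            if (!(pte & 1))                             /* not present        */
                return UNMAPPED;
            table = pte & 0x000ffffffffff000ULL;        /* next l2 gpa        */
        }
        return table | (gva & 0xfffULL);                /* l2 gpa of the data */
    }

In the diff that follows, the final data frame is additionally pushed through mmu->translate_gpa, so the gfn the walker reports is already in L1 terms (an identity mapping in the non-nested case).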
Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index eefe363156b..f4e09d341e2 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -124,6 +124,8 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, unsigned index, pt_access, uninitialized_var(pte_access); gpa_t pte_gpa; bool eperm, present, rsvd_fault; + int offset; + u32 access = 0; trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault, fetch_fault); @@ -153,12 +155,14 @@ walk: index = PT_INDEX(addr, walker->level); table_gfn = gpte_to_gfn(pte); - pte_gpa = gfn_to_gpa(table_gfn); - pte_gpa += index * sizeof(pt_element_t); + offset = index * sizeof(pt_element_t); + pte_gpa = gfn_to_gpa(table_gfn) + offset; walker->table_gfn[walker->level - 1] = table_gfn; walker->pte_gpa[walker->level - 1] = pte_gpa; - if (kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte))) { + if (kvm_read_guest_page_mmu(vcpu, mmu, table_gfn, &pte, + offset, sizeof(pte), + PFERR_USER_MASK|PFERR_WRITE_MASK)) { present = false; break; } @@ -209,15 +213,27 @@ walk: is_large_pte(pte) && mmu->root_level == PT64_ROOT_LEVEL)) { int lvl = walker->level; + gpa_t real_gpa; + gfn_t gfn; - walker->gfn = gpte_to_gfn_lvl(pte, lvl); - walker->gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) - >> PAGE_SHIFT; + gfn = gpte_to_gfn_lvl(pte, lvl); + gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) >> PAGE_SHIFT; if (PTTYPE == 32 && walker->level == PT_DIRECTORY_LEVEL && is_cpuid_PSE36()) - walker->gfn += pse36_gfn_delta(pte); + gfn += pse36_gfn_delta(pte); + + access |= write_fault ? PFERR_WRITE_MASK : 0; + access |= fetch_fault ? PFERR_FETCH_MASK : 0; + access |= user_fault ? PFERR_USER_MASK : 0; + + real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn), + access); + if (real_gpa == UNMAPPED_GVA) + return 0; + + walker->gfn = real_gpa >> PAGE_SHIFT; break; } -- cgit v1.2.3-18-g5258 From 3d06b8bfd44ec421c386241f7c5af66c8200cbf4 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 10 Sep 2010 17:30:53 +0200 Subject: KVM: MMU: Introduce kvm_read_nested_guest_page() This patch introduces the kvm_read_guest_page_x86 function which reads from the physical memory of the guest. If the guest is running in guest-mode itself with nested paging enabled it will read from the guest's guest physical memory instead. The patch also changes changes the code to use this function where it is necessary. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a2efb70f4cc..46843ed36dc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -392,6 +392,13 @@ int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, } EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu); +int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, + void *data, int offset, int len, u32 access) +{ + return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn, + data, offset, len, access); +} + /* * Load the pae pdptrs. Return true is they are all valid. 
*/ @@ -403,8 +410,9 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) int ret; u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; - ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte, - offset * sizeof(u64), sizeof(pdpte)); + ret = kvm_read_nested_guest_page(vcpu, pdpt_gfn, pdpte, + offset * sizeof(u64), sizeof(pdpte), + PFERR_USER_MASK|PFERR_WRITE_MASK); if (ret < 0) { ret = 0; goto out; @@ -433,6 +441,8 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu) { u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; bool changed = true; + int offset; + gfn_t gfn; int r; if (is_long_mode(vcpu) || !is_pae(vcpu)) @@ -442,7 +452,10 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu) (unsigned long *)&vcpu->arch.regs_avail)) return true; - r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte)); + gfn = (vcpu->arch.cr3 & ~31u) >> PAGE_SHIFT; + offset = (vcpu->arch.cr3 & ~31u) & (PAGE_SIZE - 1); + r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte), + PFERR_USER_MASK | PFERR_WRITE_MASK); if (r < 0) goto out; changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0; -- cgit v1.2.3-18-g5258 From 02f59dc9f1f51d2148d87d48f84adb455a4fd697 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 10 Sep 2010 17:30:54 +0200 Subject: KVM: MMU: Introduce init_kvm_nested_mmu() This patch introduces the init_kvm_nested_mmu() function which is used to re-initialize the nested mmu when the l2 guest changes its paging mode. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 37 ++++++++++++++++++++++++++++++++++++- arch/x86/kvm/mmu.h | 1 + arch/x86/kvm/x86.c | 17 +++++++++++++++++ 3 files changed, 54 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 1e215e8b937..a26f13bd34e 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2784,11 +2784,46 @@ static int init_kvm_softmmu(struct kvm_vcpu *vcpu) return r; } +static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu *g_context = &vcpu->arch.nested_mmu; + + g_context->get_cr3 = get_cr3; + g_context->inject_page_fault = kvm_inject_page_fault; + + /* + * Note that arch.mmu.gva_to_gpa translates l2_gva to l1_gpa. The + * translation of l2_gpa to l1_gpa addresses is done using the + * arch.nested_mmu.gva_to_gpa function. Basically the gva_to_gpa + * functions between mmu and nested_mmu are swapped. 
+ */ + if (!is_paging(vcpu)) { + g_context->root_level = 0; + g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested; + } else if (is_long_mode(vcpu)) { + reset_rsvds_bits_mask(vcpu, g_context, PT64_ROOT_LEVEL); + g_context->root_level = PT64_ROOT_LEVEL; + g_context->gva_to_gpa = paging64_gva_to_gpa_nested; + } else if (is_pae(vcpu)) { + reset_rsvds_bits_mask(vcpu, g_context, PT32E_ROOT_LEVEL); + g_context->root_level = PT32E_ROOT_LEVEL; + g_context->gva_to_gpa = paging64_gva_to_gpa_nested; + } else { + reset_rsvds_bits_mask(vcpu, g_context, PT32_ROOT_LEVEL); + g_context->root_level = PT32_ROOT_LEVEL; + g_context->gva_to_gpa = paging32_gva_to_gpa_nested; + } + + return 0; +} + static int init_kvm_mmu(struct kvm_vcpu *vcpu) { vcpu->arch.update_pte.pfn = bad_pfn; - if (tdp_enabled) + if (mmu_is_nested(vcpu)) + return init_kvm_nested_mmu(vcpu); + else if (tdp_enabled) return init_kvm_tdp_mmu(vcpu); else return init_kvm_softmmu(vcpu); diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 7086ca85d3e..513abbb5ff4 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -47,6 +47,7 @@ #define PFERR_USER_MASK (1U << 2) #define PFERR_RSVD_MASK (1U << 3) #define PFERR_FETCH_MASK (1U << 4) +#define PFERR_NESTED_MASK (1U << 31) int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]); int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 46843ed36dc..e4c76bf8608 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3489,6 +3489,22 @@ static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access) return gpa; } +static gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access) +{ + gpa_t t_gpa; + u32 error; + + BUG_ON(!mmu_is_nested(vcpu)); + + /* NPT walks are always user-walks */ + access |= PFERR_USER_MASK; + t_gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, &error); + if (t_gpa == UNMAPPED_GVA) + vcpu->arch.fault.error_code |= PFERR_NESTED_MASK; + + return t_gpa; +} + gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) { u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; @@ -5704,6 +5720,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) vcpu->arch.walk_mmu = &vcpu->arch.mmu; vcpu->arch.mmu.root_hpa = INVALID_PAGE; vcpu->arch.mmu.translate_gpa = translate_gpa; + vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa; if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; else -- cgit v1.2.3-18-g5258 From d4f8cf664e4c1fd579df6b6e6378335c9f79d790 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 10 Sep 2010 17:30:55 +0200 Subject: KVM: MMU: Propagate the right fault back to the guest after gva_to_gpa This patch implements logic to make sure that either a page-fault/page-fault-vmexit or a nested-page-fault-vmexit is propagated back to the guest. 
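The rule being encoded here is small but easy to misread: a fault raised while resolving the nested (NPT) tables themselves is L1's problem and must surface as a nested-page-fault vmexit, while a fault raised by the L2 guest's own walk is delivered to L2 as a normal #PF. The PFERR_NESTED_MASK bit introduced just above is only an internal marker for which of the two happened. A compact model of that dispatch for the nested-paging-active case, with hypothetical callback names:

    #include <stdint.h>
    #include <stdbool.h>

    #define PFERR_NESTED (1u << 31)    /* internal marker, never shown to a guest */

    struct fault_model {
        uint32_t error_code;
        void (*inject_l1_npf_exit)(uint32_t error_code);   /* vmexit to L1 */
        void (*inject_l2_pf)(uint32_t error_code);         /* #PF into L2  */
    };

    static void propagate_fault(struct fault_model *f)
    {
        bool from_npt_walk = f->error_code & PFERR_NESTED;
        uint32_t ec = f->error_code & ~PFERR_NESTED;        /* strip marker */

        if (from_npt_walk)
            f->inject_l1_npf_exit(ec);   /* the nested translation failed  */
        else
            f->inject_l2_pf(ec);         /* the guest's own walk failed    */
    }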
Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/x86.c | 18 +++++++++++++++++- 2 files changed, 18 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 08bc383083f..574db6d1532 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -660,6 +660,7 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu); int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, gfn_t gfn, void *data, int offset, int len, u32 access); +void kvm_propagate_fault(struct kvm_vcpu *vcpu); bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl); int kvm_pic_set_irq(void *opaque, int irq, int level); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e4c76bf8608..0281d920e9e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -338,6 +338,22 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu) kvm_queue_exception_e(vcpu, PF_VECTOR, error_code); } +void kvm_propagate_fault(struct kvm_vcpu *vcpu) +{ + u32 nested, error; + + error = vcpu->arch.fault.error_code; + nested = error & PFERR_NESTED_MASK; + error = error & ~PFERR_NESTED_MASK; + + vcpu->arch.fault.error_code = error; + + if (mmu_is_nested(vcpu) && !nested) + vcpu->arch.nested_mmu.inject_page_fault(vcpu); + else + vcpu->arch.mmu.inject_page_fault(vcpu); +} + void kvm_inject_nmi(struct kvm_vcpu *vcpu) { vcpu->arch.nmi_pending = 1; @@ -4140,7 +4156,7 @@ static void inject_emulated_exception(struct kvm_vcpu *vcpu) { struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; if (ctxt->exception == PF_VECTOR) - kvm_inject_page_fault(vcpu); + kvm_propagate_fault(vcpu); else if (ctxt->error_code_valid) kvm_queue_exception_e(vcpu, ctxt->exception, ctxt->error_code); else -- cgit v1.2.3-18-g5258 From d47f00a62b2e14b4a811b87bdb9ea1809693a377 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 10 Sep 2010 17:30:56 +0200 Subject: KVM: X86: Propagate fetch faults KVM currently ignores fetch faults in the instruction emulator. With nested-npt we could have such faults. This patch adds the code to handle these. 
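A short sketch of the control flow being added: when instruction fetch during decode faults (which becomes possible once the fetch goes through a nested walk), emulation is abandoned and the pending fault is handed back to the guest instead of being silently dropped. Hypothetical names, mirroring the structure of the hunks that follow:

    enum emul_status { EMUL_CONTINUE, EMUL_FAULT };

    enum emul_status decode_insn(void);    /* may fault while fetching bytes */
    void inject_pending_fault(void);       /* #PF or nested-page-fault exit  */

    static void emulate_one(void)
    {
        if (decode_insn() == EMUL_FAULT) {
            inject_pending_fault();        /* propagate, do not emulate      */
            return;
        }
        /* ... execute the successfully decoded instruction ... */
    }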
Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 3 +++ arch/x86/kvm/x86.c | 4 ++++ 2 files changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 2b08b78b6ca..aead72e141b 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1198,6 +1198,9 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt, *(unsigned long *)dest = (ctxt->eflags & ~change_mask) | (val & change_mask); + if (rc == X86EMUL_PROPAGATE_FAULT) + emulate_pf(ctxt); + return rc; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0281d920e9e..3101060033a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4247,6 +4247,9 @@ int emulate_instruction(struct kvm_vcpu *vcpu, vcpu->arch.emulate_ctxt.perm_ok = false; r = x86_decode_insn(&vcpu->arch.emulate_ctxt); + if (r == X86EMUL_PROPAGATE_FAULT) + goto done; + trace_kvm_emulate_insn_start(vcpu); /* Only allow emulation of specific instructions on #UD @@ -4305,6 +4308,7 @@ restart: return handle_emulation_failure(vcpu); } +done: if (vcpu->arch.emulate_ctxt.exception >= 0) { inject_emulated_exception(vcpu); r = EMULATE_DONE; -- cgit v1.2.3-18-g5258 From ff03a073e715d49b5cfeeec862649b1df2481ae0 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 10 Sep 2010 17:30:57 +0200 Subject: KVM: MMU: Add kvm_mmu parameter to load_pdptrs function This function need to be able to load the pdptrs from any mmu context currently in use. So change this function to take an kvm_mmu parameter to fit these needs. As a side effect this patch also moves the cached pdptrs from vcpu_arch into the kvm_mmu struct. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 5 +++-- arch/x86/kvm/kvm_cache_regs.h | 2 +- arch/x86/kvm/svm.c | 2 +- arch/x86/kvm/vmx.c | 16 ++++++++-------- arch/x86/kvm/x86.c | 26 ++++++++++++++------------ 5 files changed, 27 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 574db6d1532..9e70de37654 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -257,6 +257,8 @@ struct kvm_mmu { u64 *pae_root; u64 rsvd_bits_mask[2][4]; + + u64 pdptrs[4]; /* pae */ }; struct kvm_vcpu_arch { @@ -276,7 +278,6 @@ struct kvm_vcpu_arch { unsigned long cr4_guest_owned_bits; unsigned long cr8; u32 hflags; - u64 pdptrs[4]; /* pae */ u64 efer; u64 apic_base; struct kvm_lapic *apic; /* kernel irqchip context */ @@ -592,7 +593,7 @@ void kvm_mmu_zap_all(struct kvm *kvm); unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm); void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages); -int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3); +int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3); int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, const void *val, int bytes); diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index 6491ac8e755..a37abe2ec39 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -42,7 +42,7 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index) (unsigned long *)&vcpu->arch.regs_avail)) kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR); - return vcpu->arch.pdptrs[index]; + return vcpu->arch.walk_mmu->pdptrs[index]; } static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 53c9039583f..ca711cb27a1 100644 
--- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1010,7 +1010,7 @@ static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) switch (reg) { case VCPU_EXREG_PDPTR: BUG_ON(!npt_enabled); - load_pdptrs(vcpu, vcpu->arch.cr3); + load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3); break; default: BUG(); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ff7a8d48fd2..1a7691a8717 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1842,20 +1842,20 @@ static void ept_load_pdptrs(struct kvm_vcpu *vcpu) return; if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { - vmcs_write64(GUEST_PDPTR0, vcpu->arch.pdptrs[0]); - vmcs_write64(GUEST_PDPTR1, vcpu->arch.pdptrs[1]); - vmcs_write64(GUEST_PDPTR2, vcpu->arch.pdptrs[2]); - vmcs_write64(GUEST_PDPTR3, vcpu->arch.pdptrs[3]); + vmcs_write64(GUEST_PDPTR0, vcpu->arch.mmu.pdptrs[0]); + vmcs_write64(GUEST_PDPTR1, vcpu->arch.mmu.pdptrs[1]); + vmcs_write64(GUEST_PDPTR2, vcpu->arch.mmu.pdptrs[2]); + vmcs_write64(GUEST_PDPTR3, vcpu->arch.mmu.pdptrs[3]); } } static void ept_save_pdptrs(struct kvm_vcpu *vcpu) { if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { - vcpu->arch.pdptrs[0] = vmcs_read64(GUEST_PDPTR0); - vcpu->arch.pdptrs[1] = vmcs_read64(GUEST_PDPTR1); - vcpu->arch.pdptrs[2] = vmcs_read64(GUEST_PDPTR2); - vcpu->arch.pdptrs[3] = vmcs_read64(GUEST_PDPTR3); + vcpu->arch.mmu.pdptrs[0] = vmcs_read64(GUEST_PDPTR0); + vcpu->arch.mmu.pdptrs[1] = vmcs_read64(GUEST_PDPTR1); + vcpu->arch.mmu.pdptrs[2] = vmcs_read64(GUEST_PDPTR2); + vcpu->arch.mmu.pdptrs[3] = vmcs_read64(GUEST_PDPTR3); } __set_bit(VCPU_EXREG_PDPTR, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3101060033a..bbd9f4af444 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -418,17 +418,17 @@ int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, /* * Load the pae pdptrs. Return true is they are all valid. 
*/ -int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) +int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3) { gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT; unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2; int i; int ret; - u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; + u64 pdpte[ARRAY_SIZE(mmu->pdptrs)]; - ret = kvm_read_nested_guest_page(vcpu, pdpt_gfn, pdpte, - offset * sizeof(u64), sizeof(pdpte), - PFERR_USER_MASK|PFERR_WRITE_MASK); + ret = kvm_read_guest_page_mmu(vcpu, mmu, pdpt_gfn, pdpte, + offset * sizeof(u64), sizeof(pdpte), + PFERR_USER_MASK|PFERR_WRITE_MASK); if (ret < 0) { ret = 0; goto out; @@ -442,7 +442,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) } ret = 1; - memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs)); + memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs)); __set_bit(VCPU_EXREG_PDPTR, (unsigned long *)&vcpu->arch.regs_avail); __set_bit(VCPU_EXREG_PDPTR, @@ -455,7 +455,7 @@ EXPORT_SYMBOL_GPL(load_pdptrs); static bool pdptrs_changed(struct kvm_vcpu *vcpu) { - u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; + u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)]; bool changed = true; int offset; gfn_t gfn; @@ -474,7 +474,7 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu) PFERR_USER_MASK | PFERR_WRITE_MASK); if (r < 0) goto out; - changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0; + changed = memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0; out: return changed; @@ -513,7 +513,8 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) return 1; } else #endif - if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) + if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, + vcpu->arch.cr3)) return 1; } @@ -602,7 +603,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) return 1; } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) && ((cr4 ^ old_cr4) & pdptr_bits) - && !load_pdptrs(vcpu, vcpu->arch.cr3)) + && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3)) return 1; if (cr4 & X86_CR4_VMXE) @@ -635,7 +636,8 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) if (is_pae(vcpu)) { if (cr3 & CR3_PAE_RESERVED_BITS) return 1; - if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) + if (is_paging(vcpu) && + !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) return 1; } /* @@ -5422,7 +5424,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; kvm_x86_ops->set_cr4(vcpu, sregs->cr4); if (!is_long_mode(vcpu) && is_pae(vcpu)) { - load_pdptrs(vcpu, vcpu->arch.cr3); + load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3); mmu_reset_needed = 1; } -- cgit v1.2.3-18-g5258 From d41d1895eb856b5d1c82f3be106b7a3e75e4216b Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 10 Sep 2010 17:30:58 +0200 Subject: KVM: MMU: Introduce kvm_pdptr_read_mmu This function is implemented to load the pdptr pointers of the currently running guest (l1 or l2 guest). Therefore it takes care about the current paging mode and can read pdptrs out of l2 guest physical memory. 
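The essence of the new accessor is that the PDPTEs are refreshed from whatever CR3 the given paging context reports before one of them is returned, so an L2 context transparently pulls them out of L2 guest-physical memory. A minimal sketch with stand-in names (the real code reuses load_pdptrs() and the context's get_cr3 hook, as the diff below shows):

    #include <stdint.h>

    struct mmu_model {
        uint64_t pdptrs[4];                /* cached PAE directory pointers */
        uint64_t (*get_cr3)(void);         /* CR3 as seen by this context   */
    };

    /* assumed helper: fills mmu->pdptrs from the table at cr3, going       */
    /* through the nested translation when the context requires it          */
    int load_pdptrs_model(struct mmu_model *mmu, uint64_t cr3);

    static uint64_t pdptr_read_mmu(struct mmu_model *mmu, unsigned index)
    {
        load_pdptrs_model(mmu, mmu->get_cr3());   /* refresh first */
        return mmu->pdptrs[index & 3];
    }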
Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/kvm_cache_regs.h | 7 +++++++ arch/x86/kvm/mmu.c | 2 +- arch/x86/kvm/paging_tmpl.h | 2 +- 3 files changed, 9 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index a37abe2ec39..975bb45329a 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -45,6 +45,13 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index) return vcpu->arch.walk_mmu->pdptrs[index]; } +static inline u64 kvm_pdptr_read_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, int index) +{ + load_pdptrs(vcpu, mmu, mmu->get_cr3(vcpu)); + + return mmu->pdptrs[index]; +} + static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask) { ulong tmask = mask & KVM_POSSIBLE_CR0_GUEST_BITS; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index a26f13bd34e..a25173a0d8b 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2398,7 +2398,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu) ASSERT(!VALID_PAGE(root)); if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { - pdptr = kvm_pdptr_read(vcpu, i); + pdptr = kvm_pdptr_read_mmu(vcpu, &vcpu->arch.mmu, i); if (!is_present_gpte(pdptr)) { vcpu->arch.mmu.pae_root[i] = 0; continue; diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index f4e09d341e2..a28f09bb76c 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -137,7 +137,7 @@ walk: #if PTTYPE == 64 if (walker->level == PT32E_ROOT_LEVEL) { - pte = kvm_pdptr_read(vcpu, (addr >> 30) & 3); + pte = kvm_pdptr_read_mmu(vcpu, mmu, (addr >> 30) & 3); trace_kvm_mmu_paging_element(pte, walker->level); if (!is_present_gpte(pte)) { present = false; -- cgit v1.2.3-18-g5258 From 651dd37a9ce6fdacdcd75da86619c62111efcbc2 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 10 Sep 2010 17:30:59 +0200 Subject: KVM: MMU: Refactor mmu_alloc_roots function This patch factors out the direct-mapping paths of the mmu_alloc_roots function into a seperate function. This makes it a lot easier to avoid all the unnecessary checks done in the shadow path which may break when running direct. In fact, this patch already fixes a problem when running PAE guests on a PAE shadow page table. 
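The refactoring amounts to a two-way dispatch: direct-mapped roots (TDP, or real mode) are built without consulting guest page tables at all, while shadow roots have to read and validate the guest's CR3 and PDPTEs. Keeping the paths apart removes the direct/root_gfn special cases from the shadow path. A minimal sketch of the dispatcher, with simplified stand-in names:

    struct vcpu_model {
        int direct_map;   /* nonzero when the context maps guest-physical 1:1 */
    };

    int alloc_direct_roots(struct vcpu_model *v);   /* no guest tables involved */
    int alloc_shadow_roots(struct vcpu_model *v);   /* walks guest CR3/PDPTEs   */

    static int alloc_roots(struct vcpu_model *v)
    {
        return v->direct_map ? alloc_direct_roots(v)
                             : alloc_shadow_roots(v);
    }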
Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 82 +++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 60 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index a25173a0d8b..9cd5a717ede 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2357,42 +2357,77 @@ static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn) return ret; } -static int mmu_alloc_roots(struct kvm_vcpu *vcpu) +static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu_page *sp; + int i; + + if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { + spin_lock(&vcpu->kvm->mmu_lock); + kvm_mmu_free_some_pages(vcpu); + sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_LEVEL, + 1, ACC_ALL, NULL); + ++sp->root_count; + spin_unlock(&vcpu->kvm->mmu_lock); + vcpu->arch.mmu.root_hpa = __pa(sp->spt); + } else if (vcpu->arch.mmu.shadow_root_level == PT32E_ROOT_LEVEL) { + for (i = 0; i < 4; ++i) { + hpa_t root = vcpu->arch.mmu.pae_root[i]; + + ASSERT(!VALID_PAGE(root)); + spin_lock(&vcpu->kvm->mmu_lock); + kvm_mmu_free_some_pages(vcpu); + sp = kvm_mmu_get_page(vcpu, i << 30, i << 30, + PT32_ROOT_LEVEL, 1, ACC_ALL, + NULL); + root = __pa(sp->spt); + ++sp->root_count; + spin_unlock(&vcpu->kvm->mmu_lock); + vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK; + vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); + } + } else + BUG(); + + return 0; +} + +static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) { int i; gfn_t root_gfn; struct kvm_mmu_page *sp; - int direct = 0; u64 pdptr; root_gfn = vcpu->arch.mmu.get_cr3(vcpu) >> PAGE_SHIFT; - if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { + if (mmu_check_root(vcpu, root_gfn)) + return 1; + + /* + * Do we shadow a long mode page table? If so we need to + * write-protect the guests page table root. + */ + if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) { hpa_t root = vcpu->arch.mmu.root_hpa; ASSERT(!VALID_PAGE(root)); - if (mmu_check_root(vcpu, root_gfn)) - return 1; - if (vcpu->arch.mmu.direct_map) { - direct = 1; - root_gfn = 0; - } + spin_lock(&vcpu->kvm->mmu_lock); kvm_mmu_free_some_pages(vcpu); - sp = kvm_mmu_get_page(vcpu, root_gfn, 0, - PT64_ROOT_LEVEL, direct, - ACC_ALL, NULL); + sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL, + 0, ACC_ALL, NULL); root = __pa(sp->spt); ++sp->root_count; spin_unlock(&vcpu->kvm->mmu_lock); vcpu->arch.mmu.root_hpa = root; return 0; } - direct = !is_paging(vcpu); - - if (mmu_check_root(vcpu, root_gfn)) - return 1; + /* + * We shadow a 32 bit page table. This may be a legacy 2-level + * or a PAE 3-level page table. 
+ */ for (i = 0; i < 4; ++i) { hpa_t root = vcpu->arch.mmu.pae_root[i]; @@ -2406,16 +2441,11 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu) root_gfn = pdptr >> PAGE_SHIFT; if (mmu_check_root(vcpu, root_gfn)) return 1; - } else if (vcpu->arch.mmu.root_level == 0) - root_gfn = 0; - if (vcpu->arch.mmu.direct_map) { - direct = 1; - root_gfn = i << 30; } spin_lock(&vcpu->kvm->mmu_lock); kvm_mmu_free_some_pages(vcpu); sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, - PT32_ROOT_LEVEL, direct, + PT32_ROOT_LEVEL, 0, ACC_ALL, NULL); root = __pa(sp->spt); ++sp->root_count; @@ -2427,6 +2457,14 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu) return 0; } +static int mmu_alloc_roots(struct kvm_vcpu *vcpu) +{ + if (vcpu->arch.mmu.direct_map) + return mmu_alloc_direct_roots(vcpu); + else + return mmu_alloc_shadow_roots(vcpu); +} + static void mmu_sync_roots(struct kvm_vcpu *vcpu) { int i; -- cgit v1.2.3-18-g5258 From 81407ca553c0c852b8cd3f38f3ec362d307f829b Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 10 Sep 2010 17:31:00 +0200 Subject: KVM: MMU: Allow long mode shadows for legacy page tables Currently the KVM softmmu implementation can not shadow a 32 bit legacy or PAE page table with a long mode page table. This is a required feature for nested paging emulation because the nested page table must alway be in host format. So this patch implements the missing pieces to allow long mode page tables for page table types. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/mmu.c | 60 +++++++++++++++++++++++++++++++++++------ 2 files changed, 53 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 9e70de37654..bd59b482f1a 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -256,6 +256,7 @@ struct kvm_mmu { bool direct_map; u64 *pae_root; + u64 *lm_root; u64 rsvd_bits_mask[2][4]; u64 pdptrs[4]; /* pae */ diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 9cd5a717ede..dd76765310c 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1504,6 +1504,12 @@ static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator, iterator->addr = addr; iterator->shadow_addr = vcpu->arch.mmu.root_hpa; iterator->level = vcpu->arch.mmu.shadow_root_level; + + if (iterator->level == PT64_ROOT_LEVEL && + vcpu->arch.mmu.root_level < PT64_ROOT_LEVEL && + !vcpu->arch.mmu.direct_map) + --iterator->level; + if (iterator->level == PT32E_ROOT_LEVEL) { iterator->shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; @@ -2314,7 +2320,9 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu) if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) return; spin_lock(&vcpu->kvm->mmu_lock); - if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { + if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL && + (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL || + vcpu->arch.mmu.direct_map)) { hpa_t root = vcpu->arch.mmu.root_hpa; sp = page_header(root); @@ -2394,10 +2402,10 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) { - int i; - gfn_t root_gfn; struct kvm_mmu_page *sp; - u64 pdptr; + u64 pdptr, pm_mask; + gfn_t root_gfn; + int i; root_gfn = vcpu->arch.mmu.get_cr3(vcpu) >> PAGE_SHIFT; @@ -2426,8 +2434,13 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) /* * We shadow a 32 bit page table. This may be a legacy 2-level - * or a PAE 3-level page table. 
+ * or a PAE 3-level page table. In either case we need to be aware that + * the shadow page table may be a PAE or a long mode page table. */ + pm_mask = PT_PRESENT_MASK; + if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) + pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK; + for (i = 0; i < 4; ++i) { hpa_t root = vcpu->arch.mmu.pae_root[i]; @@ -2451,9 +2464,35 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) ++sp->root_count; spin_unlock(&vcpu->kvm->mmu_lock); - vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK; + vcpu->arch.mmu.pae_root[i] = root | pm_mask; + vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); } - vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); + + /* + * If we shadow a 32 bit page table with a long mode page + * table we enter this path. + */ + if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { + if (vcpu->arch.mmu.lm_root == NULL) { + /* + * The additional page necessary for this is only + * allocated on demand. + */ + + u64 *lm_root; + + lm_root = (void*)get_zeroed_page(GFP_KERNEL); + if (lm_root == NULL) + return 1; + + lm_root[0] = __pa(vcpu->arch.mmu.pae_root) | pm_mask; + + vcpu->arch.mmu.lm_root = lm_root; + } + + vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.lm_root); + } + return 0; } @@ -2470,9 +2509,12 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu) int i; struct kvm_mmu_page *sp; + if (vcpu->arch.mmu.direct_map) + return; + if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) return; - if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { + if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) { hpa_t root = vcpu->arch.mmu.root_hpa; sp = page_header(root); mmu_sync_children(vcpu, sp); @@ -3253,6 +3295,8 @@ EXPORT_SYMBOL_GPL(kvm_disable_tdp); static void free_mmu_pages(struct kvm_vcpu *vcpu) { free_page((unsigned long)vcpu->arch.mmu.pae_root); + if (vcpu->arch.mmu.lm_root != NULL) + free_page((unsigned long)vcpu->arch.mmu.lm_root); } static int alloc_mmu_pages(struct kvm_vcpu *vcpu) -- cgit v1.2.3-18-g5258 From 2d48a985c7bbcd72b4e92e301ea96bf1252ffc61 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 10 Sep 2010 17:31:01 +0200 Subject: KVM: MMU: Track NX state in struct kvm_mmu With Nested Paging emulation the NX state between the two MMU contexts may differ. To make sure that always the right fault error code is recorded this patch moves the NX state into struct kvm_mmu so that the code can distinguish between L1 and L2 NX state. 
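Why the flag has to live in struct kvm_mmu rather than being derived from EFER on the fly: with nested NPT the walk context (L2's paging) and the fault context (L1's nested paging) can have different NX settings, and both the reserved-bit masks and the reported error code must use the setting of the context being walked. A small stand-alone illustration of the reserved-bit side of that, using simplified names:

    #include <stdint.h>
    #include <stdbool.h>

    struct mmu_ctx {
        bool nx;                      /* NX as seen by *this* paging context */
        uint64_t exb_rsvd;            /* execute-bit reserved mask           */
    };

    static uint64_t rsvd_bits(int hi, int lo)
    {
        return ((1ULL << (hi - lo + 1)) - 1) << lo;
    }

    static void reset_rsvd_mask(struct mmu_ctx *ctx)
    {
        /* bit 63 of a pte is reserved only when NX is off in this
         * context; L1 and L2 may disagree, hence the per-context flag */
        ctx->exb_rsvd = ctx->nx ? 0 : rsvd_bits(63, 63);
    }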
Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/mmu.c | 16 +++++++++++++++- arch/x86/kvm/paging_tmpl.h | 4 ++-- 3 files changed, 19 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index bd59b482f1a..b43686a4487 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -259,6 +259,8 @@ struct kvm_mmu { u64 *lm_root; u64 rsvd_bits_mask[2][4]; + bool nx; + u64 pdptrs[4]; /* pae */ }; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index dd76765310c..95cbeed74cf 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2634,6 +2634,7 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu, context->shadow_root_level = PT32E_ROOT_LEVEL; context->root_hpa = INVALID_PAGE; context->direct_map = true; + context->nx = false; return 0; } @@ -2687,7 +2688,7 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int maxphyaddr = cpuid_maxphyaddr(vcpu); u64 exb_bit_rsvd = 0; - if (!is_nx(vcpu)) + if (!context->nx) exb_bit_rsvd = rsvd_bits(63, 63); switch (level) { case PT32_ROOT_LEVEL: @@ -2746,6 +2747,8 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, struct kvm_mmu *context, int level) { + context->nx = is_nx(vcpu); + reset_rsvds_bits_mask(vcpu, context, level); ASSERT(is_pae(vcpu)); @@ -2772,6 +2775,8 @@ static int paging64_init_context(struct kvm_vcpu *vcpu, static int paging32_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *context) { + context->nx = false; + reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL); context->new_cr3 = paging_new_cr3; @@ -2810,19 +2815,24 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->set_cr3 = kvm_x86_ops->set_tdp_cr3; context->get_cr3 = get_cr3; context->inject_page_fault = kvm_inject_page_fault; + context->nx = is_nx(vcpu); if (!is_paging(vcpu)) { + context->nx = false; context->gva_to_gpa = nonpaging_gva_to_gpa; context->root_level = 0; } else if (is_long_mode(vcpu)) { + context->nx = is_nx(vcpu); reset_rsvds_bits_mask(vcpu, context, PT64_ROOT_LEVEL); context->gva_to_gpa = paging64_gva_to_gpa; context->root_level = PT64_ROOT_LEVEL; } else if (is_pae(vcpu)) { + context->nx = is_nx(vcpu); reset_rsvds_bits_mask(vcpu, context, PT32E_ROOT_LEVEL); context->gva_to_gpa = paging64_gva_to_gpa; context->root_level = PT32E_ROOT_LEVEL; } else { + context->nx = false; reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL); context->gva_to_gpa = paging32_gva_to_gpa; context->root_level = PT32_ROOT_LEVEL; @@ -2878,17 +2888,21 @@ static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu) * functions between mmu and nested_mmu are swapped. 
*/ if (!is_paging(vcpu)) { + g_context->nx = false; g_context->root_level = 0; g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested; } else if (is_long_mode(vcpu)) { + g_context->nx = is_nx(vcpu); reset_rsvds_bits_mask(vcpu, g_context, PT64_ROOT_LEVEL); g_context->root_level = PT64_ROOT_LEVEL; g_context->gva_to_gpa = paging64_gva_to_gpa_nested; } else if (is_pae(vcpu)) { + g_context->nx = is_nx(vcpu); reset_rsvds_bits_mask(vcpu, g_context, PT32E_ROOT_LEVEL); g_context->root_level = PT32E_ROOT_LEVEL; g_context->gva_to_gpa = paging64_gva_to_gpa_nested; } else { + g_context->nx = false; reset_rsvds_bits_mask(vcpu, g_context, PT32_ROOT_LEVEL); g_context->root_level = PT32_ROOT_LEVEL; g_context->gva_to_gpa = paging32_gva_to_gpa_nested; diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index a28f09bb76c..2bdd843ad63 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -105,7 +105,7 @@ static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte) access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK; #if PTTYPE == 64 - if (is_nx(vcpu)) + if (vcpu->arch.mmu.nx) access &= ~(gpte >> PT64_NX_SHIFT); #endif return access; @@ -272,7 +272,7 @@ error: walker->error_code |= PFERR_WRITE_MASK; if (user_fault) walker->error_code |= PFERR_USER_MASK; - if (fetch_fault && is_nx(vcpu)) + if (fetch_fault && mmu->nx) walker->error_code |= PFERR_FETCH_MASK; if (rsvd_fault) walker->error_code |= PFERR_RSVD_MASK; -- cgit v1.2.3-18-g5258 From 5bd2edc341d11af175e759a546e4335ba3e0584f Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 10 Sep 2010 17:31:02 +0200 Subject: KVM: SVM: Implement MMU helper functions for Nested Nested Paging This patch adds the helper functions which will be used in the mmu context for handling nested nested page faults. 
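These helpers are the glue between the generic MMU code and the SVM data structures: the nested MMU's CR3 is simply the hCR3 value L1 put into its VMCB, and a fault in the emulated nested page table is turned into a #NPF vmexit by filling in the exit fields and returning to L1. A simplified model of the two interesting callbacks (stand-in types; SVM_EXIT_NPF is the architectural exit code, everything else here is made up):

    #include <stdint.h>

    #define SVM_EXIT_NPF 0x400u           /* nested-page-fault exit code  */

    struct nested_model {
        uint64_t nested_cr3;              /* hCR3 supplied by L1's VMCB   */
    };

    struct exit_model {
        uint64_t exit_code;
        uint64_t exit_info_1;             /* page-fault error code        */
        uint64_t exit_info_2;             /* faulting guest-physical addr */
    };

    static uint64_t nested_get_cr3(struct nested_model *n)
    {
        return n->nested_cr3;             /* root of the L1-provided NPT  */
    }

    static void nested_inject_npf(struct exit_model *e,
                                  uint64_t error_code, uint64_t address)
    {
        e->exit_code   = SVM_EXIT_NPF;
        e->exit_info_1 = error_code;
        e->exit_info_2 = address;
        /* ... followed by the actual #VMEXIT back into L1 ... */
    }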
Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index ca711cb27a1..9a9a4405b57 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -104,6 +104,8 @@ struct nested_state { u32 intercept_exceptions; u64 intercept; + /* Nested Paging related state */ + u64 nested_cr3; }; #define MSRPM_OFFSETS 16 @@ -1600,6 +1602,34 @@ static int vmmcall_interception(struct vcpu_svm *svm) return 1; } +static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + return svm->nested.nested_cr3; +} + +static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu, + unsigned long root) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + svm->vmcb->control.nested_cr3 = root; + force_new_asid(vcpu); +} + +static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + svm->vmcb->control.exit_code = SVM_EXIT_NPF; + svm->vmcb->control.exit_code_hi = 0; + svm->vmcb->control.exit_info_1 = vcpu->arch.fault.error_code; + svm->vmcb->control.exit_info_2 = vcpu->arch.fault.address; + + nested_svm_vmexit(svm); +} + static int nested_svm_check_permissions(struct vcpu_svm *svm) { if (!(svm->vcpu.arch.efer & EFER_SVME) -- cgit v1.2.3-18-g5258 From 4b16184c1ccafa4b0c188c622ea532fb90e6f5b0 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 10 Sep 2010 17:31:03 +0200 Subject: KVM: SVM: Initialize Nested Nested MMU context on VMRUN This patch adds code to initialize the Nested Nested Paging MMU context when the L1 guest executes a VMRUN instruction and has nested paging enabled in its VMCB. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 1 + arch/x86/kvm/svm.c | 50 +++++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 42 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 95cbeed74cf..6e248d80e35 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2962,6 +2962,7 @@ void kvm_mmu_unload(struct kvm_vcpu *vcpu) { mmu_free_roots(vcpu); } +EXPORT_SYMBOL_GPL(kvm_mmu_unload); static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 9a9a4405b57..3184772dedf 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -294,6 +294,15 @@ static inline void flush_guest_tlb(struct kvm_vcpu *vcpu) force_new_asid(vcpu); } +static int get_npt_level(void) +{ +#ifdef CONFIG_X86_64 + return PT64_ROOT_LEVEL; +#else + return PT32E_ROOT_LEVEL; +#endif +} + static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) { vcpu->arch.efer = efer; @@ -1630,6 +1639,26 @@ static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu) nested_svm_vmexit(svm); } +static int nested_svm_init_mmu_context(struct kvm_vcpu *vcpu) +{ + int r; + + r = kvm_init_shadow_mmu(vcpu, &vcpu->arch.mmu); + + vcpu->arch.mmu.set_cr3 = nested_svm_set_tdp_cr3; + vcpu->arch.mmu.get_cr3 = nested_svm_get_tdp_cr3; + vcpu->arch.mmu.inject_page_fault = nested_svm_inject_npf_exit; + vcpu->arch.mmu.shadow_root_level = get_npt_level(); + vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; + + return r; +} + +static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu) +{ + vcpu->arch.walk_mmu = &vcpu->arch.mmu; +} + static int nested_svm_check_permissions(struct vcpu_svm *svm) { if (!(svm->vcpu.arch.efer & EFER_SVME) @@ -1998,6 +2027,8 @@ 
static int nested_svm_vmexit(struct vcpu_svm *svm) kvm_clear_exception_queue(&svm->vcpu); kvm_clear_interrupt_queue(&svm->vcpu); + svm->nested.nested_cr3 = 0; + /* Restore selected save entries */ svm->vmcb->save.es = hsave->save.es; svm->vmcb->save.cs = hsave->save.cs; @@ -2024,6 +2055,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) nested_svm_unmap(page); + nested_svm_uninit_mmu_context(&svm->vcpu); kvm_mmu_reset_context(&svm->vcpu); kvm_mmu_load(&svm->vcpu); @@ -2071,6 +2103,9 @@ static bool nested_vmcb_checks(struct vmcb *vmcb) if (vmcb->control.asid == 0) return false; + if (vmcb->control.nested_ctl && !npt_enabled) + return false; + return true; } @@ -2143,6 +2178,12 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) else svm->vcpu.arch.hflags &= ~HF_HIF_MASK; + if (nested_vmcb->control.nested_ctl) { + kvm_mmu_unload(&svm->vcpu); + svm->nested.nested_cr3 = nested_vmcb->control.nested_cr3; + nested_svm_init_mmu_context(&svm->vcpu); + } + /* Load the nested guest state */ svm->vmcb->save.es = nested_vmcb->save.es; svm->vmcb->save.cs = nested_vmcb->save.cs; @@ -3415,15 +3456,6 @@ static bool svm_cpu_has_accelerated_tpr(void) return false; } -static int get_npt_level(void) -{ -#ifdef CONFIG_X86_64 - return PT64_ROOT_LEVEL; -#else - return PT32E_ROOT_LEVEL; -#endif -} - static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) { return 0; -- cgit v1.2.3-18-g5258 From 55c5e464fcc28ee763d40561abf2b259131dd703 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 10 Sep 2010 17:31:04 +0200 Subject: KVM: SVM: Expect two more candiates for exit_int_info This patch adds INTR and NMI intercepts to the list of expected intercepts with an exit_int_info set. While this can't happen on bare metal it is architectural legal and may happen with KVMs SVM emulation. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 3184772dedf..de1930ee2ab 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2991,7 +2991,8 @@ static int handle_exit(struct kvm_vcpu *vcpu) if (is_external_interrupt(svm->vmcb->control.exit_int_info) && exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR && - exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH) + exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH && + exit_code != SVM_EXIT_INTR && exit_code != SVM_EXIT_NMI) printk(KERN_ERR "%s: unexpected exit_ini_info 0x%x " "exit_code 0x%x\n", __func__, svm->vmcb->control.exit_int_info, -- cgit v1.2.3-18-g5258 From 3d4aeaad8bb8f8084a414819934b73ab49c26c92 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 10 Sep 2010 17:31:05 +0200 Subject: KVM: SVM: Report Nested Paging support to userspace This patch implements the reporting of the nested paging feature support to userspace. 
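The reporting itself is a single CPUID bit: leaf 0x8000000A (the SVM feature leaf) has its nested-paging flag set only when the host actually has NPT enabled, so an L1 hypervisor can discover the capability. A minimal sketch, assuming the NP flag is bit 0 of EDX in that leaf:

    #include <stdint.h>

    struct cpuid_entry {
        uint32_t function;
        uint32_t eax, ebx, ecx, edx;
    };

    static void fill_svm_feature_leaf(struct cpuid_entry *e, int npt_enabled)
    {
        if (e->function != 0x8000000Au)
            return;
        if (npt_enabled)
            e->edx |= 1u << 0;   /* advertise NP only if the host can back it */
    }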
Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index de1930ee2ab..36e6c88913d 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3481,6 +3481,10 @@ static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) if (svm_has(SVM_FEATURE_NRIP)) entry->edx |= SVM_FEATURE_NRIP; + /* Support NPT for the guest if enabled */ + if (npt_enabled) + entry->edx |= SVM_FEATURE_NPT; + break; } } -- cgit v1.2.3-18-g5258 From 4c62a2dc92518c5adf434df8e5c2283c6762672a Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 10 Sep 2010 17:31:06 +0200 Subject: KVM: X86: Report SVM bit to userspace only when supported This patch fixes a bug in KVM where it _always_ reports the support of the SVM feature to userspace. But KVM only supports SVM on AMD hardware and only when it is enabled in the kernel module. This patch fixes the wrong reporting. Cc: stable@kernel.org Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 4 ++++ arch/x86/kvm/x86.c | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 36e6c88913d..e0f4da07f98 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3469,6 +3469,10 @@ static void svm_cpuid_update(struct kvm_vcpu *vcpu) static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) { switch (func) { + case 0x80000001: + if (nested) + entry->ecx |= (1 << 2); /* Set SVM bit */ + break; case 0x8000000A: entry->eax = 1; /* SVM revision 1 */ entry->ebx = 8; /* Lets support 8 ASIDs in case we add proper diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bbd9f4af444..3ff0a8ff275 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2209,7 +2209,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, 0 /* Reserved, AES */ | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX); /* cpuid 0x80000001.ecx */ const u32 kvm_supported_word6_x86_features = - F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ | + F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ | F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(SSE5) | 0 /* SKINIT */ | 0 /* WDT */; -- cgit v1.2.3-18-g5258 From b0bc3ee2b54fcea0df42cc9aa05103b1ccd89db0 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 13 Sep 2010 16:45:28 +0200 Subject: KVM: MMU: Fix regression with ept memory types merged into non-ept page tables Commit "KVM: MMU: Make tdp_enabled a mmu-context parameter" made real-mode set ->direct_map, and changed the code that merges in the memory type depend on direct_map instead of tdp_enabled. However, in this case what really matters is tdp, not direct_map, since tdp changes the pte format regardless of whether the mapping is direct or not. As a result, real-mode shadow mappings got corrupted with ept memory types. The result was a huge slowdown, likely due to the cache being disabled. Change it back as the simplest fix for the regression (real fix is to move all that to vmx code, and not use tdp_enabled as a synonym for ept). 
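The crux of the regression is that direct_map and tdp_enabled are no longer synonyms: a real-mode shadow context is direct-mapped but still produces ordinary (non-EPT) ptes, so gating the EPT memory-type bits on direct_map smuggled EPT-only encoding into normal page tables. A small sketch of the corrected condition (simplified spte construction, not the kernel code):

    #include <stdint.h>
    #include <stdbool.h>

    static bool tdp_enabled;   /* global: are EPT/NPT-format ptes in use at all? */

    static uint64_t make_spte(uint64_t spte, bool ctx_direct_map, uint64_t mt_bits)
    {
        (void)ctx_direct_map;  /* deliberately not the deciding factor           */

        /* the memory-type bits are part of the EPT pte *format*, so only
         * the global tdp_enabled flag may switch them on                        */
        if (tdp_enabled)
            spte |= mt_bits;
        return spte;
    }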
Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 6e248d80e35..3ce56bfe056 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1980,7 +1980,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, spte |= shadow_user_mask; if (level > PT_PAGE_TABLE_LEVEL) spte |= PT_PAGE_SIZE_MASK; - if (vcpu->arch.mmu.direct_map) + if (tdp_enabled) spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn, kvm_is_mmio_pfn(pfn)); -- cgit v1.2.3-18-g5258 From 3842d135ff246b6543f1df77f5600e12094a6845 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 27 Jul 2010 12:30:24 +0300 Subject: KVM: Check for pending events before attempting injection Instead of blindly attempting to inject an event before each guest entry, check for a possible event first in vcpu->requests. Sites that can trigger event injection are modified to set KVM_REQ_EVENT: - interrupt, nmi window opening - ppr updates - i8259 output changes - local apic irr changes - rflags updates - gif flag set - event set on exit This improves non-injecting entry performance, and sets the stage for non-atomic injection. Signed-off-by: Avi Kivity --- arch/x86/kvm/i8259.c | 1 + arch/x86/kvm/lapic.c | 13 +++++++++++-- arch/x86/kvm/svm.c | 8 +++++++- arch/x86/kvm/vmx.c | 6 ++++++ arch/x86/kvm/x86.c | 41 ++++++++++++++++++++++++++++++++--------- 5 files changed, 57 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 6e77471951e..ab1bb8ff9a8 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -67,6 +67,7 @@ static void pic_unlock(struct kvm_pic *s) if (!found) return; + kvm_make_request(KVM_REQ_EVENT, found); kvm_vcpu_kick(found); } } diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 77d8c0f4817..c6f2f159384 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -259,9 +259,10 @@ static inline int apic_find_highest_isr(struct kvm_lapic *apic) static void apic_update_ppr(struct kvm_lapic *apic) { - u32 tpr, isrv, ppr; + u32 tpr, isrv, ppr, old_ppr; int isr; + old_ppr = apic_get_reg(apic, APIC_PROCPRI); tpr = apic_get_reg(apic, APIC_TASKPRI); isr = apic_find_highest_isr(apic); isrv = (isr != -1) ? 
isr : 0; @@ -274,7 +275,10 @@ static void apic_update_ppr(struct kvm_lapic *apic) apic_debug("vlapic %p, ppr 0x%x, isr 0x%x, isrv 0x%x", apic, ppr, isr, isrv); - apic_set_reg(apic, APIC_PROCPRI, ppr); + if (old_ppr != ppr) { + apic_set_reg(apic, APIC_PROCPRI, ppr); + kvm_make_request(KVM_REQ_EVENT, apic->vcpu); + } } static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr) @@ -391,6 +395,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, break; } + kvm_make_request(KVM_REQ_EVENT, vcpu); kvm_vcpu_kick(vcpu); break; @@ -416,6 +421,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, "INIT on a runnable vcpu %d\n", vcpu->vcpu_id); vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; + kvm_make_request(KVM_REQ_EVENT, vcpu); kvm_vcpu_kick(vcpu); } else { apic_debug("Ignoring de-assert INIT to vcpu %d\n", @@ -430,6 +436,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, result = 1; vcpu->arch.sipi_vector = vector; vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED; + kvm_make_request(KVM_REQ_EVENT, vcpu); kvm_vcpu_kick(vcpu); } break; @@ -475,6 +482,7 @@ static void apic_set_eoi(struct kvm_lapic *apic) trigger_mode = IOAPIC_EDGE_TRIG; if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI)) kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); + kvm_make_request(KVM_REQ_EVENT, apic->vcpu); } static void apic_send_ipi(struct kvm_lapic *apic) @@ -1152,6 +1160,7 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu) update_divide_count(apic); start_apic_timer(apic); apic->irr_pending = true; + kvm_make_request(KVM_REQ_EVENT, vcpu); } void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index e0f4da07f98..1d2ea65d353 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2371,6 +2371,7 @@ static int stgi_interception(struct vcpu_svm *svm) svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; skip_emulated_instruction(&svm->vcpu); + kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); enable_gif(svm); @@ -2763,6 +2764,7 @@ static int interrupt_window_interception(struct vcpu_svm *svm) { struct kvm_run *kvm_run = svm->vcpu.run; + kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); svm_clear_vintr(svm); svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; /* @@ -3209,8 +3211,10 @@ static void svm_complete_interrupts(struct vcpu_svm *svm) svm->int3_injected = 0; - if (svm->vcpu.arch.hflags & HF_IRET_MASK) + if (svm->vcpu.arch.hflags & HF_IRET_MASK) { svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK); + kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); + } svm->vcpu.arch.nmi_injected = false; kvm_clear_exception_queue(&svm->vcpu); @@ -3219,6 +3223,8 @@ static void svm_complete_interrupts(struct vcpu_svm *svm) if (!(exitintinfo & SVM_EXITINTINFO_VALID)) return; + kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); + vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK; type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 1a7691a8717..2ce2e0b13ed 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3327,6 +3327,7 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu) static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu) { + kvm_make_request(KVM_REQ_EVENT, vcpu); return 1; } @@ -3339,6 +3340,8 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu) cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); + kvm_make_request(KVM_REQ_EVENT, vcpu); + 
++vcpu->stat.irq_window_exits; /* @@ -3595,6 +3598,7 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu) cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING; vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); ++vcpu->stat.nmi_window_exits; + kvm_make_request(KVM_REQ_EVENT, vcpu); return 1; } @@ -3828,6 +3832,8 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx) if (!idtv_info_valid) return; + kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu); + vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK; type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3ff0a8ff275..e7198036db6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -284,6 +284,8 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu, u32 prev_nr; int class1, class2; + kvm_make_request(KVM_REQ_EVENT, vcpu); + if (!vcpu->arch.exception.pending) { queue: vcpu->arch.exception.pending = true; @@ -356,6 +358,7 @@ void kvm_propagate_fault(struct kvm_vcpu *vcpu) void kvm_inject_nmi(struct kvm_vcpu *vcpu) { + kvm_make_request(KVM_REQ_EVENT, vcpu); vcpu->arch.nmi_pending = 1; } EXPORT_SYMBOL_GPL(kvm_inject_nmi); @@ -2418,6 +2421,7 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, return -ENXIO; kvm_queue_interrupt(vcpu, irq->irq, false); + kvm_make_request(KVM_REQ_EVENT, vcpu); return 0; } @@ -2571,6 +2575,8 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR) vcpu->arch.sipi_vector = events->sipi_vector; + kvm_make_request(KVM_REQ_EVENT, vcpu); + return 0; } @@ -4329,6 +4335,7 @@ done: toggle_interruptibility(vcpu, vcpu->arch.emulate_ctxt.interruptibility); kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); + kvm_make_request(KVM_REQ_EVENT, vcpu); memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); @@ -4998,6 +5005,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) int r; bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && vcpu->run->request_interrupt_window; + bool req_event; if (vcpu->requests) { if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) @@ -5045,8 +5053,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) local_irq_disable(); + req_event = kvm_check_request(KVM_REQ_EVENT, vcpu); + if (!atomic_read(&vcpu->guest_mode) || vcpu->requests || need_resched() || signal_pending(current)) { + if (req_event) + kvm_make_request(KVM_REQ_EVENT, vcpu); atomic_set(&vcpu->guest_mode, 0); smp_wmb(); local_irq_enable(); @@ -5055,17 +5067,19 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) goto out; } - inject_pending_event(vcpu); + if (req_event || req_int_win) { + inject_pending_event(vcpu); - /* enable NMI/IRQ window open exits if needed */ - if (vcpu->arch.nmi_pending) - kvm_x86_ops->enable_nmi_window(vcpu); - else if (kvm_cpu_has_interrupt(vcpu) || req_int_win) - kvm_x86_ops->enable_irq_window(vcpu); + /* enable NMI/IRQ window open exits if needed */ + if (vcpu->arch.nmi_pending) + kvm_x86_ops->enable_nmi_window(vcpu); + else if (kvm_cpu_has_interrupt(vcpu) || req_int_win) + kvm_x86_ops->enable_irq_window(vcpu); - if (kvm_lapic_enabled(vcpu)) { - update_cr8_intercept(vcpu); - kvm_lapic_sync_to_vapic(vcpu); + if (kvm_lapic_enabled(vcpu)) { + update_cr8_intercept(vcpu); + kvm_lapic_sync_to_vapic(vcpu); + } } srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); @@ -5305,6 +5319,8 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 
vcpu->arch.exception.pending = false; + kvm_make_request(KVM_REQ_EVENT, vcpu); + return 0; } @@ -5368,6 +5384,7 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { vcpu->arch.mp_state = mp_state->mp_state; + kvm_make_request(KVM_REQ_EVENT, vcpu); return 0; } @@ -5389,6 +5406,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); + kvm_make_request(KVM_REQ_EVENT, vcpu); return EMULATE_DONE; } EXPORT_SYMBOL_GPL(kvm_task_switch); @@ -5459,6 +5477,8 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, !is_protmode(vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + kvm_make_request(KVM_REQ_EVENT, vcpu); + return 0; } @@ -5691,6 +5711,8 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) vcpu->arch.dr6 = DR6_FIXED_1; vcpu->arch.dr7 = DR7_FIXED_1; + kvm_make_request(KVM_REQ_EVENT, vcpu); + return kvm_x86_ops->vcpu_reset(vcpu); } @@ -6001,6 +6023,7 @@ void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip)) rflags |= X86_EFLAGS_TF; kvm_x86_ops->set_rflags(vcpu, rflags); + kvm_make_request(KVM_REQ_EVENT, vcpu); } EXPORT_SYMBOL_GPL(kvm_set_rflags); -- cgit v1.2.3-18-g5258 From 51aa01d13d4a64422cf8095205fc4a02322aca2c Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 20 Jul 2010 14:31:20 +0300 Subject: KVM: VMX: Split up vmx_complete_interrupts() vmx_complete_interrupts() does too much, split it up: - vmx_vcpu_run() gets the "cache important vmcs fields" part - a new vmx_complete_atomic_exit() gets the parts that must be done atomically - a new vmx_recover_nmi_blocking() does what its name says - vmx_complete_interrupts() retains the event injection recovery code This helps in reducing the work done in atomic context. 
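The split is mostly about when each piece is allowed to run: reading the exit fields and handling machine checks or host NMIs must stay in the atomic section right after vmexit, while re-queueing interrupted event injection can safely happen later. A sketch of the resulting call order, with stand-in names mirroring the new functions:

    struct vmx_model {
        unsigned int exit_reason;
        unsigned int exit_intr_info;
        unsigned int idt_vectoring_info;
    };

    void complete_atomic_exit(struct vmx_model *v);   /* MCE / host NMI handling  */
    void recover_nmi_blocking(struct vmx_model *v);   /* virtual-NMI bookkeeping  */
    void complete_interrupts(struct vmx_model *v);    /* re-queue lost injections */

    static void after_vmexit(struct vmx_model *v)
    {
        /* the VMCS fields are cached once, while still on the atomic path:
         * v->exit_reason, v->exit_intr_info, v->idt_vectoring_info          */

        complete_atomic_exit(v);     /* must remain atomic                   */
        recover_nmi_blocking(v);
        complete_interrupts(v);      /* candidate for moving out of the
                                        interrupts-disabled section later    */
    }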
Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 39 +++++++++++++++++++++++++++------------ 1 file changed, 27 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 2ce2e0b13ed..927d8404505 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -125,6 +125,7 @@ struct vcpu_vmx { unsigned long host_rsp; int launched; u8 fail; + u32 exit_intr_info; u32 idt_vectoring_info; struct shared_msr_entry *guest_msrs; int nmsrs; @@ -3775,18 +3776,9 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) vmcs_write32(TPR_THRESHOLD, irr); } -static void vmx_complete_interrupts(struct vcpu_vmx *vmx) +static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) { - u32 exit_intr_info; - u32 idt_vectoring_info = vmx->idt_vectoring_info; - bool unblock_nmi; - u8 vector; - int type; - bool idtv_info_valid; - - exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); - - vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); + u32 exit_intr_info = vmx->exit_intr_info; /* Handle machine checks before interrupts are enabled */ if ((vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY) @@ -3801,8 +3793,16 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx) asm("int $2"); kvm_after_handle_nmi(&vmx->vcpu); } +} - idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; +static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) +{ + u32 exit_intr_info = vmx->exit_intr_info; + bool unblock_nmi; + u8 vector; + bool idtv_info_valid; + + idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK; if (cpu_has_virtual_nmis()) { unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; @@ -3824,6 +3824,16 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx) } else if (unlikely(vmx->soft_vnmi_blocked)) vmx->vnmi_blocked_time += ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time)); +} + +static void vmx_complete_interrupts(struct vcpu_vmx *vmx) +{ + u32 idt_vectoring_info = vmx->idt_vectoring_info; + u8 vector; + int type; + bool idtv_info_valid; + + idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; vmx->vcpu.arch.nmi_injected = false; kvm_clear_exception_queue(&vmx->vcpu); @@ -4036,6 +4046,11 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); vmx->launched = 1; + vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); + vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + + vmx_complete_atomic_exit(vmx); + vmx_recover_nmi_blocking(vmx); vmx_complete_interrupts(vmx); } -- cgit v1.2.3-18-g5258 From 537b37e2674b7e4390a490e03cae53ca9ca99e30 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 22 Jul 2010 12:54:21 +0300 Subject: KVM: VMX: Move real-mode interrupt injection fixup to vmx_complete_interrupts() This allows reuse of vmx_complete_interrupts() for cancelling injections. 
Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 927d8404505..541f0d2412b 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -182,6 +182,7 @@ static int init_rmode(struct kvm *kvm); static u64 construct_eptp(unsigned long root_hpa); static void kvm_cpu_vmxon(u64 addr); static void kvm_cpu_vmxoff(void); +static void fixup_rmode_irq(struct vcpu_vmx *vmx); static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); @@ -3828,11 +3829,15 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) static void vmx_complete_interrupts(struct vcpu_vmx *vmx) { - u32 idt_vectoring_info = vmx->idt_vectoring_info; + u32 idt_vectoring_info; u8 vector; int type; bool idtv_info_valid; + if (vmx->rmode.irq.pending) + fixup_rmode_irq(vmx); + + idt_vectoring_info = vmx->idt_vectoring_info; idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; vmx->vcpu.arch.nmi_injected = false; @@ -4040,8 +4045,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) vcpu->arch.regs_dirty = 0; vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); - if (vmx->rmode.irq.pending) - fixup_rmode_irq(vmx); asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); vmx->launched = 1; -- cgit v1.2.3-18-g5258 From 83422e17c19d61399cab7dbf9bf40ff9af2a7dd2 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 20 Jul 2010 14:43:23 +0300 Subject: KVM: VMX: Parameterize vmx_complete_interrupts() for both exit and entry Currently vmx_complete_interrupts() can decode event information from vmx exit fields into the generic kvm event queues. Make it able to decode the information from the entry fields as well by parametrizing it. 
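Condensed from this diff and from the cancel_injection patch later in the series (nothing below is new code, it is just the two call sites shown side by side), the single decoder ends up with one caller fed from the exit-side VMCS fields and one fed from the entry-side fields:

/* exit path: recover events the CPU latched into the exit-side fields */
static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
{
	__vmx_complete_interrupts(vmx, vmx->idt_vectoring_info,
				  VM_EXIT_INSTRUCTION_LEN,
				  IDT_VECTORING_ERROR_CODE);
}

/* entry path: read back what had been programmed for injection and
 * push it into the generic queues (used when an entry is cancelled) */
static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
{
	__vmx_complete_interrupts(to_vmx(vcpu),
				  vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
				  VM_ENTRY_INSTRUCTION_LEN,
				  VM_ENTRY_EXCEPTION_ERROR_CODE);

	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
}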
Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 541f0d2412b..3237f6cc930 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -182,7 +182,7 @@ static int init_rmode(struct kvm *kvm); static u64 construct_eptp(unsigned long root_hpa); static void kvm_cpu_vmxon(u64 addr); static void kvm_cpu_vmxoff(void); -static void fixup_rmode_irq(struct vcpu_vmx *vmx); +static void fixup_rmode_irq(struct vcpu_vmx *vmx, u32 *idt_vectoring_info); static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); @@ -3827,17 +3827,18 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time)); } -static void vmx_complete_interrupts(struct vcpu_vmx *vmx) +static void __vmx_complete_interrupts(struct vcpu_vmx *vmx, + u32 idt_vectoring_info, + int instr_len_field, + int error_code_field) { - u32 idt_vectoring_info; u8 vector; int type; bool idtv_info_valid; if (vmx->rmode.irq.pending) - fixup_rmode_irq(vmx); + fixup_rmode_irq(vmx, &idt_vectoring_info); - idt_vectoring_info = vmx->idt_vectoring_info; idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; vmx->vcpu.arch.nmi_injected = false; @@ -3865,18 +3866,18 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx) break; case INTR_TYPE_SOFT_EXCEPTION: vmx->vcpu.arch.event_exit_inst_len = - vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + vmcs_read32(instr_len_field); /* fall through */ case INTR_TYPE_HARD_EXCEPTION: if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { - u32 err = vmcs_read32(IDT_VECTORING_ERROR_CODE); + u32 err = vmcs_read32(error_code_field); kvm_queue_exception_e(&vmx->vcpu, vector, err); } else kvm_queue_exception(&vmx->vcpu, vector); break; case INTR_TYPE_SOFT_INTR: vmx->vcpu.arch.event_exit_inst_len = - vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + vmcs_read32(instr_len_field); /* fall through */ case INTR_TYPE_EXT_INTR: kvm_queue_interrupt(&vmx->vcpu, vector, @@ -3887,24 +3888,31 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx) } } +static void vmx_complete_interrupts(struct vcpu_vmx *vmx) +{ + __vmx_complete_interrupts(vmx, vmx->idt_vectoring_info, + VM_EXIT_INSTRUCTION_LEN, + IDT_VECTORING_ERROR_CODE); +} + /* * Failure to inject an interrupt should give us the information * in IDT_VECTORING_INFO_FIELD. However, if the failure occurs * when fetching the interrupt redirection bitmap in the real-mode * tss, this doesn't happen. So we do it ourselves. 
*/ -static void fixup_rmode_irq(struct vcpu_vmx *vmx) +static void fixup_rmode_irq(struct vcpu_vmx *vmx, u32 *idt_vectoring_info) { vmx->rmode.irq.pending = 0; if (kvm_rip_read(&vmx->vcpu) + 1 != vmx->rmode.irq.rip) return; kvm_rip_write(&vmx->vcpu, vmx->rmode.irq.rip); - if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) { - vmx->idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK; - vmx->idt_vectoring_info |= INTR_TYPE_EXT_INTR; + if (*idt_vectoring_info & VECTORING_INFO_VALID_MASK) { + *idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK; + *idt_vectoring_info |= INTR_TYPE_EXT_INTR; return; } - vmx->idt_vectoring_info = + *idt_vectoring_info = VECTORING_INFO_VALID_MASK | INTR_TYPE_EXT_INTR | vmx->rmode.irq.vector; -- cgit v1.2.3-18-g5258 From b463a6f744a263fccd7da14db1afdc880371a280 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 20 Jul 2010 15:06:17 +0300 Subject: KVM: Non-atomic interrupt injection Change the interrupt injection code to work from preemptible, interrupts enabled context. This works by adding a ->cancel_injection() operation that undoes an injection in case we were not able to actually enter the guest (this condition could never happen with atomic injection). Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/svm.c | 12 ++++++++++++ arch/x86/kvm/vmx.c | 11 +++++++++++ arch/x86/kvm/x86.c | 36 ++++++++++++++++-------------------- 4 files changed, 40 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index b43686a4487..80224bf5d4f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -552,6 +552,7 @@ struct kvm_x86_ops { void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr, bool has_error_code, u32 error_code, bool reinject); + void (*cancel_injection)(struct kvm_vcpu *vcpu); int (*interrupt_allowed)(struct kvm_vcpu *vcpu); int (*nmi_allowed)(struct kvm_vcpu *vcpu); bool (*get_nmi_mask)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 1d2ea65d353..1a85fc507cf 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3261,6 +3261,17 @@ static void svm_complete_interrupts(struct vcpu_svm *svm) } } +static void svm_cancel_injection(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb_control_area *control = &svm->vmcb->control; + + control->exit_int_info = control->event_inj; + control->exit_int_info_err = control->event_inj_err; + control->event_inj = 0; + svm_complete_interrupts(svm); +} + #ifdef CONFIG_X86_64 #define R "r" #else @@ -3631,6 +3642,7 @@ static struct kvm_x86_ops svm_x86_ops = { .set_irq = svm_set_irq, .set_nmi = svm_inject_nmi, .queue_exception = svm_queue_exception, + .cancel_injection = svm_cancel_injection, .interrupt_allowed = svm_interrupt_allowed, .nmi_allowed = svm_nmi_allowed, .get_nmi_mask = svm_get_nmi_mask, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 3237f6cc930..70af3db372d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3895,6 +3895,16 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx) IDT_VECTORING_ERROR_CODE); } +static void vmx_cancel_injection(struct kvm_vcpu *vcpu) +{ + __vmx_complete_interrupts(to_vmx(vcpu), + vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), + VM_ENTRY_INSTRUCTION_LEN, + VM_ENTRY_EXCEPTION_ERROR_CODE); + + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); +} + /* * Failure to inject an interrupt should give us the information * in IDT_VECTORING_INFO_FIELD. 
However, if the failure occurs @@ -4348,6 +4358,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .set_irq = vmx_inject_irq, .set_nmi = vmx_inject_nmi, .queue_exception = vmx_queue_exception, + .cancel_injection = vmx_cancel_injection, .interrupt_allowed = vmx_interrupt_allowed, .nmi_allowed = vmx_nmi_allowed, .get_nmi_mask = vmx_get_nmi_mask, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e7198036db6..a465bd29f38 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5005,7 +5005,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) int r; bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && vcpu->run->request_interrupt_window; - bool req_event; if (vcpu->requests) { if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) @@ -5041,6 +5040,21 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (unlikely(r)) goto out; + if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { + inject_pending_event(vcpu); + + /* enable NMI/IRQ window open exits if needed */ + if (vcpu->arch.nmi_pending) + kvm_x86_ops->enable_nmi_window(vcpu); + else if (kvm_cpu_has_interrupt(vcpu) || req_int_win) + kvm_x86_ops->enable_irq_window(vcpu); + + if (kvm_lapic_enabled(vcpu)) { + update_cr8_intercept(vcpu); + kvm_lapic_sync_to_vapic(vcpu); + } + } + preempt_disable(); kvm_x86_ops->prepare_guest_switch(vcpu); @@ -5053,35 +5067,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) local_irq_disable(); - req_event = kvm_check_request(KVM_REQ_EVENT, vcpu); - if (!atomic_read(&vcpu->guest_mode) || vcpu->requests || need_resched() || signal_pending(current)) { - if (req_event) - kvm_make_request(KVM_REQ_EVENT, vcpu); atomic_set(&vcpu->guest_mode, 0); smp_wmb(); local_irq_enable(); preempt_enable(); + kvm_x86_ops->cancel_injection(vcpu); r = 1; goto out; } - if (req_event || req_int_win) { - inject_pending_event(vcpu); - - /* enable NMI/IRQ window open exits if needed */ - if (vcpu->arch.nmi_pending) - kvm_x86_ops->enable_nmi_window(vcpu); - else if (kvm_cpu_has_interrupt(vcpu) || req_int_win) - kvm_x86_ops->enable_irq_window(vcpu); - - if (kvm_lapic_enabled(vcpu)) { - update_cr8_intercept(vcpu); - kvm_lapic_sync_to_vapic(vcpu); - } - } - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); kvm_guest_enter(); -- cgit v1.2.3-18-g5258 From 625831a3f40d330c611fe37cf501d80d611921f9 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 22 Jul 2010 13:09:54 +0300 Subject: KVM: VMX: Move fixup_rmode_irq() to avoid forward declaration No code changes. Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 47 +++++++++++++++++++++++------------------------ 1 file changed, 23 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 70af3db372d..32315935201 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -182,7 +182,6 @@ static int init_rmode(struct kvm *kvm); static u64 construct_eptp(unsigned long root_hpa); static void kvm_cpu_vmxon(u64 addr); static void kvm_cpu_vmxoff(void); -static void fixup_rmode_irq(struct vcpu_vmx *vmx, u32 *idt_vectoring_info); static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); @@ -3827,6 +3826,29 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time)); } +/* + * Failure to inject an interrupt should give us the information + * in IDT_VECTORING_INFO_FIELD. However, if the failure occurs + * when fetching the interrupt redirection bitmap in the real-mode + * tss, this doesn't happen. So we do it ourselves. 
+ */ +static void fixup_rmode_irq(struct vcpu_vmx *vmx, u32 *idt_vectoring_info) +{ + vmx->rmode.irq.pending = 0; + if (kvm_rip_read(&vmx->vcpu) + 1 != vmx->rmode.irq.rip) + return; + kvm_rip_write(&vmx->vcpu, vmx->rmode.irq.rip); + if (*idt_vectoring_info & VECTORING_INFO_VALID_MASK) { + *idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK; + *idt_vectoring_info |= INTR_TYPE_EXT_INTR; + return; + } + *idt_vectoring_info = + VECTORING_INFO_VALID_MASK + | INTR_TYPE_EXT_INTR + | vmx->rmode.irq.vector; +} + static void __vmx_complete_interrupts(struct vcpu_vmx *vmx, u32 idt_vectoring_info, int instr_len_field, @@ -3905,29 +3927,6 @@ static void vmx_cancel_injection(struct kvm_vcpu *vcpu) vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); } -/* - * Failure to inject an interrupt should give us the information - * in IDT_VECTORING_INFO_FIELD. However, if the failure occurs - * when fetching the interrupt redirection bitmap in the real-mode - * tss, this doesn't happen. So we do it ourselves. - */ -static void fixup_rmode_irq(struct vcpu_vmx *vmx, u32 *idt_vectoring_info) -{ - vmx->rmode.irq.pending = 0; - if (kvm_rip_read(&vmx->vcpu) + 1 != vmx->rmode.irq.rip) - return; - kvm_rip_write(&vmx->vcpu, vmx->rmode.irq.rip); - if (*idt_vectoring_info & VECTORING_INFO_VALID_MASK) { - *idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK; - *idt_vectoring_info |= INTR_TYPE_EXT_INTR; - return; - } - *idt_vectoring_info = - VECTORING_INFO_VALID_MASK - | INTR_TYPE_EXT_INTR - | vmx->rmode.irq.vector; -} - #ifdef CONFIG_X86_64 #define R "r" #define Q "q" -- cgit v1.2.3-18-g5258 From 0959ffacf39b1ae7f56072b0c64429ee528100ca Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 14 Sep 2010 17:46:12 +0200 Subject: KVM: MMU: Don't track nested fault info in error-code This patch moves the detection whether a page-fault was nested or not out of the error code and moves it into a separate variable in the fault struct. 
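The cleanup can be pictured with a small standalone program (PFERR_NESTED_MASK and the fault structs below are modeled for this example, they are not the kvm definitions): packing the nested flag into the error code forces every consumer to strip a bookkeeping bit before the value can be used as an architectural error code, while a separate bool leaves the error code clean from the start.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PFERR_WRITE_MASK  (1u << 1)
#define PFERR_NESTED_MASK (1u << 31)	/* the bit this patch removes */

struct fault_old { uint32_t error_code; };
struct fault_new { uint32_t error_code; bool nested; };

int main(void)
{
	struct fault_old o = { .error_code = PFERR_WRITE_MASK | PFERR_NESTED_MASK };
	struct fault_new n = { .error_code = PFERR_WRITE_MASK, .nested = true };

	uint32_t arch_old = o.error_code & ~PFERR_NESTED_MASK;	/* must strip */
	uint32_t arch_new = n.error_code;			/* already clean */

	printf("old: %#x -> %#x, new: %#x (nested=%d)\n",
	       o.error_code, arch_old, arch_new, n.nested);
	return 0;
}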
Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/mmu.h | 1 - arch/x86/kvm/x86.c | 14 ++++---------- 3 files changed, 5 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 80224bf5d4f..519d6f78498 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -322,6 +322,7 @@ struct kvm_vcpu_arch { struct { u64 address; unsigned error_code; + bool nested; } fault; /* only needed in kvm_pv_mmu_op() path, but it's hot so diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 513abbb5ff4..7086ca85d3e 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -47,7 +47,6 @@ #define PFERR_USER_MASK (1U << 2) #define PFERR_RSVD_MASK (1U << 3) #define PFERR_FETCH_MASK (1U << 4) -#define PFERR_NESTED_MASK (1U << 31) int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]); int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a465bd29f38..a51635ee85e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -342,18 +342,12 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu) void kvm_propagate_fault(struct kvm_vcpu *vcpu) { - u32 nested, error; - - error = vcpu->arch.fault.error_code; - nested = error & PFERR_NESTED_MASK; - error = error & ~PFERR_NESTED_MASK; - - vcpu->arch.fault.error_code = error; - - if (mmu_is_nested(vcpu) && !nested) + if (mmu_is_nested(vcpu) && !vcpu->arch.fault.nested) vcpu->arch.nested_mmu.inject_page_fault(vcpu); else vcpu->arch.mmu.inject_page_fault(vcpu); + + vcpu->arch.fault.nested = false; } void kvm_inject_nmi(struct kvm_vcpu *vcpu) @@ -3524,7 +3518,7 @@ static gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access) access |= PFERR_USER_MASK; t_gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, &error); if (t_gpa == UNMAPPED_GVA) - vcpu->arch.fault.error_code |= PFERR_NESTED_MASK; + vcpu->arch.fault.nested = true; return t_gpa; } -- cgit v1.2.3-18-g5258 From 28e4639adf0c9f26f6bb56149b7ab547bf33bb95 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Sat, 18 Sep 2010 14:38:12 -1000 Subject: KVM: x86: Fix kvmclock bug If preempted after kvmclock values are updated, but before hardware virtualization is entered, the last tsc time as read by the guest is never set. It underflows the next time kvmclock is updated if there has not yet been a successful entry / exit into hardware virt. Fix this by simply setting last_tsc to the newly read tsc value so that any computed nsec advance of kvmclock is nulled. 
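The underflow described above is ordinary unsigned wrap-around on a stale snapshot; the toy program below only illustrates that failure mode (it is not the kvmclock code, and the variable names are stand-ins): if the "last seen by the guest" value is never refreshed, the next update computes a wildly wrong delta.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t last_guest_tsc = 0;		/* never set: no hw entry yet  */
	uint64_t tsc_timestamp  = 1000000000ULL;/* freshly read at update time */

	uint64_t bogus = last_guest_tsc - tsc_timestamp;	/* wraps around */
	printf("stale snapshot: delta = %llu\n", (unsigned long long)bogus);

	last_guest_tsc = tsc_timestamp;		/* the fix: snapshot at update */
	printf("fresh snapshot: delta = %llu\n",
	       (unsigned long long)(last_guest_tsc - tsc_timestamp));
	return 0;
}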
Signed-off-by: Zachary Amsden Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a51635ee85e..0b021e16f9d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1095,6 +1095,7 @@ static int kvm_write_guest_time(struct kvm_vcpu *v) vcpu->hv_clock.tsc_timestamp = tsc_timestamp; vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; vcpu->last_kernel_ns = kernel_ns; + vcpu->last_guest_tsc = tsc_timestamp; vcpu->hv_clock.flags = 0; /* -- cgit v1.2.3-18-g5258 From f4f510508741680e423524c222f615276ca6222c Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 19 Sep 2010 18:44:07 +0200 Subject: KVM: Convert PIC lock from raw spinlock to ordinary spinlock The PIC code used to be called from preempt_disable() context, which wasn't very good for PREEMPT_RT. That is no longer the case, so move back from raw_spinlock_t to spinlock_t. Signed-off-by: Avi Kivity Acked-by: Thomas Gleixner Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/i8259.c | 6 +++--- arch/x86/kvm/irq.h | 2 +- arch/x86/kvm/x86.c | 8 ++++---- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index ab1bb8ff9a8..dd54c5bb2e5 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -39,7 +39,7 @@ static void pic_irq_request(struct kvm *kvm, int level); static void pic_lock(struct kvm_pic *s) __acquires(&s->lock) { - raw_spin_lock(&s->lock); + spin_lock(&s->lock); } static void pic_unlock(struct kvm_pic *s) @@ -51,7 +51,7 @@ static void pic_unlock(struct kvm_pic *s) s->wakeup_needed = false; - raw_spin_unlock(&s->lock); + spin_unlock(&s->lock); if (wakeup) { kvm_for_each_vcpu(i, vcpu, s->kvm) { @@ -569,7 +569,7 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm) s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL); if (!s) return NULL; - raw_spin_lock_init(&s->lock); + spin_lock_init(&s->lock); s->kvm = kvm; s->pics[0].elcr_mask = 0xf8; s->pics[1].elcr_mask = 0xde; diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 63c31450299..ba910d14941 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -60,7 +60,7 @@ struct kvm_kpic_state { }; struct kvm_pic { - raw_spinlock_t lock; + spinlock_t lock; bool wakeup_needed; unsigned pending_acks; struct kvm *kvm; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0b021e16f9d..3adf6921e88 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3011,18 +3011,18 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) r = 0; switch (chip->chip_id) { case KVM_IRQCHIP_PIC_MASTER: - raw_spin_lock(&pic_irqchip(kvm)->lock); + spin_lock(&pic_irqchip(kvm)->lock); memcpy(&pic_irqchip(kvm)->pics[0], &chip->chip.pic, sizeof(struct kvm_pic_state)); - raw_spin_unlock(&pic_irqchip(kvm)->lock); + spin_unlock(&pic_irqchip(kvm)->lock); break; case KVM_IRQCHIP_PIC_SLAVE: - raw_spin_lock(&pic_irqchip(kvm)->lock); + spin_lock(&pic_irqchip(kvm)->lock); memcpy(&pic_irqchip(kvm)->pics[1], &chip->chip.pic, sizeof(struct kvm_pic_state)); - raw_spin_unlock(&pic_irqchip(kvm)->lock); + spin_unlock(&pic_irqchip(kvm)->lock); break; case KVM_IRQCHIP_IOAPIC: r = kvm_set_ioapic(kvm, &chip->chip.ioapic); -- cgit v1.2.3-18-g5258 From a0a07cd2c5fc8703db8a07287cdde3d29a286082 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Mon, 20 Sep 2010 10:15:32 +0200 Subject: KVM: SVM: do not generate "external interrupt exit" if other exit is pending Nested SVM checks for external 
interrupt after injecting nested exception. In case there is external interrupt pending the code generates "external interrupt exit" and overwrites previous exit info. If previously injected exception already generated exit it will be lost. Signed-off-by: Gleb Natapov Acked-by: Joerg Roedel Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/svm.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 1a85fc507cf..c929d007696 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1707,6 +1707,14 @@ static inline bool nested_svm_intr(struct vcpu_svm *svm) if (!(svm->vcpu.arch.hflags & HF_HIF_MASK)) return false; + /* + * if vmexit was already requested (by intercepted exception + * for instance) do not overwrite it with "external interrupt" + * vmexit. + */ + if (svm->nested.exit_required) + return false; + svm->vmcb->control.exit_code = SVM_EXIT_INTR; svm->vmcb->control.exit_info_1 = 0; svm->vmcb->control.exit_info_2 = 0; -- cgit v1.2.3-18-g5258 From cb16a7b3872e9a806f16b1f09b59103fafc7b796 Mon Sep 17 00:00:00 2001 From: Hillf Danton Date: Sat, 18 Sep 2010 08:41:02 +0800 Subject: KVM: MMU: fix counting of rmap entries in rmap_add() It seems that rmap entries are under counted. Signed-off-by: Hillf Danton Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 3ce56bfe056..c94c43289f5 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -632,6 +632,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) desc->sptes[0] = (u64 *)*rmapp; desc->sptes[1] = spte; *rmapp = (unsigned long)desc | 1; + ++count; } else { rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte); desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); @@ -644,7 +645,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) desc = desc->more; } for (i = 0; desc->sptes[i]; ++i) - ; + ++count; desc->sptes[i] = spte; } return count; -- cgit v1.2.3-18-g5258 From 4ab8e02404fcbc16beefac66de24dbb2706fe2f3 Mon Sep 17 00:00:00 2001 From: Mohammed Gamal Date: Sun, 19 Sep 2010 14:34:05 +0200 Subject: KVM: x86 emulator: Expose emulate_int_real() Signed-off-by: Mohammed Gamal Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_emulate.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 5187dd88019..b36c6b3fe14 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -260,5 +260,6 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt); int emulator_task_switch(struct x86_emulate_ctxt *ctxt, u16 tss_selector, int reason, bool has_error_code, u32 error_code); - +int emulate_int_real(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, int irq); #endif /* _ASM_X86_KVM_X86_EMULATE_H */ -- cgit v1.2.3-18-g5258 From 63995653ade16deacaea5b49ceaf6376314593ac Mon Sep 17 00:00:00 2001 From: Mohammed Gamal Date: Sun, 19 Sep 2010 14:34:06 +0200 Subject: KVM: Add kvm_inject_realmode_interrupt() wrapper This adds a wrapper function kvm_inject_realmode_interrupt() around the emulator function emulate_int_real() to allow real mode interrupt injection. 
[avi: initialize operand and address sizes before emulating interrupts] [avi: initialize rip for real mode interrupt injection] [avi: clear interrupt pending flag after emulating interrupt injection] Signed-off-by: Mohammed Gamal Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 29 +++++++++++++++++++++++++++++ arch/x86/kvm/x86.h | 1 + 2 files changed, 30 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3adf6921e88..7d2880500fa 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4188,6 +4188,35 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); } +int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq) +{ + struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; + int ret; + + init_emulate_ctxt(vcpu); + + vcpu->arch.emulate_ctxt.decode.op_bytes = 2; + vcpu->arch.emulate_ctxt.decode.ad_bytes = 2; + vcpu->arch.emulate_ctxt.decode.eip = vcpu->arch.emulate_ctxt.eip; + ret = emulate_int_real(&vcpu->arch.emulate_ctxt, &emulate_ops, irq); + + if (ret != X86EMUL_CONTINUE) + return EMULATE_FAIL; + + vcpu->arch.emulate_ctxt.eip = c->eip; + memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); + kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); + kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); + + if (irq == NMI_VECTOR) + vcpu->arch.nmi_pending = false; + else + vcpu->arch.interrupt.pending = false; + + return EMULATE_DONE; +} +EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt); + static int handle_emulation_failure(struct kvm_vcpu *vcpu) { ++vcpu->stat.insn_emulation_fail; diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index bf4dc2f40d7..2cea414489f 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -72,6 +72,7 @@ static inline int is_paging(struct kvm_vcpu *vcpu) void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); +int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq); void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data); -- cgit v1.2.3-18-g5258 From a92601bb707f6f49fd5563ef3d09928e70cc222e Mon Sep 17 00:00:00 2001 From: Mohammed Gamal Date: Sun, 19 Sep 2010 14:34:07 +0200 Subject: KVM: VMX: Emulated real mode interrupt injection Replace the inject-as-software-interrupt hack we currently have with emulated injection. 
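What emulate_int_real() has to reproduce can be shown with a self-contained toy model of real-mode event delivery (the IVT array and flag bits below are modeled directly for the example; the real emulator works on guest memory and the emulator register file, and also clears TF, omitted here): push FLAGS, CS and IP, clear IF, then vector through the 4-byte IVT entry for the vector.

#include <stdio.h>
#include <stdint.h>

struct regs { uint16_t cs, ip, flags, sp; uint16_t stack[8]; };

static void push16(struct regs *r, uint16_t v) { r->stack[--r->sp] = v; }

/* ivt[v] models the 4-byte entry at linear address v * 4: target CS:IP */
static void int_real(struct regs *r, const uint32_t ivt[256], int vector)
{
	push16(r, r->flags);
	push16(r, r->cs);
	push16(r, r->ip);
	r->flags &= ~0x0200;			/* clear IF */
	r->ip = ivt[vector] & 0xffff;
	r->cs = ivt[vector] >> 16;
}

int main(void)
{
	uint32_t ivt[256] = { [8] = (0xf000u << 16) | 0xfea5 };
	struct regs r = { .cs = 0x1000, .ip = 0x0100, .flags = 0x0202, .sp = 8 };

	int_real(&r, ivt, 8);
	printf("now at %04x:%04x, IF=%d\n",
	       (unsigned)r.cs, (unsigned)r.ip, !!(r.flags & 0x0200));
	return 0;
}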
Signed-off-by: Mohammed Gamal Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 65 +++++------------------------------------------------- 1 file changed, 6 insertions(+), 59 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 32315935201..9d3f972aa19 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -155,11 +155,6 @@ struct vcpu_vmx { u32 limit; u32 ar; } tr, es, ds, fs, gs; - struct { - bool pending; - u8 vector; - unsigned rip; - } irq; } rmode; int vpid; bool emulation_required; @@ -1028,16 +1023,8 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, } if (vmx->rmode.vm86_active) { - vmx->rmode.irq.pending = true; - vmx->rmode.irq.vector = nr; - vmx->rmode.irq.rip = kvm_rip_read(vcpu); - if (kvm_exception_is_soft(nr)) - vmx->rmode.irq.rip += - vmx->vcpu.arch.event_exit_inst_len; - intr_info |= INTR_TYPE_SOFT_INTR; - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); - vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); - kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1); + if (kvm_inject_realmode_interrupt(vcpu, nr) != EMULATE_DONE) + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); return; } @@ -2816,16 +2803,8 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu) ++vcpu->stat.irq_injections; if (vmx->rmode.vm86_active) { - vmx->rmode.irq.pending = true; - vmx->rmode.irq.vector = irq; - vmx->rmode.irq.rip = kvm_rip_read(vcpu); - if (vcpu->arch.interrupt.soft) - vmx->rmode.irq.rip += - vmx->vcpu.arch.event_exit_inst_len; - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, - irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK); - vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); - kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1); + if (kvm_inject_realmode_interrupt(vcpu, irq) != EMULATE_DONE) + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); return; } intr = irq | INTR_INFO_VALID_MASK; @@ -2857,14 +2836,8 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu) ++vcpu->stat.nmi_injections; if (vmx->rmode.vm86_active) { - vmx->rmode.irq.pending = true; - vmx->rmode.irq.vector = NMI_VECTOR; - vmx->rmode.irq.rip = kvm_rip_read(vcpu); - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, - NMI_VECTOR | INTR_TYPE_SOFT_INTR | - INTR_INFO_VALID_MASK); - vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); - kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1); + if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR) != EMULATE_DONE) + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); return; } vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, @@ -3826,29 +3799,6 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time)); } -/* - * Failure to inject an interrupt should give us the information - * in IDT_VECTORING_INFO_FIELD. However, if the failure occurs - * when fetching the interrupt redirection bitmap in the real-mode - * tss, this doesn't happen. So we do it ourselves. 
- */ -static void fixup_rmode_irq(struct vcpu_vmx *vmx, u32 *idt_vectoring_info) -{ - vmx->rmode.irq.pending = 0; - if (kvm_rip_read(&vmx->vcpu) + 1 != vmx->rmode.irq.rip) - return; - kvm_rip_write(&vmx->vcpu, vmx->rmode.irq.rip); - if (*idt_vectoring_info & VECTORING_INFO_VALID_MASK) { - *idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK; - *idt_vectoring_info |= INTR_TYPE_EXT_INTR; - return; - } - *idt_vectoring_info = - VECTORING_INFO_VALID_MASK - | INTR_TYPE_EXT_INTR - | vmx->rmode.irq.vector; -} - static void __vmx_complete_interrupts(struct vcpu_vmx *vmx, u32 idt_vectoring_info, int instr_len_field, @@ -3858,9 +3808,6 @@ static void __vmx_complete_interrupts(struct vcpu_vmx *vmx, int type; bool idtv_info_valid; - if (vmx->rmode.irq.pending) - fixup_rmode_irq(vmx, &idt_vectoring_info); - idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; vmx->vcpu.arch.nmi_injected = false; -- cgit v1.2.3-18-g5258 From 49e9d557f9b6e9639390b63b645f2def8dde5f1b Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 19 Sep 2010 14:34:08 +0200 Subject: KVM: VMX: Respect interrupt window in big real mode If an interrupt is pending, we need to stop emulation so we can inject it. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 9d3f972aa19..28c72da93a1 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3582,8 +3582,17 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); enum emulation_result err = EMULATE_DONE; int ret = 1; + u32 cpu_exec_ctrl; + bool intr_window_requested; + + cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING; while (!guest_state_valid(vcpu)) { + if (intr_window_requested + && (kvm_get_rflags(&vmx->vcpu) & X86_EFLAGS_IF)) + return handle_interrupt_window(&vmx->vcpu); + err = emulate_instruction(vcpu, 0, 0, 0); if (err == EMULATE_DO_MMIO) { -- cgit v1.2.3-18-g5258 From 5f4e3f882731c65b5d64a2ff743fda96eaebb9ee Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Sat, 18 Sep 2010 14:38:13 -1000 Subject: KVM: x86: Make math work for other scales The math in kvm_get_time_scale relies on the fact that NSEC_PER_SEC < 2^32. To use the same function to compute arbitrary time scales, we must extend the first reduction step to shrink the base rate to a 32-bit value, and possibly reduce the scaled rate into a 32-bit as well. Note we must take care to avoid an arithmetic overflow when scaling up the tps32 value (this could not happen with the fixed scaled value of NSEC_PER_SEC, but can happen with scaled rates above 2^31. 
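Since this is self-contained integer math, it can be exercised outside the kernel. The harness below lifts the two reduction loops from the diff; div_frac() and scale_delta() are stand-ins written for the example (the kernel's div_frac() produces the same 2^32 * dividend / divisor fraction by long division, and pvclock applies the mult/shift pair in the same way), and it needs a compiler with __int128.

#include <stdio.h>
#include <stdint.h>

static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
{
	return (uint32_t)(((uint64_t)dividend << 32) / divisor);
}

static void get_time_scale(uint32_t scaled_khz, uint32_t base_khz,
			   int8_t *pshift, uint32_t *pmultiplier)
{
	uint64_t scaled64 = scaled_khz * 1000LL;
	uint64_t tps64 = base_khz * 1000LL;
	int32_t shift = 0;
	uint32_t tps32;

	while (tps64 > scaled64 * 2 || tps64 & 0xffffffff00000000ULL) {
		tps64 >>= 1;
		shift--;
	}
	tps32 = (uint32_t)tps64;
	while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000ULL) {
		if (scaled64 & 0xffffffff00000000ULL || tps32 & 0x80000000)
			scaled64 >>= 1;
		else
			tps32 <<= 1;
		shift++;
	}
	*pshift = (int8_t)shift;
	*pmultiplier = div_frac((uint32_t)scaled64, tps32);
}

static uint64_t scale_delta(uint64_t delta, uint32_t mul, int8_t shift)
{
	if (shift < 0)
		delta >>= -shift;
	else
		delta <<= shift;
	return (uint64_t)(((unsigned __int128)delta * mul) >> 32);
}

int main(void)
{
	int8_t shift;
	uint32_t mul;

	/* 2.8 GHz TSC to nanoseconds, i.e. scaled rate = NSEC_PER_SEC / 1000 */
	get_time_scale(1000000, 2800000, &shift, &mul);
	printf("shift=%d mul=%u, 1s worth of cycles scales to %llu ns\n",
	       shift, mul, (unsigned long long)scale_delta(2800000000ULL, mul, shift));
	return 0;
}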
Signed-off-by: Zachary Amsden Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7d2880500fa..6666af84019 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -920,31 +920,35 @@ static uint32_t div_frac(uint32_t dividend, uint32_t divisor) return quotient; } -static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock) +static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz, + s8 *pshift, u32 *pmultiplier) { - uint64_t nsecs = 1000000000LL; + uint64_t scaled64; int32_t shift = 0; uint64_t tps64; uint32_t tps32; - tps64 = tsc_khz * 1000LL; - while (tps64 > nsecs*2) { + tps64 = base_khz * 1000LL; + scaled64 = scaled_khz * 1000LL; + while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000UL) { tps64 >>= 1; shift--; } tps32 = (uint32_t)tps64; - while (tps32 <= (uint32_t)nsecs) { - tps32 <<= 1; + while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000UL) { + if (scaled64 & 0xffffffff00000000UL || tps32 & 0x80000000) + scaled64 >>= 1; + else + tps32 <<= 1; shift++; } - hv_clock->tsc_shift = shift; - hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32); + *pshift = shift; + *pmultiplier = div_frac(scaled64, tps32); - pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n", - __func__, tsc_khz, hv_clock->tsc_shift, - hv_clock->tsc_to_system_mul); + pr_debug("%s: base_khz %u => %u, shift %d, mul %u\n", + __func__, base_khz, scaled_khz, shift, *pmultiplier); } static inline u64 get_kernel_ns(void) @@ -1084,7 +1088,9 @@ static int kvm_write_guest_time(struct kvm_vcpu *v) } if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) { - kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock); + kvm_get_time_scale(NSEC_PER_SEC / 1000, this_tsc_khz, + &vcpu->hv_clock.tsc_shift, + &vcpu->hv_clock.tsc_to_system_mul); vcpu->hw_tsc_khz = this_tsc_khz; } -- cgit v1.2.3-18-g5258 From 34c238a1d1832d7b1f655641f52782e86396b30a Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Sat, 18 Sep 2010 14:38:14 -1000 Subject: KVM: x86: Rename timer function This just changes some names to better reflect the usage they will be given. Separated out to keep confusion to a minimum. Signed-off-by: Zachary Amsden Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6666af84019..ce57cd899a6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -892,7 +892,7 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) /* * The guest calculates current wall clock time by adding - * system time (updated by kvm_write_guest_time below) to the + * system time (updated by kvm_guest_time_update below) to the * wall clock specified here. guest system time equals host * system time for us, thus we must fill in host boot time here. 
*/ @@ -1032,7 +1032,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) } EXPORT_SYMBOL_GPL(kvm_write_tsc); -static int kvm_write_guest_time(struct kvm_vcpu *v) +static int kvm_guest_time_update(struct kvm_vcpu *v) { unsigned long flags; struct kvm_vcpu_arch *vcpu = &v->arch; @@ -1052,7 +1052,7 @@ static int kvm_write_guest_time(struct kvm_vcpu *v) local_irq_restore(flags); if (unlikely(this_tsc_khz == 0)) { - kvm_make_request(KVM_REQ_KVMCLOCK_UPDATE, v); + kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); return 1; } @@ -1128,7 +1128,7 @@ static int kvm_request_guest_time_update(struct kvm_vcpu *v) if (!vcpu->time_page) return 0; - kvm_make_request(KVM_REQ_KVMCLOCK_UPDATE, v); + kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); return 1; } @@ -5041,8 +5041,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_mmu_unload(vcpu); if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) __kvm_migrate_timers(vcpu); - if (kvm_check_request(KVM_REQ_KVMCLOCK_UPDATE, vcpu)) { - r = kvm_write_guest_time(vcpu); + if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) { + r = kvm_guest_time_update(vcpu); if (unlikely(r)) goto out; } -- cgit v1.2.3-18-g5258 From c285545f813d7b0ce989fd34e42ad1fe785dc65d Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Sat, 18 Sep 2010 14:38:15 -1000 Subject: KVM: x86: TSC catchup mode Negate the effects of AN TYM spell while kvm thread is preempted by tracking conversion factor to the highest TSC rate and catching the TSC up when it has fallen behind the kernel view of time. Note that once triggered, we don't turn off catchup mode. A slightly more clever version of this is possible, which only does catchup when TSC rate drops, and which specifically targets only CPUs with broken TSC, but since these all are considered unstable_tsc(), this patch covers all necessary cases. 
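The catchup decision itself is simple enough to sketch standalone (field names mirror the patch, but the scaling below is plain integer math rather than the precomputed virtual_tsc_mult/shift pair, so treat it as an illustration): compute where the guest's TSC should be for the elapsed kernel time, and if the hardware-derived value lags, grow the offset.

#include <stdio.h>
#include <stdint.h>

/* simplified stand-in for compute_guest_tsc() */
static uint64_t compute_guest_tsc(uint64_t last_tsc_write, uint64_t last_tsc_nsec,
				  uint64_t now_ns, uint32_t virtual_tsc_khz)
{
	return last_tsc_write + (now_ns - last_tsc_nsec) * virtual_tsc_khz / 1000000;
}

int main(void)
{
	uint32_t virtual_tsc_khz = 2800000;	/* rate the guest was promised */
	uint64_t last_tsc_nsec = 0, last_tsc_write = 0;
	uint64_t kernel_ns = 3000000000ULL;	/* 3 s of kernel time elapsed  */
	uint64_t tsc_timestamp = 5000000000ULL;	/* what the current TSC gives  */

	uint64_t tsc = compute_guest_tsc(last_tsc_write, last_tsc_nsec,
					 kernel_ns, virtual_tsc_khz);
	if (tsc > tsc_timestamp)		/* guest fell behind: catch up */
		printf("adjust_tsc_offset by %llu cycles\n",
		       (unsigned long long)(tsc - tsc_timestamp));
	return 0;
}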
Signed-off-by: Zachary Amsden Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 6 +++ arch/x86/kvm/x86.c | 87 +++++++++++++++++++++++++++++++---------- 2 files changed, 72 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 519d6f78498..9e6fe391094 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -384,6 +384,9 @@ struct kvm_vcpu_arch { u64 last_host_tsc; u64 last_guest_tsc; u64 last_kernel_ns; + u64 last_tsc_nsec; + u64 last_tsc_write; + bool tsc_catchup; bool nmi_pending; bool nmi_injected; @@ -444,6 +447,9 @@ struct kvm_arch { u64 last_tsc_nsec; u64 last_tsc_offset; u64 last_tsc_write; + u32 virtual_tsc_khz; + u32 virtual_tsc_mult; + s8 virtual_tsc_shift; struct kvm_xen_hvm_config xen_hvm_config; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ce57cd899a6..bfcf8fd5e08 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -962,6 +962,7 @@ static inline u64 get_kernel_ns(void) } static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); +unsigned long max_tsc_khz; static inline int kvm_tsc_changes_freq(void) { @@ -985,6 +986,24 @@ static inline u64 nsec_to_cycles(u64 nsec) return ret; } +static void kvm_arch_set_tsc_khz(struct kvm *kvm, u32 this_tsc_khz) +{ + /* Compute a scale to convert nanoseconds in TSC cycles */ + kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000, + &kvm->arch.virtual_tsc_shift, + &kvm->arch.virtual_tsc_mult); + kvm->arch.virtual_tsc_khz = this_tsc_khz; +} + +static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) +{ + u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.last_tsc_nsec, + vcpu->kvm->arch.virtual_tsc_mult, + vcpu->kvm->arch.virtual_tsc_shift); + tsc += vcpu->arch.last_tsc_write; + return tsc; +} + void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) { struct kvm *kvm = vcpu->kvm; @@ -1029,6 +1048,8 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) /* Reset of TSC must disable overshoot protection below */ vcpu->arch.hv_clock.tsc_timestamp = 0; + vcpu->arch.last_tsc_write = data; + vcpu->arch.last_tsc_nsec = ns; } EXPORT_SYMBOL_GPL(kvm_write_tsc); @@ -1041,21 +1062,41 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) s64 kernel_ns, max_kernel_ns; u64 tsc_timestamp; - if ((!vcpu->time_page)) - return 0; - /* Keep irq disabled to prevent changes to the clock */ local_irq_save(flags); kvm_get_msr(v, MSR_IA32_TSC, &tsc_timestamp); kernel_ns = get_kernel_ns(); this_tsc_khz = __get_cpu_var(cpu_tsc_khz); - local_irq_restore(flags); if (unlikely(this_tsc_khz == 0)) { + local_irq_restore(flags); kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); return 1; } + /* + * We may have to catch up the TSC to match elapsed wall clock + * time for two reasons, even if kvmclock is used. + * 1) CPU could have been running below the maximum TSC rate + * 2) Broken TSC compensation resets the base at each VCPU + * entry to avoid unknown leaps of TSC even when running + * again on the same CPU. This may cause apparent elapsed + * time to disappear, and the guest to stand still or run + * very slowly. + */ + if (vcpu->tsc_catchup) { + u64 tsc = compute_guest_tsc(v, kernel_ns); + if (tsc > tsc_timestamp) { + kvm_x86_ops->adjust_tsc_offset(v, tsc - tsc_timestamp); + tsc_timestamp = tsc; + } + } + + local_irq_restore(flags); + + if (!vcpu->time_page) + return 0; + /* * Time as measured by the TSC may go backwards when resetting the base * tsc_timestamp. 
The reason for this is that the TSC resolution is @@ -1122,16 +1163,6 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) return 0; } -static int kvm_request_guest_time_update(struct kvm_vcpu *v) -{ - struct kvm_vcpu_arch *vcpu = &v->arch; - - if (!vcpu->time_page) - return 0; - kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); - return 1; -} - static bool msr_mtrr_valid(unsigned msr) { switch (msr) { @@ -1455,6 +1486,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) } vcpu->arch.time = data; + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); /* we verify if the enable bit is set... */ if (!(data & 1)) @@ -1470,8 +1502,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) kvm_release_page_clean(vcpu->arch.time_page); vcpu->arch.time_page = NULL; } - - kvm_request_guest_time_update(vcpu); break; } case MSR_IA32_MCG_CTL: @@ -2028,9 +2058,13 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) native_read_tsc() - vcpu->arch.last_host_tsc; if (tsc_delta < 0) mark_tsc_unstable("KVM discovered backwards TSC"); - if (check_tsc_unstable()) + if (check_tsc_unstable()) { kvm_x86_ops->adjust_tsc_offset(vcpu, -tsc_delta); - kvm_migrate_timers(vcpu); + vcpu->arch.tsc_catchup = 1; + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); + } + if (vcpu->cpu != cpu) + kvm_migrate_timers(vcpu); vcpu->cpu = cpu; } } @@ -4461,8 +4495,7 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va kvm_for_each_vcpu(i, vcpu, kvm) { if (vcpu->cpu != freq->cpu) continue; - if (!kvm_request_guest_time_update(vcpu)) - continue; + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); if (vcpu->cpu != smp_processor_id()) send_ipi = 1; } @@ -4517,11 +4550,20 @@ static void kvm_timer_init(void) { int cpu; + max_tsc_khz = tsc_khz; register_hotcpu_notifier(&kvmclock_cpu_notifier_block); if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { +#ifdef CONFIG_CPU_FREQ + struct cpufreq_policy policy; + memset(&policy, 0, sizeof(policy)); + cpufreq_get_policy(&policy, get_cpu()); + if (policy.cpuinfo.max_freq) + max_tsc_khz = policy.cpuinfo.max_freq; +#endif cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block, CPUFREQ_TRANSITION_NOTIFIER); } + pr_debug("kvm: max_tsc_khz = %ld\n", max_tsc_khz); for_each_online_cpu(cpu) smp_call_function_single(cpu, tsc_khz_changed, NULL, 1); } @@ -5752,7 +5794,7 @@ int kvm_arch_hardware_enable(void *garbage) list_for_each_entry(kvm, &vm_list, vm_list) kvm_for_each_vcpu(i, vcpu, kvm) if (vcpu->cpu == smp_processor_id()) - kvm_request_guest_time_update(vcpu); + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); return kvm_x86_ops->hardware_enable(garbage); } @@ -5803,6 +5845,9 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) } vcpu->arch.pio_data = page_address(page); + if (!kvm->arch.virtual_tsc_khz) + kvm_arch_set_tsc_khz(kvm, max_tsc_khz); + r = kvm_mmu_create(vcpu); if (r < 0) goto fail_free_pio_data; -- cgit v1.2.3-18-g5258 From 19b6a85b78a5d4b466c537bdbf0eaecae5e2c4e2 Mon Sep 17 00:00:00 2001 From: Arjan Koers <0h61vkll2ly8@xutrox.com> Date: Mon, 2 Aug 2010 23:35:28 +0200 Subject: KVM guest: Move a printk that's using the clock before it's ready Fix a hang during SMP kernel boot on KVM that showed up after commit 489fb490dbf8dab0249ad82b56688ae3842a79e8 (2.6.35) and 59aab522154a2f17b25335b63c1cf68a51fb6ae0 (2.6.34.1). The problem only occurs when CONFIG_PRINTK_TIME is set. KVM-Stable-Tag. 
Signed-off-by: Arjan Koers <0h61vkll2ly8@xutrox.com> Signed-off-by: Avi Kivity --- arch/x86/kernel/kvmclock.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index eb9b76c716c..ca43ce31a19 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -128,13 +128,15 @@ static struct clocksource kvm_clock = { static int kvm_register_clock(char *txt) { int cpu = smp_processor_id(); - int low, high; + int low, high, ret; + low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1; high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32); + ret = native_write_msr_safe(msr_kvm_system_time, low, high); printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n", cpu, high, low, txt); - return native_write_msr_safe(msr_kvm_system_time, low, high); + return ret; } #ifdef CONFIG_X86_LOCAL_APIC -- cgit v1.2.3-18-g5258 From 07d6f555d536aad1d74bb8b41dae9385007ecc26 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Tue, 28 Sep 2010 16:37:42 +0200 Subject: KVM: VMX: Add AX to list of registers clobbered by guest switch By chance this caused no harm so far. We overwrite AX during switch to/from guest context, so we must declare this. Signed-off-by: Jan Kiszka Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 28c72da93a1..007be8402ef 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4007,7 +4007,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) #endif [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)) : "cc", "memory" - , R"bx", R"di", R"si" + , R"ax", R"bx", R"di", R"si" #ifdef CONFIG_X86_64 , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" #endif -- cgit v1.2.3-18-g5258 From 50933623e50d8730cc1a65853c153b3b4c93b629 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Sun, 26 Sep 2010 13:00:53 +0200 Subject: KVM: x86: Fix constant type in kvm_get_time_scale Older gcc versions complain about the improper type (for x86-32), 4.5 seems to fix this silently. However, we should better use the right type initially. 
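The point is narrow enough to check in isolation. The constant still ends up with a 64-bit type in practice (C99 promotion rules, and gcc's behaviour even before that), so the value was not wrong on x86-32; older gcc merely warned that the UL suffix does not match. The ULL suffix states the intent explicitly, as this small standalone check shows:

#include <stdio.h>

int main(void)
{
	/* both occupy 8 bytes even where long is 32 bits; the ULL suffix
	 * just says so up front instead of relying on constant promotion */
	printf("UL  literal: %zu bytes\n", sizeof(0xffffffff00000000UL));
	printf("ULL literal: %zu bytes\n", sizeof(0xffffffff00000000ULL));
	return 0;
}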
Signed-off-by: Jan Kiszka Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bfcf8fd5e08..ffcb90669ec 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -930,14 +930,14 @@ static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz, tps64 = base_khz * 1000LL; scaled64 = scaled_khz * 1000LL; - while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000UL) { + while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) { tps64 >>= 1; shift--; } tps32 = (uint32_t)tps64; - while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000UL) { - if (scaled64 & 0xffffffff00000000UL || tps32 & 0x80000000) + while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000ULL) { + if (scaled64 & 0xffffffff00000000ULL || tps32 & 0x80000000) scaled64 >>= 1; else tps32 <<= 1; -- cgit v1.2.3-18-g5258 From 7129eecac10681f69cb00c0323ee915feceb57eb Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Tue, 28 Sep 2010 16:33:32 +0800 Subject: KVM: x86 emulator: Eliminate compilation warning in x86_decode_insn() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Eliminate: arch/x86/kvm/emulate.c:801: warning: ‘sv’ may be used uninitialized in this function on gcc 4.1.2 Signed-off-by: Sheng Yang Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index aead72e141b..d0df25d84ac 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -798,7 +798,7 @@ done: static void fetch_bit_operand(struct decode_cache *c) { - long sv, mask; + long sv = 0, mask; if (c->dst.type == OP_MEM && c->src.type == OP_REG) { mask = ~(c->dst.bytes * 8 - 1); -- cgit v1.2.3-18-g5258 From 6292757fb0e758748fdb441861f8c50d397de9f0 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 27 Sep 2010 18:02:12 +0800 Subject: KVM: MMU: update 'root_hpa' out of loop in PAE shadow path The value of 'vcpu->arch.mmu.pae_root' is not modified, so we can update 'root_hpa' out of the loop. 
Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index c94c43289f5..36300469901 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2393,8 +2393,8 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) ++sp->root_count; spin_unlock(&vcpu->kvm->mmu_lock); vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK; - vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); } + vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); } else BUG(); @@ -2466,8 +2466,8 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) spin_unlock(&vcpu->kvm->mmu_lock); vcpu->arch.mmu.pae_root[i] = root | pm_mask; - vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); } + vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); /* * If we shadow a 32 bit page table with a long mode page -- cgit v1.2.3-18-g5258 From 20bd40dc6492da293993559555df07d467fd202e Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 27 Sep 2010 18:03:27 +0800 Subject: KVM: MMU: cleanup for error mask set while walk guest page table Small cleanup for set page fault error code Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 2bdd843ad63..a83ff379405 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -224,9 +224,7 @@ walk: is_cpuid_PSE36()) gfn += pse36_gfn_delta(pte); - access |= write_fault ? PFERR_WRITE_MASK : 0; - access |= fetch_fault ? PFERR_FETCH_MASK : 0; - access |= user_fault ? 
PFERR_USER_MASK : 0; + access |= write_fault | fetch_fault | user_fault; real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn), access); @@ -268,10 +266,9 @@ error: walker->error_code = 0; if (present) walker->error_code |= PFERR_PRESENT_MASK; - if (write_fault) - walker->error_code |= PFERR_WRITE_MASK; - if (user_fault) - walker->error_code |= PFERR_USER_MASK; + + walker->error_code |= write_fault | user_fault; + if (fetch_fault && mmu->nx) walker->error_code |= PFERR_FETCH_MASK; if (rsvd_fault) @@ -673,9 +670,9 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, int r; r = FNAME(walk_addr)(&walker, vcpu, vaddr, - !!(access & PFERR_WRITE_MASK), - !!(access & PFERR_USER_MASK), - !!(access & PFERR_FETCH_MASK)); + access & PFERR_WRITE_MASK, + access & PFERR_USER_MASK, + access & PFERR_FETCH_MASK); if (r) { gpa = gfn_to_gpa(walker.gfn); -- cgit v1.2.3-18-g5258 From 33f91edb9211f5c0392071f9eb01958ec69f2193 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 27 Sep 2010 18:05:00 +0800 Subject: KVM: MMU: set access bit for direct mapping Set access bit while setup up direct page table if it's nonpaing or npt enabled, it's good for CPU's speculate access Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 36300469901..88203fa4ef0 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2240,7 +2240,8 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, __set_spte(iterator.sptep, __pa(sp->spt) | PT_PRESENT_MASK | PT_WRITABLE_MASK - | shadow_user_mask | shadow_x_mask); + | shadow_user_mask | shadow_x_mask + | shadow_accessed_mask); } } return pt_write; -- cgit v1.2.3-18-g5258 From 98224bf1d1783a25ccede29ab08309424ec8de25 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 27 Sep 2010 18:06:16 +0800 Subject: KVM: MMU: audit: fix vcpu's spte walking After nested nested paging, it may using long mode to shadow 32/PAE paging guest, so this patch fix it Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu_audit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c index bd2b1be7066..dcca3e7d7b4 100644 --- a/arch/x86/kvm/mmu_audit.c +++ b/arch/x86/kvm/mmu_audit.c @@ -51,7 +51,7 @@ static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn) if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) return; - if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { + if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) { hpa_t root = vcpu->arch.mmu.root_hpa; sp = page_header(root); -- cgit v1.2.3-18-g5258 From c42fffe3a3aa8c62b8028fff32d18156f5325c3b Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 27 Sep 2010 18:07:07 +0800 Subject: KVM: MMU: audit: unregister audit tracepoints before module unloaded fix: Call Trace: [] ? kvm_mmu_pte_write+0x229/0x911 [kvm] [] ? gfn_to_memslot+0x39/0xa0 [kvm] [] ? mark_page_dirty+0x16/0x2e [kvm] [] ? kvm_write_guest_page+0x67/0x7f [kvm] [] ? local_clock+0x2a/0x3b [] emulator_write_phys+0x46/0x54 [kvm] ...... Code: Bad RIP value. 
RIP [] 0xffffffffa0172056 RSP CR2: ffffffffa0172056 Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 88203fa4ef0..afde64ba118 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3355,15 +3355,6 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu) return init_kvm_mmu(vcpu); } -void kvm_mmu_destroy(struct kvm_vcpu *vcpu) -{ - ASSERT(vcpu); - - destroy_kvm_mmu(vcpu); - free_mmu_pages(vcpu); - mmu_free_memory_caches(vcpu); -} - void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) { struct kvm_mmu_page *sp; @@ -3662,4 +3653,16 @@ EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy); #ifdef CONFIG_KVM_MMU_AUDIT #include "mmu_audit.c" +#else +static void mmu_audit_disable(void) { } #endif + +void kvm_mmu_destroy(struct kvm_vcpu *vcpu) +{ + ASSERT(vcpu); + + destroy_kvm_mmu(vcpu); + free_mmu_pages(vcpu); + mmu_free_memory_caches(vcpu); + mmu_audit_disable(); +} -- cgit v1.2.3-18-g5258 From 38904e128778c38809daf44a1dabc7f25fa8d83e Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 27 Sep 2010 18:07:59 +0800 Subject: KVM: MMU: audit: introduce audit_printk to cleanup audit code Introduce audit_printk, and record audit point instead audit name Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu_audit.c | 42 ++++++++++++++++++------------------------ 1 file changed, 18 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c index dcca3e7d7b4..66219afcc91 100644 --- a/arch/x86/kvm/mmu_audit.c +++ b/arch/x86/kvm/mmu_audit.c @@ -19,7 +19,11 @@ #include -static const char *audit_msg; +static int audit_point; + +#define audit_printk(fmt, args...) 
\ + printk(KERN_ERR "audit: (%s) error: " \ + fmt, audit_point_name[audit_point], ##args) typedef void (*inspect_spte_fn) (struct kvm_vcpu *vcpu, u64 *sptep, int level); @@ -93,21 +97,18 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level) if (sp->unsync) { if (level != PT_PAGE_TABLE_LEVEL) { - printk(KERN_ERR "audit: (%s) error: unsync sp: %p level = %d\n", - audit_msg, sp, level); + audit_printk("unsync sp: %p level = %d\n", sp, level); return; } if (*sptep == shadow_notrap_nonpresent_pte) { - printk(KERN_ERR "audit: (%s) error: notrap spte in unsync sp: %p\n", - audit_msg, sp); + audit_printk("notrap spte in unsync sp: %p\n", sp); return; } } if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) { - printk(KERN_ERR "audit: (%s) error: notrap spte in direct sp: %p\n", - audit_msg, sp); + audit_printk("notrap spte in direct sp: %p\n", sp); return; } @@ -124,10 +125,8 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level) hpa = pfn << PAGE_SHIFT; if ((*sptep & PT64_BASE_ADDR_MASK) != hpa) - printk(KERN_ERR "xx audit error: (%s) levels %d" - "pfn %llx hpa %llx ent %llxn", - audit_msg, vcpu->arch.mmu.root_level, - pfn, hpa, *sptep); + audit_printk("levels %d pfn %llx hpa %llx ent %llxn", + vcpu->arch.mmu.root_level, pfn, hpa, *sptep); } static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) @@ -143,11 +142,9 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) if (!gfn_to_memslot(kvm, gfn)) { if (!printk_ratelimit()) return; - printk(KERN_ERR "%s: no memslot for gfn %llx\n", - audit_msg, gfn); - printk(KERN_ERR "%s: index %ld of sp (gfn=%llx)\n", - audit_msg, (long int)(sptep - rev_sp->spt), - rev_sp->gfn); + audit_printk("no memslot for gfn %llx\n", gfn); + audit_printk("index %ld of sp (gfn=%llx)\n", + (long int)(sptep - rev_sp->spt), rev_sp->gfn); dump_stack(); return; } @@ -156,8 +153,7 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) if (!*rmapp) { if (!printk_ratelimit()) return; - printk(KERN_ERR "%s: no rmap for writable spte %llx\n", - audit_msg, *sptep); + audit_printk("no rmap for writable spte %llx\n", *sptep); dump_stack(); } } @@ -198,10 +194,8 @@ void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp) spte = rmap_next(kvm, rmapp, NULL); while (spte) { if (is_writable_pte(*spte)) - printk(KERN_ERR "%s: (%s) shadow page has " - "writable mappings: gfn %llx role %x\n", - __func__, audit_msg, sp->gfn, - sp->role.word); + audit_printk("shadow page has writable mappings: gfn " + "%llx role %x\n", sp->gfn, sp->role.word); spte = rmap_next(kvm, rmapp, spte); } } @@ -228,14 +222,14 @@ static void audit_vcpu_spte(struct kvm_vcpu *vcpu) mmu_spte_walk(vcpu, audit_spte); } -static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int audit_point) +static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int point) { static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10); if (!__ratelimit(&ratelimit_state)) return; - audit_msg = audit_point_name[audit_point]; + audit_point = point; audit_all_active_sps(vcpu->kvm); audit_vcpu_spte(vcpu); } -- cgit v1.2.3-18-g5258 From 6903074c367cfb13166c2974d6a886fdc7a00d21 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 27 Sep 2010 18:09:29 +0800 Subject: KVM: MMU: audit: check whether have unsync sps after root sync After root synced, all unsync sps are synced, this patch add a check to make sure it's no unsync sps in VCPU's page table Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 11 
+++++++++-- arch/x86/kvm/mmu_audit.c | 11 ++++++++++- 2 files changed, 19 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index afde64ba118..ba7e7646fb7 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -53,14 +53,18 @@ enum { AUDIT_PRE_PAGE_FAULT, AUDIT_POST_PAGE_FAULT, AUDIT_PRE_PTE_WRITE, - AUDIT_POST_PTE_WRITE + AUDIT_POST_PTE_WRITE, + AUDIT_PRE_SYNC, + AUDIT_POST_SYNC }; char *audit_point_name[] = { "pre page fault", "post page fault", "pre pte write", - "post pte write" + "post pte write", + "pre sync", + "post sync" }; #undef MMU_DEBUG @@ -2516,6 +2520,8 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu) if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) return; + + trace_kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) { hpa_t root = vcpu->arch.mmu.root_hpa; sp = page_header(root); @@ -2531,6 +2537,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu) mmu_sync_children(vcpu, sp); } } + trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); } void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c index 66219afcc91..4aee32c3cf9 100644 --- a/arch/x86/kvm/mmu_audit.c +++ b/arch/x86/kvm/mmu_audit.c @@ -164,6 +164,14 @@ static void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu, u64 *sptep, int level) inspect_spte_has_rmap(vcpu->kvm, sptep); } +static void audit_spte_after_sync(struct kvm_vcpu *vcpu, u64 *sptep, int level) +{ + struct kvm_mmu_page *sp = page_header(__pa(sptep)); + + if (audit_point == AUDIT_POST_SYNC && sp->unsync) + audit_printk("meet unsync sp(%p) after sync root.\n", sp); +} + static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp) { int i; @@ -179,7 +187,7 @@ static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp) } } -void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp) +static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp) { struct kvm_memory_slot *slot; unsigned long *rmapp; @@ -215,6 +223,7 @@ static void audit_spte(struct kvm_vcpu *vcpu, u64 *sptep, int level) { audit_sptes_have_rmaps(vcpu, sptep, level); audit_mappings(vcpu, sptep, level); + audit_spte_after_sync(vcpu, sptep, level); } static void audit_vcpu_spte(struct kvm_vcpu *vcpu) -- cgit v1.2.3-18-g5258 From 3377078027dc54dc2a5acb2efa09587e7ac1cbd9 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 28 Sep 2010 17:03:14 +0800 Subject: KVM: MMU: move access code parsing to FNAME(walk_addr) function Move access code parsing from caller site to FNAME(walk_addr) function Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 40 ++++++++++++++++------------------------ 1 file changed, 16 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index a83ff379405..9a5f7bb5f84 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -116,16 +116,18 @@ static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte) */ static int FNAME(walk_addr_generic)(struct guest_walker *walker, struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, - gva_t addr, int write_fault, - int user_fault, int fetch_fault) + gva_t addr, u32 access) { pt_element_t pte; gfn_t table_gfn; unsigned index, pt_access, uninitialized_var(pte_access); gpa_t pte_gpa; bool eperm, present, rsvd_fault; - int offset; - u32 access = 0; + int offset, write_fault, user_fault, fetch_fault; + + write_fault = access 
& PFERR_WRITE_MASK; + user_fault = access & PFERR_USER_MASK; + fetch_fault = access & PFERR_FETCH_MASK; trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault, fetch_fault); @@ -215,6 +217,7 @@ walk: int lvl = walker->level; gpa_t real_gpa; gfn_t gfn; + u32 ac; gfn = gpte_to_gfn_lvl(pte, lvl); gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) >> PAGE_SHIFT; @@ -224,10 +227,10 @@ walk: is_cpuid_PSE36()) gfn += pse36_gfn_delta(pte); - access |= write_fault | fetch_fault | user_fault; + ac = write_fault | fetch_fault | user_fault; real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn), - access); + ac); if (real_gpa == UNMAPPED_GVA) return 0; @@ -282,21 +285,18 @@ error: } static int FNAME(walk_addr)(struct guest_walker *walker, - struct kvm_vcpu *vcpu, gva_t addr, - int write_fault, int user_fault, int fetch_fault) + struct kvm_vcpu *vcpu, gva_t addr, u32 access) { return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.mmu, addr, - write_fault, user_fault, fetch_fault); + access); } static int FNAME(walk_addr_nested)(struct guest_walker *walker, struct kvm_vcpu *vcpu, gva_t addr, - int write_fault, int user_fault, - int fetch_fault) + u32 access) { return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu, - addr, write_fault, user_fault, - fetch_fault); + addr, access); } static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, @@ -532,7 +532,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, { int write_fault = error_code & PFERR_WRITE_MASK; int user_fault = error_code & PFERR_USER_MASK; - int fetch_fault = error_code & PFERR_FETCH_MASK; struct guest_walker walker; u64 *sptep; int write_pt = 0; @@ -550,8 +549,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, /* * Look up the guest pte for the faulting address. */ - r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault, - fetch_fault); + r = FNAME(walk_addr)(&walker, vcpu, addr, error_code); /* * The page is not mapped by the guest. Let the guest handle it. 
@@ -669,10 +667,7 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, gpa_t gpa = UNMAPPED_GVA; int r; - r = FNAME(walk_addr)(&walker, vcpu, vaddr, - access & PFERR_WRITE_MASK, - access & PFERR_USER_MASK, - access & PFERR_FETCH_MASK); + r = FNAME(walk_addr)(&walker, vcpu, vaddr, access); if (r) { gpa = gfn_to_gpa(walker.gfn); @@ -690,10 +685,7 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr, gpa_t gpa = UNMAPPED_GVA; int r; - r = FNAME(walk_addr_nested)(&walker, vcpu, vaddr, - access & PFERR_WRITE_MASK, - access & PFERR_USER_MASK, - access & PFERR_FETCH_MASK); + r = FNAME(walk_addr_nested)(&walker, vcpu, vaddr, access); if (r) { gpa = gfn_to_gpa(walker.gfn); -- cgit v1.2.3-18-g5258 From 7ebaf15eefe7b019def72bd9d4420c7bc51ed69e Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 3 Oct 2010 18:51:39 +0200 Subject: KVM: MMU: Avoid sign extension in mmu_alloc_direct_roots() pae root address Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ba7e7646fb7..dc1b4fb299b 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2374,7 +2374,7 @@ static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn) static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) { struct kvm_mmu_page *sp; - int i; + unsigned i; if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { spin_lock(&vcpu->kvm->mmu_lock); -- cgit v1.2.3-18-g5258 From 395c6b0a9d56fe7fdb7aeda12795d0eb02475d24 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 4 Oct 2010 12:55:49 +0200 Subject: KVM: Disable interrupts around get_kernel_ns() get_kernel_ns() wants preemption disabled. It doesn't make a lot of sense during the get/set ioctls (no way to make them non-racy) but the callee wants it. Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ffcb90669ec..e96038e1bc3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3469,8 +3469,10 @@ long kvm_arch_vm_ioctl(struct file *filp, goto out; r = 0; + local_irq_disable(); now_ns = get_kernel_ns(); delta = user_ns.clock - now_ns; + local_irq_enable(); kvm->arch.kvmclock_offset = delta; break; } @@ -3478,8 +3480,10 @@ long kvm_arch_vm_ioctl(struct file *filp, struct kvm_clock_data user_ns; u64 now_ns; + local_irq_disable(); now_ns = get_kernel_ns(); user_ns.clock = kvm->arch.kvmclock_offset + now_ns; + local_irq_enable(); user_ns.flags = 0; r = -EFAULT; -- cgit v1.2.3-18-g5258 From 9611c187774f0e20c258c23ced2599c44bd2fef4 Mon Sep 17 00:00:00 2001 From: Nicolas Kaiser Date: Wed, 6 Oct 2010 14:23:22 +0200 Subject: KVM: fix typo in copyright notice Fix typo in copyright notice. 
Signed-off-by: Nicolas Kaiser Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 2 +- arch/x86/kvm/i8254.c | 2 +- arch/x86/kvm/i8259.c | 2 +- arch/x86/kvm/irq.c | 2 +- arch/x86/kvm/lapic.c | 2 +- arch/x86/kvm/mmu.c | 2 +- arch/x86/kvm/mmu_audit.c | 2 +- arch/x86/kvm/paging_tmpl.h | 2 +- arch/x86/kvm/svm.c | 2 +- arch/x86/kvm/timer.c | 2 +- arch/x86/kvm/vmx.c | 2 +- arch/x86/kvm/x86.c | 2 +- 12 files changed, 12 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index d0df25d84ac..38b6e8dafaf 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -9,7 +9,7 @@ * privileged instructions: * * Copyright (C) 2006 Qumranet - * Copyright 2010 Red Hat, Inc. and/or its affilates. + * Copyright 2010 Red Hat, Inc. and/or its affiliates. * * Avi Kivity * Yaniv Kamay diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 2ad40a4ddc3..efad7238505 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -5,7 +5,7 @@ * Copyright (c) 2006 Intel Corporation * Copyright (c) 2007 Keir Fraser, XenSource Inc * Copyright (c) 2008 Intel Corporation - * Copyright 2009 Red Hat, Inc. and/or its affilates. + * Copyright 2009 Red Hat, Inc. and/or its affiliates. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index dd54c5bb2e5..f628234fbec 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -3,7 +3,7 @@ * * Copyright (c) 2003-2004 Fabrice Bellard * Copyright (c) 2007 Intel Corporation - * Copyright 2009 Red Hat, Inc. and/or its affilates. + * Copyright 2009 Red Hat, Inc. and/or its affiliates. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index f994da40ad9..7e06ba1618b 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -1,7 +1,7 @@ /* * irq.c: API for in kernel interrupt controller * Copyright (c) 2007, Intel Corporation. - * Copyright 2009 Red Hat, Inc. and/or its affilates. + * Copyright 2009 Red Hat, Inc. and/or its affiliates. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index c6f2f159384..82118087d9e 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -5,7 +5,7 @@ * Copyright (C) 2006 Qumranet, Inc. * Copyright (C) 2007 Novell * Copyright (C) 2007 Intel - * Copyright 2009 Red Hat, Inc. and/or its affilates. + * Copyright 2009 Red Hat, Inc. and/or its affiliates. * * Authors: * Dor Laor diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index dc1b4fb299b..eb65b9c5ea4 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -7,7 +7,7 @@ * MMU support * * Copyright (C) 2006 Qumranet, Inc. - * Copyright 2010 Red Hat, Inc. and/or its affilates. + * Copyright 2010 Red Hat, Inc. and/or its affiliates. * * Authors: * Yaniv Kamay diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c index 4aee32c3cf9..ba2bcdde622 100644 --- a/arch/x86/kvm/mmu_audit.c +++ b/arch/x86/kvm/mmu_audit.c @@ -4,7 +4,7 @@ * Audit code for KVM MMU * * Copyright (C) 2006 Qumranet, Inc. - * Copyright 2010 Red Hat, Inc. and/or its affilates. + * Copyright 2010 Red Hat, Inc. and/or its affiliates. 
* * Authors: * Yaniv Kamay diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 9a5f7bb5f84..cd7a833a3b5 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -7,7 +7,7 @@ * MMU support * * Copyright (C) 2006 Qumranet, Inc. - * Copyright 2010 Red Hat, Inc. and/or its affilates. + * Copyright 2010 Red Hat, Inc. and/or its affiliates. * * Authors: * Yaniv Kamay diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index c929d007696..82e144a4e51 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -4,7 +4,7 @@ * AMD SVM support * * Copyright (C) 2006 Qumranet, Inc. - * Copyright 2010 Red Hat, Inc. and/or its affilates. + * Copyright 2010 Red Hat, Inc. and/or its affiliates. * * Authors: * Yaniv Kamay diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c index e16a0dbe74d..fc7a101c4a3 100644 --- a/arch/x86/kvm/timer.c +++ b/arch/x86/kvm/timer.c @@ -6,7 +6,7 @@ * * timer support * - * Copyright 2010 Red Hat, Inc. and/or its affilates. + * Copyright 2010 Red Hat, Inc. and/or its affiliates. * * This work is licensed under the terms of the GNU GPL, version 2. See * the COPYING file in the top-level directory. diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 007be8402ef..8da0e45ff7c 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5,7 +5,7 @@ * machines without emulation or binary translation. * * Copyright (C) 2006 Qumranet, Inc. - * Copyright 2010 Red Hat, Inc. and/or its affilates. + * Copyright 2010 Red Hat, Inc. and/or its affiliates. * * Authors: * Avi Kivity diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e96038e1bc3..dcee64e4434 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6,7 +6,7 @@ * Copyright (C) 2006 Qumranet, Inc. * Copyright (C) 2008 Qumranet, Inc. * Copyright IBM Corporation, 2008 - * Copyright 2010 Red Hat, Inc. and/or its affilates. + * Copyright 2010 Red Hat, Inc. and/or its affiliates. * * Authors: * Avi Kivity -- cgit v1.2.3-18-g5258 From 5854dbca9b235f8cdd414a0961018763d2d5bf77 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 8 Oct 2010 16:24:14 +0800 Subject: KVM: MCE: Add MCG_SER_P into KVM_MCE_CAP_SUPPORTED Now we have MCG_SER_P (and corresponding SRAO/SRAR MCE) support in kernel and QEMU-KVM, the MCG_SER_P should be added into KVM_MCE_CAP_SUPPORTED to make all these code really works. Reported-by: Dean Nelson Signed-off-by: Huang Ying Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index dcee64e4434..2e090784863 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -73,7 +73,7 @@ #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) #define KVM_MAX_MCE_BANKS 32 -#define KVM_MCE_CAP_SUPPORTED MCG_CTL_P +#define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P) /* EFER defaults: * - enable syscall per default because its emulated by KVM -- cgit v1.2.3-18-g5258 From 77db5cbd29b7cb0e0fb4fd146e7f7ac2831a025a Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 8 Oct 2010 16:24:15 +0800 Subject: KVM: MCE: Send SRAR SIGBUS directly Originally, SRAR SIGBUS is sent to QEMU-KVM via touching the poisoned page. But commit 96054569190bdec375fe824e48ca1f4e3b53dd36 prevents the signal from being sent. So now the signal is sent via force_sig_info_fault directly. 
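For context, a minimal user-space sketch of the receiving end -- roughly what a VMM such as QEMU-KVM could do with the SIGBUS that kvm_send_hwpoison_signal() raises directly in the hunk below. The handler body is an illustrative assumption, not QEMU code; it only relies on the BUS_MCEERR_AR code and the si_addr/si_addr_lsb fields the patch fills in, and assumes the libc headers expose them:

#define _GNU_SOURCE
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

/* Illustrative SIGBUS handler; fprintf() is not async-signal-safe, which is
 * tolerable only because this is a sketch. */
static void hwpoison_sigbus(int sig, siginfo_t *si, void *uctx)
{
	(void)sig; (void)uctx;

	if (si->si_code == BUS_MCEERR_AR) {
		/* si_addr: poisoned mapping, si_addr_lsb: granularity (PAGE_SHIFT) */
		fprintf(stderr, "action-required MCE at %p, lsb %d\n",
			si->si_addr, si->si_addr_lsb);
		/* a real VMM would remap the page or inject an MCE into the guest */
	}
	_exit(1);
}

int main(void)
{
	struct sigaction sa = {
		.sa_sigaction = hwpoison_sigbus,
		.sa_flags     = SA_SIGINFO,
	};

	sigemptyset(&sa.sa_mask);
	sigaction(SIGBUS, &sa, NULL);
	/* ... map guest memory and run the guest ... */
	pause();
	return 0;
}
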
[marcelo: use send_sig_info instead] Reported-by: Dean Nelson Signed-off-by: Huang Ying Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index eb65b9c5ea4..908ea5464a5 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2251,22 +2251,24 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, return pt_write; } -static void kvm_send_hwpoison_signal(struct kvm *kvm, gfn_t gfn) +static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk) { - char buf[1]; - void __user *hva; - int r; + siginfo_t info; + + info.si_signo = SIGBUS; + info.si_errno = 0; + info.si_code = BUS_MCEERR_AR; + info.si_addr = (void __user *)address; + info.si_addr_lsb = PAGE_SHIFT; - /* Touch the page, so send SIGBUS */ - hva = (void __user *)gfn_to_hva(kvm, gfn); - r = copy_from_user(buf, hva, 1); + send_sig_info(SIGBUS, &info, tsk); } static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn) { kvm_release_pfn_clean(pfn); if (is_hwpoison_pfn(pfn)) { - kvm_send_hwpoison_signal(kvm, gfn); + kvm_send_hwpoison_signal(gfn_to_hva(kvm, gfn), current); return 0; } else if (is_fault_pfn(pfn)) return -EFAULT; -- cgit v1.2.3-18-g5258 From 2c78ffeca98fcd5a1dfd4a322438944506ed5e64 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 25 Oct 2010 08:41:09 +0200 Subject: x86/oprofile: Fix uninitialized variable use in debug printk Stephen Rothwell reported this build warning: arch/x86/oprofile/op_model_amd.c: In function 'ibs_eilvt_valid': arch/x86/oprofile/op_model_amd.c:289: warning: 'offset' may be used uninitialized in this function And correctly observed that indeed the variable is used uninitialized in this function. The result of this bug can be a debug printk with a bogus value. 
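Reduced to its shape, the corrected flow computes the offset before either error path can print it, with symmetric early returns. The sketch below is a simplified stand-in, not the kernel function; the IBSCTL_* values are invented for illustration:

#include <stdio.h>

/* Simplified stand-in for the fixed ibs_eilvt_valid(); values invented. */
#define IBSCTL_LVT_OFFSET_MASK	0x0f
#define IBSCTL_LVT_OFFSET_VALID	(1u << 8)

static int eilvt_valid_sketch(unsigned int val)
{
	int offset = val & IBSCTL_LVT_OFFSET_MASK;	/* assigned before any use */

	if (!(val & IBSCTL_LVT_OFFSET_VALID)) {
		printf("invalid IBS interrupt offset %d (val=0x%x)\n", offset, val);
		return 0;
	}

	/* the real code goes on to check eilvt_is_available(offset) the same way */
	return 1;
}

int main(void)
{
	return !eilvt_valid_sketch(0x123);	/* bit 8 set: valid, offset 3 */
}
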
Also fix a few more small details that made this function hard to read and which probably contributed to the bug being introduced to begin with: - Use more symmetric error conditions - Remove the !0 obfuscation - Add newlines to the printk output - Remove bogus linebreaks in printk strings and elsewhere Reported-by: Stephen Rothwell Cc: Robert Richter Cc: Linus Torvalds LKML-Reference: <20101025115736.41d51abe.sfr@canb.auug.org.au> Signed-off-by: Ingo Molnar --- arch/x86/oprofile/op_model_amd.c | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 42fb46f8388..68759e716f0 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -281,29 +281,25 @@ static inline int eilvt_is_available(int offset) static inline int ibs_eilvt_valid(void) { - u64 val; int offset; + u64 val; rdmsrl(MSR_AMD64_IBSCTL, val); + offset = val & IBSCTL_LVT_OFFSET_MASK; + if (!(val & IBSCTL_LVT_OFFSET_VALID)) { - pr_err(FW_BUG "cpu %d, invalid IBS " - "interrupt offset %d (MSR%08X=0x%016llx)", - smp_processor_id(), offset, - MSR_AMD64_IBSCTL, val); + pr_err(FW_BUG "cpu %d, invalid IBS interrupt offset %d (MSR%08X=0x%016llx)\n", + smp_processor_id(), offset, MSR_AMD64_IBSCTL, val); return 0; } - offset = val & IBSCTL_LVT_OFFSET_MASK; - - if (eilvt_is_available(offset)) - return !0; - - pr_err(FW_BUG "cpu %d, IBS interrupt offset %d " - "not available (MSR%08X=0x%016llx)", - smp_processor_id(), offset, - MSR_AMD64_IBSCTL, val); + if (!eilvt_is_available(offset)) { + pr_err(FW_BUG "cpu %d, IBS interrupt offset %d not available (MSR%08X=0x%016llx)\n", + smp_processor_id(), offset, MSR_AMD64_IBSCTL, val); + return 0; + } - return 0; + return 1; } static inline int get_ibs_offset(void) -- cgit v1.2.3-18-g5258 From 9afd281a152702143961c09b5482a66eeefe5e03 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 25 Oct 2010 18:15:22 +0200 Subject: x86-32, mm: Remove duplicated include Commit b40827fa7268 ("x86-32, mm: Add an initial page table for core bootstrapping") added an include directive which is needless and is taken care of by a previous one. Remove it. Caught-by: Jaswinder Singh Rajput Signed-off-by: Borislav Petkov Signed-off-by: Linus Torvalds --- arch/x86/kernel/acpi/sleep.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 74a847835ba..69fd72aa559 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -15,7 +15,6 @@ #ifdef CONFIG_X86_32 #include -#include #endif #include "realmode/wakeup.h" -- cgit v1.2.3-18-g5258 From 610470ce804f0326ca63fbcdc5be06b750debeb1 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 25 Oct 2010 18:25:23 +0200 Subject: x86-32, mm: Remove duplicated #include b40827fa7268fda8a62490728a61c2856f33830b added an include directive which is needless and is taken care of by a previous one. Remove it. Caught-by: Jaswinder Singh Rajput Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Jaswinder Singh Rajput Cc: H. 
Peter Anvin LKML-Reference: <20101025162523.GA4712@a1.tnic> Signed-off-by: Ingo Molnar --- arch/x86/kernel/acpi/sleep.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 74a847835ba..69fd72aa559 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -15,7 +15,6 @@ #ifdef CONFIG_X86_32 #include -#include #endif #include "realmode/wakeup.h" -- cgit v1.2.3-18-g5258 From 45263cb0993de738e158c625c84a5feb18bed317 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Mon, 25 Oct 2010 16:32:29 -0700 Subject: xen: include xen/xen.h for definition of xen_initial_domain() CC arch/x86/xen/setup.o arch/x86/xen/setup.c: In function 'xen_memory_setup': arch/x86/xen/setup.c:161: error: implicit declaration of function 'xen_initial_domain' Signed-off-by: Ian Campbell Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/setup.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 0ce9d58cb29..8e2c9f21fa3 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -17,6 +17,7 @@ #include #include +#include #include #include #include -- cgit v1.2.3-18-g5258 From ea5b8f73933e34d2b47a65284c46d26d49e7edb9 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Tue, 26 Oct 2010 17:28:33 +0100 Subject: xen: initialize cpu masks for pv guests in xen_smp_init Pv guests don't have ACPI and need the cpu masks to be set correctly as early as possible so we call xen_fill_possible_map from xen_smp_init. On the other hand the initial domain supports ACPI so in this case we skip xen_fill_possible_map and rely on it. However Xen might limit the number of cpus usable by the domain, so we filter those masks during smp initialization using the VCPUOP_is_up hypercall. It is important that the filtering is done before xen_setup_vcpu_info_placement. 
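Condensed, the resulting split looks like the sketch below -- a distillation of the hunks that follow, not a drop-in: the num_processors/disabled_cpus bookkeeping and the dom0/domU guards are simplified, but the VCPUOP_is_up probe per possible VCPU is the same in both paths:

/* Sketch only, simplified from the patch below. */
static void xen_probe_vcpus_sketch(int building_map)
{
	int i;

	for (i = 0; i < nr_cpu_ids; i++) {
		int up = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL) >= 0;

		if (building_map)	/* PV guest: no ACPI, build the map */
			set_cpu_possible(i, up);
		else if (!up)		/* dom0: drop VCPUs Xen withholds   */
			set_cpu_possible(i, false);
	}
}
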
Signed-off-by: Stefano Stabellini --- arch/x86/xen/smp.c | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 138676781dd..834dfeb54e3 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -28,6 +28,7 @@ #include #include +#include #include #include @@ -156,6 +157,25 @@ static void __init xen_fill_possible_map(void) { int i, rc; + if (xen_initial_domain()) + return; + + for (i = 0; i < nr_cpu_ids; i++) { + rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); + if (rc >= 0) { + num_processors++; + set_cpu_possible(i, true); + } + } +} + +static void __init xen_filter_cpu_maps(void) +{ + int i, rc; + + if (!xen_initial_domain()) + return; + num_processors = 0; disabled_cpus = 0; for (i = 0; i < nr_cpu_ids; i++) { @@ -179,6 +199,7 @@ static void __init xen_smp_prepare_boot_cpu(void) old memory can be recycled */ make_lowmem_page_readwrite(xen_initial_gdt); + xen_filter_cpu_maps(); xen_setup_vcpu_info_placement(); } @@ -195,8 +216,6 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus) if (xen_smp_intr_init(0)) BUG(); - xen_fill_possible_map(); - if (!alloc_cpumask_var(&xen_cpu_initialized_map, GFP_KERNEL)) panic("could not allocate xen_cpu_initialized_map\n"); @@ -487,5 +506,6 @@ static const struct smp_ops xen_smp_ops __initdata = { void __init xen_smp_init(void) { smp_ops = xen_smp_ops; + xen_fill_possible_map(); xen_init_spinlocks(); } -- cgit v1.2.3-18-g5258 From c8f730b1ab825f06733e1c074264f0078721f365 Mon Sep 17 00:00:00 2001 From: Russ Anderson Date: Tue, 26 Oct 2010 16:27:28 -0500 Subject: x86, uv: Enable Westmere support on SGI UV Enable Westmere support on SGI UV. The UV initialization code is dependent on the APICID bits. Westmere-EX uses different APIC bit mapping than Nehalem-EX. This code reads the apic shift value from a UV MMR to do the proper bit decoding to determint the pnode. Signed-off-by: Russ Anderson LKML-Reference: <20101026212728.GB15071@sgi.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/uv/uv_hub.h | 21 ++++++++++++++++++--- arch/x86/kernel/apic/x2apic_uv_x.c | 25 +++++++++++++++++++++++-- 2 files changed, 41 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index bf6b88ef8ee..e969f691cbf 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -5,7 +5,7 @@ * * SGI UV architectural definitions * - * Copyright (C) 2007-2008 Silicon Graphics, Inc. All rights reserved. + * Copyright (C) 2007-2010 Silicon Graphics, Inc. All rights reserved. */ #ifndef _ASM_X86_UV_UV_HUB_H @@ -77,7 +77,8 @@ * * 1111110000000000 * 5432109876543210 - * pppppppppplc0cch + * pppppppppplc0cch Nehalem-EX + * ppppppppplcc0cch Westmere-EX * sssssssssss * * p = pnode bits @@ -148,12 +149,25 @@ struct uv_hub_info_s { unsigned char m_val; unsigned char n_val; struct uv_scir_s scir; + unsigned char apic_pnode_shift; }; DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); #define uv_hub_info (&__get_cpu_var(__uv_hub_info)) #define uv_cpu_hub_info(cpu) (&per_cpu(__uv_hub_info, cpu)) +union uvh_apicid { + unsigned long v; + struct uvh_apicid_s { + unsigned long local_apic_mask : 24; + unsigned long local_apic_shift : 5; + unsigned long unused1 : 3; + unsigned long pnode_mask : 24; + unsigned long pnode_shift : 5; + unsigned long unused2 : 3; + } s; +}; + /* * Local & Global MMR space macros. 
* Note: macros are intended to be used ONLY by inline functions @@ -182,6 +196,7 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); #define UV_GLOBAL_MMR64_PNODE_BITS(p) \ (((unsigned long)(p)) << UV_GLOBAL_MMR64_PNODE_SHIFT) +#define UVH_APICID 0x002D0E00L #define UV_APIC_PNODE_SHIFT 6 /* Local Bus from cpu's perspective */ @@ -280,7 +295,7 @@ static inline void *uv_pnode_offset_to_vaddr(int pnode, unsigned long offset) */ static inline int uv_apicid_to_pnode(int apicid) { - return (apicid >> UV_APIC_PNODE_SHIFT); + return (apicid >> uv_hub_info->apic_pnode_shift); } /* diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index f744f54cb24..0a2918eaab3 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -5,7 +5,7 @@ * * SGI UV APIC functions (note: not an Intel compatible APIC) * - * Copyright (C) 2007-2009 Silicon Graphics, Inc. All rights reserved. + * Copyright (C) 2007-2010 Silicon Graphics, Inc. All rights reserved. */ #include #include @@ -41,6 +41,7 @@ DEFINE_PER_CPU(int, x2apic_extra_bits); static enum uv_system_type uv_system_type; static u64 gru_start_paddr, gru_end_paddr; +static union uvh_apicid uvh_apicid; int uv_min_hub_revision_id; EXPORT_SYMBOL_GPL(uv_min_hub_revision_id); static DEFINE_SPINLOCK(uv_nmi_lock); @@ -70,6 +71,22 @@ static int early_get_nodeid(void) return node_id.s.node_id; } +static int __init early_get_apic_pnode_shift(void) +{ + unsigned long *mmr; + + mmr = early_ioremap(UV_LOCAL_MMR_BASE | UVH_APICID, sizeof(*mmr)); + uvh_apicid.v = *mmr; + early_iounmap(mmr, sizeof(*mmr)); + if (!uvh_apicid.v) + /* + * Old bios, use default value + */ + uvh_apicid.s.pnode_shift = UV_APIC_PNODE_SHIFT; + + return uvh_apicid.s.pnode_shift; +} + static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { int nodeid; @@ -84,7 +101,7 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) uv_system_type = UV_X2APIC; else if (!strcmp(oem_table_id, "UVH")) { __get_cpu_var(x2apic_extra_bits) = - nodeid << (UV_APIC_PNODE_SHIFT - 1); + nodeid << (early_get_apic_pnode_shift() - 1); uv_system_type = UV_NON_UNIQUE_APIC; return 1; } @@ -716,6 +733,10 @@ void __init uv_system_init(void) int apicid = per_cpu(x86_cpu_to_apicid, cpu); nid = cpu_to_node(cpu); + /* + * apic_pnode_shift must be set before calling uv_apicid_to_pnode(); + */ + uv_cpu_hub_info(cpu)->apic_pnode_shift = uvh_apicid.s.pnode_shift; pnode = uv_apicid_to_pnode(apicid); blade = boot_pnode_to_blade(pnode); lcpu = uv_blade_info[blade].nr_possible_cpus; -- cgit v1.2.3-18-g5258 From dc9887dc02e37bcf83f4e792aa14b07782ef54cf Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 26 Oct 2010 15:41:44 -0600 Subject: x86/PCI: allocate space from the end of a region, not the beginning Allocate from the end of a region, not the beginning. For example, if we need to allocate 0x800 bytes for a device on bus 0000:00 given these resources: [mem 0xbff00000-0xdfffffff] PCI Bus 0000:00 [mem 0xc0000000-0xdfffffff] PCI Bus 0000:02 the available space at [mem 0xbff00000-0xbfffffff] is passed to the alignment callback (pcibios_align_resource()). Prior to this patch, we would put the new 0x800 byte resource at the beginning of that available space, i.e., at [mem 0xbff00000-0xbff007ff]. With this patch, we put it at the end, at [mem 0xbffff800-0xbfffffff]. 
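The placement quoted above falls out of a single round_down() call; as a sanity check, a standalone user-space sketch of the same arithmetic (values taken from the example above, round_down() re-derived here for power-of-two alignment) reproduces it:

#include <stdio.h>

/* round_down(): clear the low bits of x below 'align' (power of two). */
#define round_down(x, align)	((x) & ~((align) - 1))

int main(void)
{
	unsigned long long res_end = 0xbfffffffULL;	/* end of the available gap  */
	unsigned long long size    = 0x800ULL;		/* space the device needs    */
	unsigned long long align   = 0x800ULL;		/* BAR alignment == its size */

	/* highest suitably aligned start that still leaves 'size' bytes */
	unsigned long long start = round_down(res_end - size + 1, align);

	printf("[mem 0x%llx-0x%llx]\n", start, start + size - 1);
	/* prints [mem 0xbffff800-0xbfffffff], matching the commit message */
	return 0;
}
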
Reference: https://bugzilla.kernel.org/show_bug.cgi?id=16228#c41 Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- arch/x86/pci/i386.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index 55253095be8..826140af3c3 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -65,16 +65,21 @@ pcibios_align_resource(void *data, const struct resource *res, resource_size_t size, resource_size_t align) { struct pci_dev *dev = data; - resource_size_t start = res->start; + resource_size_t start = round_down(res->end - size + 1, align); if (res->flags & IORESOURCE_IO) { - if (skip_isa_ioresource_align(dev)) - return start; - if (start & 0x300) - start = (start + 0x3ff) & ~0x3ff; + + /* + * If we're avoiding ISA aliases, the largest contiguous I/O + * port space is 256 bytes. Clearing bits 9 and 10 preserves + * all 256-byte and smaller alignments, so the result will + * still be correctly aligned. + */ + if (!skip_isa_ioresource_align(dev)) + start &= ~0x300; } else if (res->flags & IORESOURCE_MEM) { if (start < BIOS_END) - start = BIOS_END; + start = res->end; /* fail; no space */ } return start; } -- cgit v1.2.3-18-g5258 From 419afdf53cca794a190014593b4778e2e9d64cf3 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 26 Oct 2010 15:41:49 -0600 Subject: x86: update iomem_resource end based on CPU physical address capabilities The iomem_resource map reflects the available physical address space. We statically initialize the end to -1, i.e., 0xffffffff_ffffffff, but of course we can only use as much as the CPU can address. This patch updates the end based on the CPU capabilities, so we don't mistakenly allocate space that isn't usable, as we're likely to do when allocating from the top-down. Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- arch/x86/kernel/setup.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index c3a4fbb2b99..922b5a1f978 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -788,6 +788,7 @@ void __init setup_arch(char **cmdline_p) x86_init.oem.arch_setup(); + iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1; setup_memory_map(); parse_setup_data(); /* update the e820_saved too */ -- cgit v1.2.3-18-g5258 From 1af3c2e45e7a641e774bbb84fa428f2f0bf2d9c9 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 26 Oct 2010 15:41:54 -0600 Subject: x86: allocate space within a region top-down Request that allocate_resource() use available space from high addresses first, rather than the default of using low addresses first. The most common place this makes a difference is when we move or assign new PCI device resources. Low addresses are generally scarce, so it's better to use high addresses when possible. This follows Windows practice for PCI allocation. 
Reference: https://bugzilla.kernel.org/show_bug.cgi?id=16228#c42 Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- arch/x86/kernel/setup.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 922b5a1f978..0fe76df866d 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -788,6 +788,7 @@ void __init setup_arch(char **cmdline_p) x86_init.oem.arch_setup(); + resource_alloc_from_bottom = 0; iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1; setup_memory_map(); parse_setup_data(); -- cgit v1.2.3-18-g5258 From 3e4d3af501cccdc8a8cca41bdbe57d54ad7e7e73 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 Oct 2010 14:21:51 -0700 Subject: mm: stack based kmap_atomic() Keep the current interface but ignore the KM_type and use a stack based approach. The advantage is that we get rid of crappy code like: #define __KM_PTE \ (in_nmi() ? KM_NMI_PTE : \ in_irq() ? KM_IRQ_PTE : \ KM_PTE0) and in general can stop worrying about what context we're in and what kmap slots might be appropriate for that. The downside is that FRV kmap_atomic() gets more expensive. For now we use a CPP trick suggested by Andrew: #define kmap_atomic(page, args...) __kmap_atomic(page) to avoid having to touch all kmap_atomic() users in a single patch. [ not compiled on: - mn10300: the arch doesn't actually build with highmem to begin with ] [akpm@linux-foundation.org: coding-style fixes] [akpm@linux-foundation.org: fix up drivers/gpu/drm/i915/intel_overlay.c] Acked-by: Rik van Riel Signed-off-by: Peter Zijlstra Acked-by: Chris Metcalf Cc: David Howells Cc: Hugh Dickins Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Steven Rostedt Cc: Russell King Cc: Ralf Baechle Cc: David Miller Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Cc: Dave Airlie Cc: Li Zefan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/highmem.h | 11 +++--- arch/x86/include/asm/iomap.h | 4 +-- arch/x86/kernel/crash_dump_32.c | 2 +- arch/x86/mm/highmem_32.c | 75 ++++++++++++++++++++++------------------- arch/x86/mm/iomap_32.c | 42 +++++++++++++---------- 5 files changed, 75 insertions(+), 59 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h index 8caac76ac32..3bd04022fd0 100644 --- a/arch/x86/include/asm/highmem.h +++ b/arch/x86/include/asm/highmem.h @@ -59,11 +59,12 @@ extern void kunmap_high(struct page *page); void *kmap(struct page *page); void kunmap(struct page *page); -void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot); -void *kmap_atomic(struct page *page, enum km_type type); -void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type); -void *kmap_atomic_pfn(unsigned long pfn, enum km_type type); -void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot); + +void *kmap_atomic_prot(struct page *page, pgprot_t prot); +void *__kmap_atomic(struct page *page); +void __kunmap_atomic(void *kvaddr); +void *kmap_atomic_pfn(unsigned long pfn); +void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot); struct page *kmap_atomic_to_page(void *ptr); #define flush_cache_kmaps() do { } while (0) diff --git a/arch/x86/include/asm/iomap.h b/arch/x86/include/asm/iomap.h index c4191b3b705..363e33eb6ec 100644 --- a/arch/x86/include/asm/iomap.h +++ b/arch/x86/include/asm/iomap.h @@ -27,10 +27,10 @@ #include void __iomem * -iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t 
prot); +iomap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot); void -iounmap_atomic(void __iomem *kvaddr, enum km_type type); +iounmap_atomic(void __iomem *kvaddr); int iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot); diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c index 67414550c3c..d5cd13945d5 100644 --- a/arch/x86/kernel/crash_dump_32.c +++ b/arch/x86/kernel/crash_dump_32.c @@ -61,7 +61,7 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, if (!is_crashed_pfn_valid(pfn)) return -EFAULT; - vaddr = kmap_atomic_pfn(pfn, KM_PTE0); + vaddr = kmap_atomic_pfn(pfn); if (!userbuf) { memcpy(buf, (vaddr + offset), csize); diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 5e8fa12ef86..d723e369003 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -9,6 +9,7 @@ void *kmap(struct page *page) return page_address(page); return kmap_high(page); } +EXPORT_SYMBOL(kmap); void kunmap(struct page *page) { @@ -18,6 +19,7 @@ void kunmap(struct page *page) return; kunmap_high(page); } +EXPORT_SYMBOL(kunmap); /* * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because @@ -27,10 +29,10 @@ void kunmap(struct page *page) * However when holding an atomic kmap it is not legal to sleep, so atomic * kmaps are appropriate for short, tight code paths only. */ -void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) +void *kmap_atomic_prot(struct page *page, pgprot_t prot) { - enum fixed_addresses idx; unsigned long vaddr; + int idx, type; /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ pagefault_disable(); @@ -38,8 +40,7 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) if (!PageHighMem(page)) return page_address(page); - debug_kmap_atomic(type); - + type = kmap_atomic_idx_push(); idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); BUG_ON(!pte_none(*(kmap_pte-idx))); @@ -47,44 +48,56 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) return (void *)vaddr; } +EXPORT_SYMBOL(kmap_atomic_prot); + +void *__kmap_atomic(struct page *page) +{ + return kmap_atomic_prot(page, kmap_prot); +} +EXPORT_SYMBOL(__kmap_atomic); -void *kmap_atomic(struct page *page, enum km_type type) +/* + * This is the same as kmap_atomic() but can map memory that doesn't + * have a struct page associated with it. + */ +void *kmap_atomic_pfn(unsigned long pfn) { - return kmap_atomic_prot(page, type, kmap_prot); + return kmap_atomic_prot_pfn(pfn, kmap_prot); } +EXPORT_SYMBOL_GPL(kmap_atomic_pfn); -void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type) +void __kunmap_atomic(void *kvaddr) { unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; - enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); - - /* - * Force other mappings to Oops if they'll try to access this pte - * without first remap it. Keeping stale mappings around is a bad idea - * also, in case the page changes cacheability attributes or becomes - * a protected page in a hypervisor. 
- */ - if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx)) + + if (vaddr >= __fix_to_virt(FIX_KMAP_END) && + vaddr <= __fix_to_virt(FIX_KMAP_BEGIN)) { + int idx, type; + + type = kmap_atomic_idx_pop(); + idx = type + KM_TYPE_NR * smp_processor_id(); + +#ifdef CONFIG_DEBUG_HIGHMEM + WARN_ON_ONCE(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); +#endif + /* + * Force other mappings to Oops if they'll try to access this + * pte without first remap it. Keeping stale mappings around + * is a bad idea also, in case the page changes cacheability + * attributes or becomes a protected page in a hypervisor. + */ kpte_clear_flush(kmap_pte-idx, vaddr); - else { + } #ifdef CONFIG_DEBUG_HIGHMEM + else { BUG_ON(vaddr < PAGE_OFFSET); BUG_ON(vaddr >= (unsigned long)high_memory); -#endif } +#endif pagefault_enable(); } - -/* - * This is the same as kmap_atomic() but can map memory that doesn't - * have a struct page associated with it. - */ -void *kmap_atomic_pfn(unsigned long pfn, enum km_type type) -{ - return kmap_atomic_prot_pfn(pfn, type, kmap_prot); -} -EXPORT_SYMBOL_GPL(kmap_atomic_pfn); /* temporarily in use by i915 GEM until vmap */ +EXPORT_SYMBOL(__kunmap_atomic); struct page *kmap_atomic_to_page(void *ptr) { @@ -98,12 +111,6 @@ struct page *kmap_atomic_to_page(void *ptr) pte = kmap_pte - (idx - FIX_KMAP_BEGIN); return pte_page(*pte); } - -EXPORT_SYMBOL(kmap); -EXPORT_SYMBOL(kunmap); -EXPORT_SYMBOL(kmap_atomic); -EXPORT_SYMBOL(kunmap_atomic_notypecheck); -EXPORT_SYMBOL(kmap_atomic_prot); EXPORT_SYMBOL(kmap_atomic_to_page); void __init set_highmem_pages_init(void) diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c index 72fc70cf618..75a3d7f24a2 100644 --- a/arch/x86/mm/iomap_32.c +++ b/arch/x86/mm/iomap_32.c @@ -48,21 +48,20 @@ int iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot) } EXPORT_SYMBOL_GPL(iomap_create_wc); -void -iomap_free(resource_size_t base, unsigned long size) +void iomap_free(resource_size_t base, unsigned long size) { io_free_memtype(base, base + size); } EXPORT_SYMBOL_GPL(iomap_free); -void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot) +void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot) { - enum fixed_addresses idx; unsigned long vaddr; + int idx, type; pagefault_disable(); - debug_kmap_atomic(type); + type = kmap_atomic_idx_push(); idx = type + KM_TYPE_NR * smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); set_pte(kmap_pte - idx, pfn_pte(pfn, prot)); @@ -72,10 +71,10 @@ void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot) } /* - * Map 'pfn' using fixed map 'type' and protections 'prot' + * Map 'pfn' using protections 'prot' */ void __iomem * -iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot) +iomap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot) { /* * For non-PAT systems, promote PAGE_KERNEL_WC to PAGE_KERNEL_UC_MINUS. 
@@ -86,24 +85,33 @@ iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot) if (!pat_enabled && pgprot_val(prot) == pgprot_val(PAGE_KERNEL_WC)) prot = PAGE_KERNEL_UC_MINUS; - return (void __force __iomem *) kmap_atomic_prot_pfn(pfn, type, prot); + return (void __force __iomem *) kmap_atomic_prot_pfn(pfn, prot); } EXPORT_SYMBOL_GPL(iomap_atomic_prot_pfn); void -iounmap_atomic(void __iomem *kvaddr, enum km_type type) +iounmap_atomic(void __iomem *kvaddr) { unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; - enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); - /* - * Force other mappings to Oops if they'll try to access this pte - * without first remap it. Keeping stale mappings around is a bad idea - * also, in case the page changes cacheability attributes or becomes - * a protected page in a hypervisor. - */ - if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx)) + if (vaddr >= __fix_to_virt(FIX_KMAP_END) && + vaddr <= __fix_to_virt(FIX_KMAP_BEGIN)) { + int idx, type; + + type = kmap_atomic_idx_pop(); + idx = type + KM_TYPE_NR * smp_processor_id(); + +#ifdef CONFIG_DEBUG_HIGHMEM + WARN_ON_ONCE(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); +#endif + /* + * Force other mappings to Oops if they'll try to access this + * pte without first remap it. Keeping stale mappings around + * is a bad idea also, in case the page changes cacheability + * attributes or becomes a protected page in a hypervisor. + */ kpte_clear_flush(kmap_pte-idx, vaddr); + } pagefault_enable(); } -- cgit v1.2.3-18-g5258 From ece0e2b6406a995c371e0311190631ea34ad851a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 Oct 2010 14:21:52 -0700 Subject: mm: remove pte_*map_nested() Since we no longer need to provide KM_type, the whole pte_*map_nested() API is now redundant, remove it. Signed-off-by: Peter Zijlstra Acked-by: Chris Metcalf Cc: David Howells Cc: Hugh Dickins Cc: Rik van Riel Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Steven Rostedt Cc: Russell King Cc: Ralf Baechle Cc: David Miller Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/pgtable_32.h | 14 ++------------ arch/x86/include/asm/pgtable_64.h | 2 -- 2 files changed, 2 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h index 8abde9ec90b..0c92113c4cb 100644 --- a/arch/x86/include/asm/pgtable_32.h +++ b/arch/x86/include/asm/pgtable_32.h @@ -49,24 +49,14 @@ extern void set_pmd_pfn(unsigned long, unsigned long, pgprot_t); #endif #if defined(CONFIG_HIGHPTE) -#define __KM_PTE \ - (in_nmi() ? KM_NMI_PTE : \ - in_irq() ? 
KM_IRQ_PTE : \ - KM_PTE0) #define pte_offset_map(dir, address) \ - ((pte_t *)kmap_atomic(pmd_page(*(dir)), __KM_PTE) + \ + ((pte_t *)kmap_atomic(pmd_page(*(dir))) + \ pte_index((address))) -#define pte_offset_map_nested(dir, address) \ - ((pte_t *)kmap_atomic(pmd_page(*(dir)), KM_PTE1) + \ - pte_index((address))) -#define pte_unmap(pte) kunmap_atomic((pte), __KM_PTE) -#define pte_unmap_nested(pte) kunmap_atomic((pte), KM_PTE1) +#define pte_unmap(pte) kunmap_atomic((pte)) #else #define pte_offset_map(dir, address) \ ((pte_t *)page_address(pmd_page(*(dir))) + pte_index((address))) -#define pte_offset_map_nested(dir, address) pte_offset_map((dir), (address)) #define pte_unmap(pte) do { } while (0) -#define pte_unmap_nested(pte) do { } while (0) #endif /* Clear a kernel PTE and flush it from the TLB */ diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index f96ac9bedf7..f86da20347f 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -127,9 +127,7 @@ static inline int pgd_large(pgd_t pgd) { return 0; } /* x86-64 always has all page tables mapped. */ #define pte_offset_map(dir, address) pte_offset_kernel((dir), (address)) -#define pte_offset_map_nested(dir, address) pte_offset_kernel((dir), (address)) #define pte_unmap(pte) ((void)(pte))/* NOP */ -#define pte_unmap_nested(pte) ((void)(pte)) /* NOP */ #define update_mmu_cache(vma, address, ptep) do { } while (0) -- cgit v1.2.3-18-g5258 From 7a837d1bb7cb2bceb093ec639068626586a89234 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 Oct 2010 14:21:53 -0700 Subject: perf, x86: Fix up kmap_atomic() type Now that the KM_type stuff is history, clean up the compiler warning. Signed-off-by: Peter Zijlstra Acked-by: Chris Metcalf Cc: David Howells Cc: Hugh Dickins Cc: Rik van Riel Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Steven Rostedt Cc: Russell King Cc: Ralf Baechle Cc: David Miller Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/cpu/perf_event.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index fe73c1844a9..c1e8c7a5116 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -49,7 +49,6 @@ static unsigned long copy_from_user_nmi(void *to, const void __user *from, unsigned long n) { unsigned long offset, addr = (unsigned long)from; - int type = in_nmi() ? KM_NMI : KM_IRQ0; unsigned long size, len = 0; struct page *page; void *map; @@ -63,9 +62,9 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n) offset = addr & (PAGE_SIZE - 1); size = min(PAGE_SIZE - offset, n - len); - map = kmap_atomic(page, type); + map = kmap_atomic(page); memcpy(to, map+offset, size); - kunmap_atomic(map, type); + kunmap_atomic(map); put_page(page); len += size; -- cgit v1.2.3-18-g5258 From d065bd810b6deb67d4897a14bfe21f8eb526ba99 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Tue, 26 Oct 2010 14:21:57 -0700 Subject: mm: retry page fault when blocking on disk transfer This change reduces mmap_sem hold times that are caused by waiting for disk transfers when accessing file mapped VMAs. It introduces the VM_FAULT_ALLOW_RETRY flag, which indicates that the call site wants mmap_sem to be released if blocking on a pending disk transfer. 
In that case, filemap_fault() returns the VM_FAULT_RETRY status bit and do_page_fault() will then re-acquire mmap_sem and retry the page fault. It is expected that the retry will hit the same page which will now be cached, and thus it will complete with a low mmap_sem hold time. Tests: - microbenchmark: thread A mmaps a large file and does random read accesses to the mmaped area - achieves about 55 iterations/s. Thread B does mmap/munmap in a loop at a separate location - achieves 55 iterations/s before, 15000 iterations/s after. - We are seeing related effects in some applications in house, which show significant performance regressions when running without this change. [akpm@linux-foundation.org: fix warning & crash] Signed-off-by: Michel Lespinasse Acked-by: Rik van Riel Acked-by: Linus Torvalds Cc: Nick Piggin Reviewed-by: Wu Fengguang Cc: Ying Han Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Thomas Gleixner Acked-by: "H. Peter Anvin" Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/fault.c | 38 ++++++++++++++++++++++++++------------ 1 file changed, 26 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 852b319edbd..9b2345c9e0c 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -956,8 +956,10 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) struct task_struct *tsk; unsigned long address; struct mm_struct *mm; - int write; int fault; + int write = error_code & PF_WRITE; + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | + (write ? FAULT_FLAG_WRITE : 0); tsk = current; mm = tsk->mm; @@ -1068,6 +1070,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) bad_area_nosemaphore(regs, error_code, address); return; } +retry: down_read(&mm->mmap_sem); } else { /* @@ -1111,8 +1114,6 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) * we can handle it.. */ good_area: - write = error_code & PF_WRITE; - if (unlikely(access_error(error_code, write, vma))) { bad_area_access_error(regs, error_code, address); return; @@ -1123,21 +1124,34 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault: */ - fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0); + fault = handle_mm_fault(mm, vma, address, flags); if (unlikely(fault & VM_FAULT_ERROR)) { mm_fault_error(regs, error_code, address, fault); return; } - if (fault & VM_FAULT_MAJOR) { - tsk->maj_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, - regs, address); - } else { - tsk->min_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, - regs, address); + /* + * Major/minor page fault accounting is only done on the + * initial attempt. If we go through a retry, it is extremely + * likely that the page will be found in page cache at that point. + */ + if (flags & FAULT_FLAG_ALLOW_RETRY) { + if (fault & VM_FAULT_MAJOR) { + tsk->maj_flt++; + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, + regs, address); + } else { + tsk->min_flt++; + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, + regs, address); + } + if (fault & VM_FAULT_RETRY) { + /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk + * of starvation. 
*/ + flags &= ~FAULT_FLAG_ALLOW_RETRY; + goto retry; + } } check_v8086_mode(regs, address, tsk); -- cgit v1.2.3-18-g5258 From 68da336a14e16c2de95e987f3200995b707d7038 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Tue, 26 Oct 2010 14:21:58 -0700 Subject: x86: access_error API cleanup access_error() already takes error_code as an argument, so there is no need for an additional write flag. Signed-off-by: Michel Lespinasse Acked-by: Rik van Riel Cc: Nick Piggin Acked-by: Wu Fengguang Cc: Ying Han Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Thomas Gleixner Acked-by: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/fault.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 9b2345c9e0c..7d90ceb882a 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -919,9 +919,9 @@ spurious_fault(unsigned long error_code, unsigned long address) int show_unhandled_signals = 1; static inline int -access_error(unsigned long error_code, int write, struct vm_area_struct *vma) +access_error(unsigned long error_code, struct vm_area_struct *vma) { - if (write) { + if (error_code & PF_WRITE) { /* write, present and write, not present: */ if (unlikely(!(vma->vm_flags & VM_WRITE))) return 1; @@ -1114,7 +1114,7 @@ retry: * we can handle it.. */ good_area: - if (unlikely(access_error(error_code, write, vma))) { + if (unlikely(access_error(error_code, vma))) { bad_area_access_error(regs, error_code, address); return; } -- cgit v1.2.3-18-g5258 From 732eacc0542d0aa48797f675888b85d6065af837 Mon Sep 17 00:00:00 2001 From: Hagen Paul Pfeifer Date: Tue, 26 Oct 2010 14:22:23 -0700 Subject: replace nested max/min macros with {max,min}3 macro Use the new {max,min}3 macros to save some cycles and bytes on the stack. This patch substitutes trivial nested macros with their counterpart. Signed-off-by: Hagen Paul Pfeifer Cc: Joe Perches Cc: Ingo Molnar Cc: Hartley Sweeten Cc: Russell King Cc: Benjamin Herrenschmidt Cc: Thomas Gleixner Cc: Herbert Xu Cc: Roland Dreier Cc: Sean Hefty Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/cpu/intel_cacheinfo.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 12cd823c8d0..17ad0336621 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -327,6 +327,7 @@ static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3) l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13)); l3->indices = (max(max(max(sc0, sc1), sc2), sc3) << 10) - 1; + l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1; } static struct amd_l3_cache * __cpuinit amd_init_l3_cache(int node) -- cgit v1.2.3-18-g5258 From ca1cab37d91cbe8a8333732540d43cabb54cfa85 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 26 Oct 2010 14:22:34 -0700 Subject: workqueues: s/ON_STACK/ONSTACK/ Silly though it is, completions and wait_queue_heads use foo_ONSTACK (COMPLETION_INITIALIZER_ONSTACK, DECLARE_COMPLETION_ONSTACK, __WAIT_QUEUE_HEAD_INIT_ONSTACK and DECLARE_WAIT_QUEUE_HEAD_ONSTACK) so I guess workqueues should do the same thing. 
s/INIT_WORK_ON_STACK/INIT_WORK_ONSTACK/ s/INIT_DELAYED_WORK_ON_STACK/INIT_DELAYED_WORK_ONSTACK/ Cc: Peter Zijlstra Acked-by: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/hpet.c | 2 +- arch/x86/kernel/smpboot.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index aff0b3c2750..ae03cab4352 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -713,7 +713,7 @@ static int hpet_cpuhp_notify(struct notifier_block *n, switch (action & 0xf) { case CPU_ONLINE: - INIT_DELAYED_WORK_ON_STACK(&work.work, hpet_work); + INIT_DELAYED_WORK_ONSTACK(&work.work, hpet_work); init_completion(&work.complete); /* FIXME: add schedule_work_on() */ schedule_delayed_work_on(cpu, &work.work, 0); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 6af118511b4..6c7faecd9e4 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -747,7 +747,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu) .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done), }; - INIT_WORK_ON_STACK(&c_idle.work, do_fork_idle); + INIT_WORK_ONSTACK(&c_idle.work, do_fork_idle); alternatives_smp_switch(1); -- cgit v1.2.3-18-g5258 From 3adbb7f4a32dd34993ebe3829c69694f0c5fc85b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 16 Oct 2010 10:11:22 +0200 Subject: x86: Add platform directory x86 has finally arrived in the embedded nightmare and will rapidly grow SoC platform support in various flavours. So we need a place for the platform support files. That also allows us to clean up the dumpground which arch/x86/kernel has become over time. Signed-off-by: Thomas Gleixner --- arch/x86/Kbuild | 1 + arch/x86/platform/Makefile | 1 + 2 files changed, 2 insertions(+) create mode 100644 arch/x86/platform/Makefile (limited to 'arch/x86') diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild index ad8ec356fb3..0e103236b75 100644 --- a/arch/x86/Kbuild +++ b/arch/x86/Kbuild @@ -14,3 +14,4 @@ obj-y += crypto/ obj-y += vdso/ obj-$(CONFIG_IA32_EMULATION) += ia32/ +obj-y += platform/ diff --git a/arch/x86/platform/Makefile b/arch/x86/platform/Makefile new file mode 100644 index 00000000000..fdf4113befb --- /dev/null +++ b/arch/x86/platform/Makefile @@ -0,0 +1 @@ +# Platform specific code goes here -- cgit v1.2.3-18-g5258 From 937f961a6539b0ac5ebf31472b90810bc1f02200 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 16 Oct 2010 10:16:59 +0200 Subject: x86: Move sfi to platform Signed-off-by: Thomas Gleixner Cc: Len Brown Cc: Alan Cox --- arch/x86/kernel/Makefile | 1 - arch/x86/kernel/sfi.c | 120 ----------------------------------------- arch/x86/platform/Makefile | 1 + arch/x86/platform/sfi/Makefile | 1 + arch/x86/platform/sfi/sfi.c | 120 +++++++++++++++++++++++++++++++++++++++++ 5 files changed, 122 insertions(+), 121 deletions(-) delete mode 100644 arch/x86/kernel/sfi.c create mode 100644 arch/x86/platform/sfi/Makefile create mode 100644 arch/x86/platform/sfi/sfi.c (limited to 'arch/x86') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 2c833d8c414..d9067d10c6a 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -58,7 +58,6 @@ obj-$(CONFIG_INTEL_TXT) += tboot.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += cpu/ obj-y += acpi/ -obj-$(CONFIG_SFI) += sfi.o obj-y += reboot.o obj-$(CONFIG_MCA) += mca_32.o obj-$(CONFIG_X86_MSR) += msr.o diff --git a/arch/x86/kernel/sfi.c b/arch/x86/kernel/sfi.c deleted file mode 100644 index 
dd4c281ffe5..00000000000 --- a/arch/x86/kernel/sfi.c +++ /dev/null @@ -1,120 +0,0 @@ -/* - * sfi.c - x86 architecture SFI support. - * - * Copyright (c) 2009, Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., - * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. - * - */ - -#define KMSG_COMPONENT "SFI" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt - -#include -#include -#include -#include - -#include -#include -#include -#include - -#ifdef CONFIG_X86_LOCAL_APIC -static unsigned long sfi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; - -static void __init mp_sfi_register_lapic_address(unsigned long address) -{ - mp_lapic_addr = address; - - set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr); - if (boot_cpu_physical_apicid == -1U) - boot_cpu_physical_apicid = read_apic_id(); - - pr_info("Boot CPU = %d\n", boot_cpu_physical_apicid); -} - -/* All CPUs enumerated by SFI must be present and enabled */ -static void __cpuinit mp_sfi_register_lapic(u8 id) -{ - if (MAX_APICS - id <= 0) { - pr_warning("Processor #%d invalid (max %d)\n", - id, MAX_APICS); - return; - } - - pr_info("registering lapic[%d]\n", id); - - generic_processor_info(id, GET_APIC_VERSION(apic_read(APIC_LVR))); -} - -static int __init sfi_parse_cpus(struct sfi_table_header *table) -{ - struct sfi_table_simple *sb; - struct sfi_cpu_table_entry *pentry; - int i; - int cpu_num; - - sb = (struct sfi_table_simple *)table; - cpu_num = SFI_GET_NUM_ENTRIES(sb, struct sfi_cpu_table_entry); - pentry = (struct sfi_cpu_table_entry *)sb->pentry; - - for (i = 0; i < cpu_num; i++) { - mp_sfi_register_lapic(pentry->apic_id); - pentry++; - } - - smp_found_config = 1; - return 0; -} -#endif /* CONFIG_X86_LOCAL_APIC */ - -#ifdef CONFIG_X86_IO_APIC - -static int __init sfi_parse_ioapic(struct sfi_table_header *table) -{ - struct sfi_table_simple *sb; - struct sfi_apic_table_entry *pentry; - int i, num; - - sb = (struct sfi_table_simple *)table; - num = SFI_GET_NUM_ENTRIES(sb, struct sfi_apic_table_entry); - pentry = (struct sfi_apic_table_entry *)sb->pentry; - - for (i = 0; i < num; i++) { - mp_register_ioapic(i, pentry->phys_addr, gsi_top); - pentry++; - } - - WARN(pic_mode, KERN_WARNING - "SFI: pic_mod shouldn't be 1 when IOAPIC table is present\n"); - pic_mode = 0; - return 0; -} -#endif /* CONFIG_X86_IO_APIC */ - -/* - * sfi_platform_init(): register lapics & io-apics - */ -int __init sfi_platform_init(void) -{ -#ifdef CONFIG_X86_LOCAL_APIC - mp_sfi_register_lapic_address(sfi_lapic_addr); - sfi_table_parse(SFI_SIG_CPUS, NULL, NULL, sfi_parse_cpus); -#endif -#ifdef CONFIG_X86_IO_APIC - sfi_table_parse(SFI_SIG_APIC, NULL, NULL, sfi_parse_ioapic); -#endif - return 0; -} diff --git a/arch/x86/platform/Makefile b/arch/x86/platform/Makefile index fdf4113befb..a964fa3c086 100644 --- a/arch/x86/platform/Makefile +++ b/arch/x86/platform/Makefile @@ -1 +1,2 @@ # Platform specific code goes here +obj-y += sfi/ diff --git a/arch/x86/platform/sfi/Makefile 
b/arch/x86/platform/sfi/Makefile new file mode 100644 index 00000000000..cc5db1168a5 --- /dev/null +++ b/arch/x86/platform/sfi/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_SFI) += sfi.o diff --git a/arch/x86/platform/sfi/sfi.c b/arch/x86/platform/sfi/sfi.c new file mode 100644 index 00000000000..dd4c281ffe5 --- /dev/null +++ b/arch/x86/platform/sfi/sfi.c @@ -0,0 +1,120 @@ +/* + * sfi.c - x86 architecture SFI support. + * + * Copyright (c) 2009, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + * + */ + +#define KMSG_COMPONENT "SFI" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include + +#include +#include +#include +#include + +#ifdef CONFIG_X86_LOCAL_APIC +static unsigned long sfi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; + +static void __init mp_sfi_register_lapic_address(unsigned long address) +{ + mp_lapic_addr = address; + + set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr); + if (boot_cpu_physical_apicid == -1U) + boot_cpu_physical_apicid = read_apic_id(); + + pr_info("Boot CPU = %d\n", boot_cpu_physical_apicid); +} + +/* All CPUs enumerated by SFI must be present and enabled */ +static void __cpuinit mp_sfi_register_lapic(u8 id) +{ + if (MAX_APICS - id <= 0) { + pr_warning("Processor #%d invalid (max %d)\n", + id, MAX_APICS); + return; + } + + pr_info("registering lapic[%d]\n", id); + + generic_processor_info(id, GET_APIC_VERSION(apic_read(APIC_LVR))); +} + +static int __init sfi_parse_cpus(struct sfi_table_header *table) +{ + struct sfi_table_simple *sb; + struct sfi_cpu_table_entry *pentry; + int i; + int cpu_num; + + sb = (struct sfi_table_simple *)table; + cpu_num = SFI_GET_NUM_ENTRIES(sb, struct sfi_cpu_table_entry); + pentry = (struct sfi_cpu_table_entry *)sb->pentry; + + for (i = 0; i < cpu_num; i++) { + mp_sfi_register_lapic(pentry->apic_id); + pentry++; + } + + smp_found_config = 1; + return 0; +} +#endif /* CONFIG_X86_LOCAL_APIC */ + +#ifdef CONFIG_X86_IO_APIC + +static int __init sfi_parse_ioapic(struct sfi_table_header *table) +{ + struct sfi_table_simple *sb; + struct sfi_apic_table_entry *pentry; + int i, num; + + sb = (struct sfi_table_simple *)table; + num = SFI_GET_NUM_ENTRIES(sb, struct sfi_apic_table_entry); + pentry = (struct sfi_apic_table_entry *)sb->pentry; + + for (i = 0; i < num; i++) { + mp_register_ioapic(i, pentry->phys_addr, gsi_top); + pentry++; + } + + WARN(pic_mode, KERN_WARNING + "SFI: pic_mod shouldn't be 1 when IOAPIC table is present\n"); + pic_mode = 0; + return 0; +} +#endif /* CONFIG_X86_IO_APIC */ + +/* + * sfi_platform_init(): register lapics & io-apics + */ +int __init sfi_platform_init(void) +{ +#ifdef CONFIG_X86_LOCAL_APIC + mp_sfi_register_lapic_address(sfi_lapic_addr); + sfi_table_parse(SFI_SIG_CPUS, NULL, NULL, sfi_parse_cpus); +#endif +#ifdef CONFIG_X86_IO_APIC + sfi_table_parse(SFI_SIG_APIC, NULL, NULL, sfi_parse_ioapic); +#endif + return 0; +} -- cgit v1.2.3-18-g5258 
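The two patches above establish the build convention that the subsequent moves in this log follow: arch/x86/Kbuild always descends into platform/, and each platform subdirectory carries its own Makefile that gates its objects on the relevant Kconfig symbol. A minimal sketch of how a further platform directory would be wired up under this scheme is shown below; the "foo" directory and the CONFIG_X86_FOO symbol are hypothetical placeholders for illustration, not part of these patches.

    # arch/x86/platform/Makefile
    # Platform specific code goes here
    obj-y += foo/        # hypothetical new platform; always descend into the directory
    obj-y += sfi/

    # arch/x86/platform/foo/Makefile (hypothetical)
    obj-$(CONFIG_X86_FOO) += foo.o    # compiled only when CONFIG_X86_FOO is enabled

Keeping the obj-$(CONFIG_...) test in the per-platform Makefile places the conditional next to the code it controls, which is why the patch above moves the CONFIG_SFI line out of arch/x86/kernel/Makefile and into arch/x86/platform/sfi/Makefile rather than leaving it in the core kernel Makefile.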
From b17ed48040d9e8b6ae35bc492015bf0fe1c8bae4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 16 Oct 2010 10:19:54 +0200 Subject: x86: Move efi to platform Signed-off-by: Thomas Gleixner Cc: Huang Ying --- arch/x86/kernel/Makefile | 1 - arch/x86/kernel/efi.c | 613 ------------------------------------ arch/x86/kernel/efi_32.c | 112 ------- arch/x86/kernel/efi_64.c | 114 ------- arch/x86/kernel/efi_stub_32.S | 123 -------- arch/x86/kernel/efi_stub_64.S | 116 ------- arch/x86/platform/Makefile | 1 + arch/x86/platform/efi/Makefile | 1 + arch/x86/platform/efi/efi.c | 613 ++++++++++++++++++++++++++++++++++++ arch/x86/platform/efi/efi_32.c | 112 +++++++ arch/x86/platform/efi/efi_64.c | 114 +++++++ arch/x86/platform/efi/efi_stub_32.S | 123 ++++++++ arch/x86/platform/efi/efi_stub_64.S | 116 +++++++ 13 files changed, 1080 insertions(+), 1079 deletions(-) delete mode 100644 arch/x86/kernel/efi.c delete mode 100644 arch/x86/kernel/efi_32.c delete mode 100644 arch/x86/kernel/efi_64.c delete mode 100644 arch/x86/kernel/efi_stub_32.S delete mode 100644 arch/x86/kernel/efi_stub_64.S create mode 100644 arch/x86/platform/efi/Makefile create mode 100644 arch/x86/platform/efi/efi.c create mode 100644 arch/x86/platform/efi/efi_32.c create mode 100644 arch/x86/platform/efi/efi_64.c create mode 100644 arch/x86/platform/efi/efi_stub_32.S create mode 100644 arch/x86/platform/efi/efi_stub_64.S (limited to 'arch/x86') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index d9067d10c6a..b01c7b1ac1b 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -81,7 +81,6 @@ obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_MODULES) += module.o -obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o obj-$(CONFIG_KGDB) += kgdb.o obj-$(CONFIG_VM86) += vm86_32.o diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c deleted file mode 100644 index 0fe27d7c625..00000000000 --- a/arch/x86/kernel/efi.c +++ /dev/null @@ -1,613 +0,0 @@ -/* - * Common EFI (Extensible Firmware Interface) support functions - * Based on Extensible Firmware Interface Specification version 1.0 - * - * Copyright (C) 1999 VA Linux Systems - * Copyright (C) 1999 Walt Drummond - * Copyright (C) 1999-2002 Hewlett-Packard Co. - * David Mosberger-Tang - * Stephane Eranian - * Copyright (C) 2005-2008 Intel Co. - * Fenghua Yu - * Bibo Mao - * Chandramouli Narayanan - * Huang Ying - * - * Copied from efi_32.c to eliminate the duplicated code between EFI - * 32/64 support code. --ying 2007-10-26 - * - * All EFI Runtime Services are not implemented yet as EFI only - * supports physical mode addressing on SoftSDV. This is to be fixed - * in a future version. --drummond 1999-07-20 - * - * Implemented EFI runtime services and virtual mode calls. --davidm - * - * Goutham Rao: - * Skip non-WB memory and ignore empty memory ranges. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#define EFI_DEBUG 1 -#define PFX "EFI: " - -int efi_enabled; -EXPORT_SYMBOL(efi_enabled); - -struct efi efi; -EXPORT_SYMBOL(efi); - -struct efi_memory_map memmap; - -static struct efi efi_phys __initdata; -static efi_system_table_t efi_systab __initdata; - -static int __init setup_noefi(char *arg) -{ - efi_enabled = 0; - return 0; -} -early_param("noefi", setup_noefi); - -int add_efi_memmap; -EXPORT_SYMBOL(add_efi_memmap); - -static int __init setup_add_efi_memmap(char *arg) -{ - add_efi_memmap = 1; - return 0; -} -early_param("add_efi_memmap", setup_add_efi_memmap); - - -static efi_status_t virt_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc) -{ - return efi_call_virt2(get_time, tm, tc); -} - -static efi_status_t virt_efi_set_time(efi_time_t *tm) -{ - return efi_call_virt1(set_time, tm); -} - -static efi_status_t virt_efi_get_wakeup_time(efi_bool_t *enabled, - efi_bool_t *pending, - efi_time_t *tm) -{ - return efi_call_virt3(get_wakeup_time, - enabled, pending, tm); -} - -static efi_status_t virt_efi_set_wakeup_time(efi_bool_t enabled, efi_time_t *tm) -{ - return efi_call_virt2(set_wakeup_time, - enabled, tm); -} - -static efi_status_t virt_efi_get_variable(efi_char16_t *name, - efi_guid_t *vendor, - u32 *attr, - unsigned long *data_size, - void *data) -{ - return efi_call_virt5(get_variable, - name, vendor, attr, - data_size, data); -} - -static efi_status_t virt_efi_get_next_variable(unsigned long *name_size, - efi_char16_t *name, - efi_guid_t *vendor) -{ - return efi_call_virt3(get_next_variable, - name_size, name, vendor); -} - -static efi_status_t virt_efi_set_variable(efi_char16_t *name, - efi_guid_t *vendor, - unsigned long attr, - unsigned long data_size, - void *data) -{ - return efi_call_virt5(set_variable, - name, vendor, attr, - data_size, data); -} - -static efi_status_t virt_efi_get_next_high_mono_count(u32 *count) -{ - return efi_call_virt1(get_next_high_mono_count, count); -} - -static void virt_efi_reset_system(int reset_type, - efi_status_t status, - unsigned long data_size, - efi_char16_t *data) -{ - efi_call_virt4(reset_system, reset_type, status, - data_size, data); -} - -static efi_status_t virt_efi_set_virtual_address_map( - unsigned long memory_map_size, - unsigned long descriptor_size, - u32 descriptor_version, - efi_memory_desc_t *virtual_map) -{ - return efi_call_virt4(set_virtual_address_map, - memory_map_size, descriptor_size, - descriptor_version, virtual_map); -} - -static efi_status_t __init phys_efi_set_virtual_address_map( - unsigned long memory_map_size, - unsigned long descriptor_size, - u32 descriptor_version, - efi_memory_desc_t *virtual_map) -{ - efi_status_t status; - - efi_call_phys_prelog(); - status = efi_call_phys4(efi_phys.set_virtual_address_map, - memory_map_size, descriptor_size, - descriptor_version, virtual_map); - efi_call_phys_epilog(); - return status; -} - -static efi_status_t __init phys_efi_get_time(efi_time_t *tm, - efi_time_cap_t *tc) -{ - efi_status_t status; - - efi_call_phys_prelog(); - status = efi_call_phys2(efi_phys.get_time, tm, tc); - efi_call_phys_epilog(); - return status; -} - -int efi_set_rtc_mmss(unsigned long nowtime) -{ - int real_seconds, real_minutes; - efi_status_t status; - efi_time_t eft; - efi_time_cap_t cap; - - status = efi.get_time(&eft, &cap); - if (status != EFI_SUCCESS) { - printk(KERN_ERR "Oops: efitime: can't read 
time!\n"); - return -1; - } - - real_seconds = nowtime % 60; - real_minutes = nowtime / 60; - if (((abs(real_minutes - eft.minute) + 15)/30) & 1) - real_minutes += 30; - real_minutes %= 60; - eft.minute = real_minutes; - eft.second = real_seconds; - - status = efi.set_time(&eft); - if (status != EFI_SUCCESS) { - printk(KERN_ERR "Oops: efitime: can't write time!\n"); - return -1; - } - return 0; -} - -unsigned long efi_get_time(void) -{ - efi_status_t status; - efi_time_t eft; - efi_time_cap_t cap; - - status = efi.get_time(&eft, &cap); - if (status != EFI_SUCCESS) - printk(KERN_ERR "Oops: efitime: can't read time!\n"); - - return mktime(eft.year, eft.month, eft.day, eft.hour, - eft.minute, eft.second); -} - -/* - * Tell the kernel about the EFI memory map. This might include - * more than the max 128 entries that can fit in the e820 legacy - * (zeropage) memory map. - */ - -static void __init do_add_efi_memmap(void) -{ - void *p; - - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - efi_memory_desc_t *md = p; - unsigned long long start = md->phys_addr; - unsigned long long size = md->num_pages << EFI_PAGE_SHIFT; - int e820_type; - - switch (md->type) { - case EFI_LOADER_CODE: - case EFI_LOADER_DATA: - case EFI_BOOT_SERVICES_CODE: - case EFI_BOOT_SERVICES_DATA: - case EFI_CONVENTIONAL_MEMORY: - if (md->attribute & EFI_MEMORY_WB) - e820_type = E820_RAM; - else - e820_type = E820_RESERVED; - break; - case EFI_ACPI_RECLAIM_MEMORY: - e820_type = E820_ACPI; - break; - case EFI_ACPI_MEMORY_NVS: - e820_type = E820_NVS; - break; - case EFI_UNUSABLE_MEMORY: - e820_type = E820_UNUSABLE; - break; - default: - /* - * EFI_RESERVED_TYPE EFI_RUNTIME_SERVICES_CODE - * EFI_RUNTIME_SERVICES_DATA EFI_MEMORY_MAPPED_IO - * EFI_MEMORY_MAPPED_IO_PORT_SPACE EFI_PAL_CODE - */ - e820_type = E820_RESERVED; - break; - } - e820_add_region(start, size, e820_type); - } - sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); -} - -void __init efi_memblock_x86_reserve_range(void) -{ - unsigned long pmap; - -#ifdef CONFIG_X86_32 - pmap = boot_params.efi_info.efi_memmap; -#else - pmap = (boot_params.efi_info.efi_memmap | - ((__u64)boot_params.efi_info.efi_memmap_hi<<32)); -#endif - memmap.phys_map = (void *)pmap; - memmap.nr_map = boot_params.efi_info.efi_memmap_size / - boot_params.efi_info.efi_memdesc_size; - memmap.desc_version = boot_params.efi_info.efi_memdesc_version; - memmap.desc_size = boot_params.efi_info.efi_memdesc_size; - memblock_x86_reserve_range(pmap, pmap + memmap.nr_map * memmap.desc_size, - "EFI memmap"); -} - -#if EFI_DEBUG -static void __init print_efi_memmap(void) -{ - efi_memory_desc_t *md; - void *p; - int i; - - for (p = memmap.map, i = 0; - p < memmap.map_end; - p += memmap.desc_size, i++) { - md = p; - printk(KERN_INFO PFX "mem%02u: type=%u, attr=0x%llx, " - "range=[0x%016llx-0x%016llx) (%lluMB)\n", - i, md->type, md->attribute, md->phys_addr, - md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT), - (md->num_pages >> (20 - EFI_PAGE_SHIFT))); - } -} -#endif /* EFI_DEBUG */ - -void __init efi_init(void) -{ - efi_config_table_t *config_tables; - efi_runtime_services_t *runtime; - efi_char16_t *c16; - char vendor[100] = "unknown"; - int i = 0; - void *tmp; - -#ifdef CONFIG_X86_32 - efi_phys.systab = (efi_system_table_t *)boot_params.efi_info.efi_systab; -#else - efi_phys.systab = (efi_system_table_t *) - (boot_params.efi_info.efi_systab | - ((__u64)boot_params.efi_info.efi_systab_hi<<32)); -#endif - - efi.systab = early_ioremap((unsigned long)efi_phys.systab, - 
sizeof(efi_system_table_t)); - if (efi.systab == NULL) - printk(KERN_ERR "Couldn't map the EFI system table!\n"); - memcpy(&efi_systab, efi.systab, sizeof(efi_system_table_t)); - early_iounmap(efi.systab, sizeof(efi_system_table_t)); - efi.systab = &efi_systab; - - /* - * Verify the EFI Table - */ - if (efi.systab->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) - printk(KERN_ERR "EFI system table signature incorrect!\n"); - if ((efi.systab->hdr.revision >> 16) == 0) - printk(KERN_ERR "Warning: EFI system table version " - "%d.%02d, expected 1.00 or greater!\n", - efi.systab->hdr.revision >> 16, - efi.systab->hdr.revision & 0xffff); - - /* - * Show what we know for posterity - */ - c16 = tmp = early_ioremap(efi.systab->fw_vendor, 2); - if (c16) { - for (i = 0; i < sizeof(vendor) - 1 && *c16; ++i) - vendor[i] = *c16++; - vendor[i] = '\0'; - } else - printk(KERN_ERR PFX "Could not map the firmware vendor!\n"); - early_iounmap(tmp, 2); - - printk(KERN_INFO "EFI v%u.%.02u by %s\n", - efi.systab->hdr.revision >> 16, - efi.systab->hdr.revision & 0xffff, vendor); - - /* - * Let's see what config tables the firmware passed to us. - */ - config_tables = early_ioremap( - efi.systab->tables, - efi.systab->nr_tables * sizeof(efi_config_table_t)); - if (config_tables == NULL) - printk(KERN_ERR "Could not map EFI Configuration Table!\n"); - - printk(KERN_INFO); - for (i = 0; i < efi.systab->nr_tables; i++) { - if (!efi_guidcmp(config_tables[i].guid, MPS_TABLE_GUID)) { - efi.mps = config_tables[i].table; - printk(" MPS=0x%lx ", config_tables[i].table); - } else if (!efi_guidcmp(config_tables[i].guid, - ACPI_20_TABLE_GUID)) { - efi.acpi20 = config_tables[i].table; - printk(" ACPI 2.0=0x%lx ", config_tables[i].table); - } else if (!efi_guidcmp(config_tables[i].guid, - ACPI_TABLE_GUID)) { - efi.acpi = config_tables[i].table; - printk(" ACPI=0x%lx ", config_tables[i].table); - } else if (!efi_guidcmp(config_tables[i].guid, - SMBIOS_TABLE_GUID)) { - efi.smbios = config_tables[i].table; - printk(" SMBIOS=0x%lx ", config_tables[i].table); -#ifdef CONFIG_X86_UV - } else if (!efi_guidcmp(config_tables[i].guid, - UV_SYSTEM_TABLE_GUID)) { - efi.uv_systab = config_tables[i].table; - printk(" UVsystab=0x%lx ", config_tables[i].table); -#endif - } else if (!efi_guidcmp(config_tables[i].guid, - HCDP_TABLE_GUID)) { - efi.hcdp = config_tables[i].table; - printk(" HCDP=0x%lx ", config_tables[i].table); - } else if (!efi_guidcmp(config_tables[i].guid, - UGA_IO_PROTOCOL_GUID)) { - efi.uga = config_tables[i].table; - printk(" UGA=0x%lx ", config_tables[i].table); - } - } - printk("\n"); - early_iounmap(config_tables, - efi.systab->nr_tables * sizeof(efi_config_table_t)); - - /* - * Check out the runtime services table. We need to map - * the runtime services table so that we can grab the physical - * address of several of the EFI runtime functions, needed to - * set the firmware into virtual mode. - */ - runtime = early_ioremap((unsigned long)efi.systab->runtime, - sizeof(efi_runtime_services_t)); - if (runtime != NULL) { - /* - * We will only need *early* access to the following - * two EFI runtime services before set_virtual_address_map - * is invoked. - */ - efi_phys.get_time = (efi_get_time_t *)runtime->get_time; - efi_phys.set_virtual_address_map = - (efi_set_virtual_address_map_t *) - runtime->set_virtual_address_map; - /* - * Make efi_get_time can be called before entering - * virtual mode. 
- */ - efi.get_time = phys_efi_get_time; - } else - printk(KERN_ERR "Could not map the EFI runtime service " - "table!\n"); - early_iounmap(runtime, sizeof(efi_runtime_services_t)); - - /* Map the EFI memory map */ - memmap.map = early_ioremap((unsigned long)memmap.phys_map, - memmap.nr_map * memmap.desc_size); - if (memmap.map == NULL) - printk(KERN_ERR "Could not map the EFI memory map!\n"); - memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size); - - if (memmap.desc_size != sizeof(efi_memory_desc_t)) - printk(KERN_WARNING - "Kernel-defined memdesc doesn't match the one from EFI!\n"); - - if (add_efi_memmap) - do_add_efi_memmap(); - -#ifdef CONFIG_X86_32 - x86_platform.get_wallclock = efi_get_time; - x86_platform.set_wallclock = efi_set_rtc_mmss; -#endif - - /* Setup for EFI runtime service */ - reboot_type = BOOT_EFI; - -#if EFI_DEBUG - print_efi_memmap(); -#endif -} - -static void __init runtime_code_page_mkexec(void) -{ - efi_memory_desc_t *md; - void *p; - u64 addr, npages; - - /* Make EFI runtime service code area executable */ - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - md = p; - - if (md->type != EFI_RUNTIME_SERVICES_CODE) - continue; - - addr = md->virt_addr; - npages = md->num_pages; - memrange_efi_to_native(&addr, &npages); - set_memory_x(addr, npages); - } -} - -/* - * This function will switch the EFI runtime services to virtual mode. - * Essentially, look through the EFI memmap and map every region that - * has the runtime attribute bit set in its memory descriptor and update - * that memory descriptor with the virtual address obtained from ioremap(). - * This enables the runtime services to be called without having to - * thunk back into physical mode for every invocation. - */ -void __init efi_enter_virtual_mode(void) -{ - efi_memory_desc_t *md; - efi_status_t status; - unsigned long size; - u64 end, systab, addr, npages, end_pfn; - void *p, *va; - - efi.systab = NULL; - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - md = p; - if (!(md->attribute & EFI_MEMORY_RUNTIME)) - continue; - - size = md->num_pages << EFI_PAGE_SHIFT; - end = md->phys_addr + size; - - end_pfn = PFN_UP(end); - if (end_pfn <= max_low_pfn_mapped - || (end_pfn > (1UL << (32 - PAGE_SHIFT)) - && end_pfn <= max_pfn_mapped)) - va = __va(md->phys_addr); - else - va = efi_ioremap(md->phys_addr, size, md->type); - - md->virt_addr = (u64) (unsigned long) va; - - if (!va) { - printk(KERN_ERR PFX "ioremap of 0x%llX failed!\n", - (unsigned long long)md->phys_addr); - continue; - } - - if (!(md->attribute & EFI_MEMORY_WB)) { - addr = md->virt_addr; - npages = md->num_pages; - memrange_efi_to_native(&addr, &npages); - set_memory_uc(addr, npages); - } - - systab = (u64) (unsigned long) efi_phys.systab; - if (md->phys_addr <= systab && systab < end) { - systab += md->virt_addr - md->phys_addr; - efi.systab = (efi_system_table_t *) (unsigned long) systab; - } - } - - BUG_ON(!efi.systab); - - status = phys_efi_set_virtual_address_map( - memmap.desc_size * memmap.nr_map, - memmap.desc_size, - memmap.desc_version, - memmap.phys_map); - - if (status != EFI_SUCCESS) { - printk(KERN_ALERT "Unable to switch EFI into virtual mode " - "(status=%lx)!\n", status); - panic("EFI call to SetVirtualAddressMap() failed!"); - } - - /* - * Now that EFI is in virtual mode, update the function - * pointers in the runtime service table to the new virtual addresses. - * - * Call EFI services through wrapper functions. 
- */ - efi.get_time = virt_efi_get_time; - efi.set_time = virt_efi_set_time; - efi.get_wakeup_time = virt_efi_get_wakeup_time; - efi.set_wakeup_time = virt_efi_set_wakeup_time; - efi.get_variable = virt_efi_get_variable; - efi.get_next_variable = virt_efi_get_next_variable; - efi.set_variable = virt_efi_set_variable; - efi.get_next_high_mono_count = virt_efi_get_next_high_mono_count; - efi.reset_system = virt_efi_reset_system; - efi.set_virtual_address_map = virt_efi_set_virtual_address_map; - if (__supported_pte_mask & _PAGE_NX) - runtime_code_page_mkexec(); - early_iounmap(memmap.map, memmap.nr_map * memmap.desc_size); - memmap.map = NULL; -} - -/* - * Convenience functions to obtain memory types and attributes - */ -u32 efi_mem_type(unsigned long phys_addr) -{ - efi_memory_desc_t *md; - void *p; - - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - md = p; - if ((md->phys_addr <= phys_addr) && - (phys_addr < (md->phys_addr + - (md->num_pages << EFI_PAGE_SHIFT)))) - return md->type; - } - return 0; -} - -u64 efi_mem_attributes(unsigned long phys_addr) -{ - efi_memory_desc_t *md; - void *p; - - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - md = p; - if ((md->phys_addr <= phys_addr) && - (phys_addr < (md->phys_addr + - (md->num_pages << EFI_PAGE_SHIFT)))) - return md->attribute; - } - return 0; -} diff --git a/arch/x86/kernel/efi_32.c b/arch/x86/kernel/efi_32.c deleted file mode 100644 index 5cab48ee61a..00000000000 --- a/arch/x86/kernel/efi_32.c +++ /dev/null @@ -1,112 +0,0 @@ -/* - * Extensible Firmware Interface - * - * Based on Extensible Firmware Interface Specification version 1.0 - * - * Copyright (C) 1999 VA Linux Systems - * Copyright (C) 1999 Walt Drummond - * Copyright (C) 1999-2002 Hewlett-Packard Co. - * David Mosberger-Tang - * Stephane Eranian - * - * All EFI Runtime Services are not implemented yet as EFI only - * supports physical mode addressing on SoftSDV. This is to be fixed - * in a future version. --drummond 1999-07-20 - * - * Implemented EFI runtime services and virtual mode calls. --davidm - * - * Goutham Rao: - * Skip non-WB memory and ignore empty memory ranges. - */ - -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -/* - * To make EFI call EFI runtime service in physical addressing mode we need - * prelog/epilog before/after the invocation to disable interrupt, to - * claim EFI runtime service handler exclusively and to duplicate a memory in - * low memory space say 0 - 3G. - */ - -static unsigned long efi_rt_eflags; -static pgd_t efi_bak_pg_dir_pointer[2]; - -void efi_call_phys_prelog(void) -{ - unsigned long cr4; - unsigned long temp; - struct desc_ptr gdt_descr; - - local_irq_save(efi_rt_eflags); - - /* - * If I don't have PAE, I should just duplicate two entries in page - * directory. If I have PAE, I just need to duplicate one entry in - * page directory. 
- */ - cr4 = read_cr4_safe(); - - if (cr4 & X86_CR4_PAE) { - efi_bak_pg_dir_pointer[0].pgd = - swapper_pg_dir[pgd_index(0)].pgd; - swapper_pg_dir[0].pgd = - swapper_pg_dir[pgd_index(PAGE_OFFSET)].pgd; - } else { - efi_bak_pg_dir_pointer[0].pgd = - swapper_pg_dir[pgd_index(0)].pgd; - efi_bak_pg_dir_pointer[1].pgd = - swapper_pg_dir[pgd_index(0x400000)].pgd; - swapper_pg_dir[pgd_index(0)].pgd = - swapper_pg_dir[pgd_index(PAGE_OFFSET)].pgd; - temp = PAGE_OFFSET + 0x400000; - swapper_pg_dir[pgd_index(0x400000)].pgd = - swapper_pg_dir[pgd_index(temp)].pgd; - } - - /* - * After the lock is released, the original page table is restored. - */ - __flush_tlb_all(); - - gdt_descr.address = __pa(get_cpu_gdt_table(0)); - gdt_descr.size = GDT_SIZE - 1; - load_gdt(&gdt_descr); -} - -void efi_call_phys_epilog(void) -{ - unsigned long cr4; - struct desc_ptr gdt_descr; - - gdt_descr.address = (unsigned long)get_cpu_gdt_table(0); - gdt_descr.size = GDT_SIZE - 1; - load_gdt(&gdt_descr); - - cr4 = read_cr4_safe(); - - if (cr4 & X86_CR4_PAE) { - swapper_pg_dir[pgd_index(0)].pgd = - efi_bak_pg_dir_pointer[0].pgd; - } else { - swapper_pg_dir[pgd_index(0)].pgd = - efi_bak_pg_dir_pointer[0].pgd; - swapper_pg_dir[pgd_index(0x400000)].pgd = - efi_bak_pg_dir_pointer[1].pgd; - } - - /* - * After the lock is released, the original page table is restored. - */ - __flush_tlb_all(); - - local_irq_restore(efi_rt_eflags); -} diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/kernel/efi_64.c deleted file mode 100644 index ac0621a7ac3..00000000000 --- a/arch/x86/kernel/efi_64.c +++ /dev/null @@ -1,114 +0,0 @@ -/* - * x86_64 specific EFI support functions - * Based on Extensible Firmware Interface Specification version 1.0 - * - * Copyright (C) 2005-2008 Intel Co. - * Fenghua Yu - * Bibo Mao - * Chandramouli Narayanan - * Huang Ying - * - * Code to convert EFI to E820 map has been implemented in elilo bootloader - * based on a EFI patch by Edgar Hucek. Based on the E820 map, the page table - * is setup appropriately for EFI runtime code. - * - mouli 06/14/2007. 
- * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static pgd_t save_pgd __initdata; -static unsigned long efi_flags __initdata; - -static void __init early_mapping_set_exec(unsigned long start, - unsigned long end, - int executable) -{ - unsigned long num_pages; - - start &= PMD_MASK; - end = (end + PMD_SIZE - 1) & PMD_MASK; - num_pages = (end - start) >> PAGE_SHIFT; - if (executable) - set_memory_x((unsigned long)__va(start), num_pages); - else - set_memory_nx((unsigned long)__va(start), num_pages); -} - -static void __init early_runtime_code_mapping_set_exec(int executable) -{ - efi_memory_desc_t *md; - void *p; - - if (!(__supported_pte_mask & _PAGE_NX)) - return; - - /* Make EFI runtime service code area executable */ - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - md = p; - if (md->type == EFI_RUNTIME_SERVICES_CODE) { - unsigned long end; - end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT); - early_mapping_set_exec(md->phys_addr, end, executable); - } - } -} - -void __init efi_call_phys_prelog(void) -{ - unsigned long vaddress; - - early_runtime_code_mapping_set_exec(1); - local_irq_save(efi_flags); - vaddress = (unsigned long)__va(0x0UL); - save_pgd = *pgd_offset_k(0x0UL); - set_pgd(pgd_offset_k(0x0UL), *pgd_offset_k(vaddress)); - __flush_tlb_all(); -} - -void __init efi_call_phys_epilog(void) -{ - /* - * After the lock is released, the original page table is restored. - */ - set_pgd(pgd_offset_k(0x0UL), save_pgd); - __flush_tlb_all(); - local_irq_restore(efi_flags); - early_runtime_code_mapping_set_exec(0); -} - -void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size, - u32 type) -{ - unsigned long last_map_pfn; - - if (type == EFI_MEMORY_MAPPED_IO) - return ioremap(phys_addr, size); - - last_map_pfn = init_memory_mapping(phys_addr, phys_addr + size); - if ((last_map_pfn << PAGE_SHIFT) < phys_addr + size) - return NULL; - - return (void __iomem *)__va(phys_addr); -} diff --git a/arch/x86/kernel/efi_stub_32.S b/arch/x86/kernel/efi_stub_32.S deleted file mode 100644 index fbe66e626c0..00000000000 --- a/arch/x86/kernel/efi_stub_32.S +++ /dev/null @@ -1,123 +0,0 @@ -/* - * EFI call stub for IA32. - * - * This stub allows us to make EFI calls in physical mode with interrupts - * turned off. - */ - -#include -#include - -/* - * efi_call_phys(void *, ...) is a function with variable parameters. - * All the callers of this function assure that all the parameters are 4-bytes. - */ - -/* - * In gcc calling convention, EBX, ESP, EBP, ESI and EDI are all callee save. - * So we'd better save all of them at the beginning of this function and restore - * at the end no matter how many we use, because we can not assure EFI runtime - * service functions will comply with gcc calling convention, too. - */ - -.text -ENTRY(efi_call_phys) - /* - * 0. The function can only be called in Linux kernel. So CS has been - * set to 0x0010, DS and SS have been set to 0x0018. In EFI, I found - * the values of these registers are the same. And, the corresponding - * GDT entries are identical. So I will do nothing about segment reg - * and GDT, but change GDT base register in prelog and epilog. - */ - - /* - * 1. Now I am running with EIP = + PAGE_OFFSET. - * But to make it smoothly switch from virtual mode to flat mode. 
- * The mapping of lower virtual memory has been created in prelog and - * epilog. - */ - movl $1f, %edx - subl $__PAGE_OFFSET, %edx - jmp *%edx -1: - - /* - * 2. Now on the top of stack is the return - * address in the caller of efi_call_phys(), then parameter 1, - * parameter 2, ..., param n. To make things easy, we save the return - * address of efi_call_phys in a global variable. - */ - popl %edx - movl %edx, saved_return_addr - /* get the function pointer into ECX*/ - popl %ecx - movl %ecx, efi_rt_function_ptr - movl $2f, %edx - subl $__PAGE_OFFSET, %edx - pushl %edx - - /* - * 3. Clear PG bit in %CR0. - */ - movl %cr0, %edx - andl $0x7fffffff, %edx - movl %edx, %cr0 - jmp 1f -1: - - /* - * 4. Adjust stack pointer. - */ - subl $__PAGE_OFFSET, %esp - - /* - * 5. Call the physical function. - */ - jmp *%ecx - -2: - /* - * 6. After EFI runtime service returns, control will return to - * following instruction. We'd better readjust stack pointer first. - */ - addl $__PAGE_OFFSET, %esp - - /* - * 7. Restore PG bit - */ - movl %cr0, %edx - orl $0x80000000, %edx - movl %edx, %cr0 - jmp 1f -1: - /* - * 8. Now restore the virtual mode from flat mode by - * adding EIP with PAGE_OFFSET. - */ - movl $1f, %edx - jmp *%edx -1: - - /* - * 9. Balance the stack. And because EAX contain the return value, - * we'd better not clobber it. - */ - leal efi_rt_function_ptr, %edx - movl (%edx), %ecx - pushl %ecx - - /* - * 10. Push the saved return address onto the stack and return. - */ - leal saved_return_addr, %edx - movl (%edx), %ecx - pushl %ecx - ret -ENDPROC(efi_call_phys) -.previous - -.data -saved_return_addr: - .long 0 -efi_rt_function_ptr: - .long 0 diff --git a/arch/x86/kernel/efi_stub_64.S b/arch/x86/kernel/efi_stub_64.S deleted file mode 100644 index 4c07ccab814..00000000000 --- a/arch/x86/kernel/efi_stub_64.S +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Function calling ABI conversion from Linux to EFI for x86_64 - * - * Copyright (C) 2007 Intel Corp - * Bibo Mao - * Huang Ying - */ - -#include - -#define SAVE_XMM \ - mov %rsp, %rax; \ - subq $0x70, %rsp; \ - and $~0xf, %rsp; \ - mov %rax, (%rsp); \ - mov %cr0, %rax; \ - clts; \ - mov %rax, 0x8(%rsp); \ - movaps %xmm0, 0x60(%rsp); \ - movaps %xmm1, 0x50(%rsp); \ - movaps %xmm2, 0x40(%rsp); \ - movaps %xmm3, 0x30(%rsp); \ - movaps %xmm4, 0x20(%rsp); \ - movaps %xmm5, 0x10(%rsp) - -#define RESTORE_XMM \ - movaps 0x60(%rsp), %xmm0; \ - movaps 0x50(%rsp), %xmm1; \ - movaps 0x40(%rsp), %xmm2; \ - movaps 0x30(%rsp), %xmm3; \ - movaps 0x20(%rsp), %xmm4; \ - movaps 0x10(%rsp), %xmm5; \ - mov 0x8(%rsp), %rsi; \ - mov %rsi, %cr0; \ - mov (%rsp), %rsp - -ENTRY(efi_call0) - SAVE_XMM - subq $32, %rsp - call *%rdi - addq $32, %rsp - RESTORE_XMM - ret -ENDPROC(efi_call0) - -ENTRY(efi_call1) - SAVE_XMM - subq $32, %rsp - mov %rsi, %rcx - call *%rdi - addq $32, %rsp - RESTORE_XMM - ret -ENDPROC(efi_call1) - -ENTRY(efi_call2) - SAVE_XMM - subq $32, %rsp - mov %rsi, %rcx - call *%rdi - addq $32, %rsp - RESTORE_XMM - ret -ENDPROC(efi_call2) - -ENTRY(efi_call3) - SAVE_XMM - subq $32, %rsp - mov %rcx, %r8 - mov %rsi, %rcx - call *%rdi - addq $32, %rsp - RESTORE_XMM - ret -ENDPROC(efi_call3) - -ENTRY(efi_call4) - SAVE_XMM - subq $32, %rsp - mov %r8, %r9 - mov %rcx, %r8 - mov %rsi, %rcx - call *%rdi - addq $32, %rsp - RESTORE_XMM - ret -ENDPROC(efi_call4) - -ENTRY(efi_call5) - SAVE_XMM - subq $48, %rsp - mov %r9, 32(%rsp) - mov %r8, %r9 - mov %rcx, %r8 - mov %rsi, %rcx - call *%rdi - addq $48, %rsp - RESTORE_XMM - ret -ENDPROC(efi_call5) - -ENTRY(efi_call6) - SAVE_XMM - mov 
(%rsp), %rax - mov 8(%rax), %rax - subq $48, %rsp - mov %r9, 32(%rsp) - mov %rax, 40(%rsp) - mov %r8, %r9 - mov %rcx, %r8 - mov %rsi, %rcx - call *%rdi - addq $48, %rsp - RESTORE_XMM - ret -ENDPROC(efi_call6) diff --git a/arch/x86/platform/Makefile b/arch/x86/platform/Makefile index a964fa3c086..99e95b32ae7 100644 --- a/arch/x86/platform/Makefile +++ b/arch/x86/platform/Makefile @@ -1,2 +1,3 @@ # Platform specific code goes here +obj-y += efi/ obj-y += sfi/ diff --git a/arch/x86/platform/efi/Makefile b/arch/x86/platform/efi/Makefile new file mode 100644 index 00000000000..73b8be0f367 --- /dev/null +++ b/arch/x86/platform/efi/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c new file mode 100644 index 00000000000..0fe27d7c625 --- /dev/null +++ b/arch/x86/platform/efi/efi.c @@ -0,0 +1,613 @@ +/* + * Common EFI (Extensible Firmware Interface) support functions + * Based on Extensible Firmware Interface Specification version 1.0 + * + * Copyright (C) 1999 VA Linux Systems + * Copyright (C) 1999 Walt Drummond + * Copyright (C) 1999-2002 Hewlett-Packard Co. + * David Mosberger-Tang + * Stephane Eranian + * Copyright (C) 2005-2008 Intel Co. + * Fenghua Yu + * Bibo Mao + * Chandramouli Narayanan + * Huang Ying + * + * Copied from efi_32.c to eliminate the duplicated code between EFI + * 32/64 support code. --ying 2007-10-26 + * + * All EFI Runtime Services are not implemented yet as EFI only + * supports physical mode addressing on SoftSDV. This is to be fixed + * in a future version. --drummond 1999-07-20 + * + * Implemented EFI runtime services and virtual mode calls. --davidm + * + * Goutham Rao: + * Skip non-WB memory and ignore empty memory ranges. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#define EFI_DEBUG 1 +#define PFX "EFI: " + +int efi_enabled; +EXPORT_SYMBOL(efi_enabled); + +struct efi efi; +EXPORT_SYMBOL(efi); + +struct efi_memory_map memmap; + +static struct efi efi_phys __initdata; +static efi_system_table_t efi_systab __initdata; + +static int __init setup_noefi(char *arg) +{ + efi_enabled = 0; + return 0; +} +early_param("noefi", setup_noefi); + +int add_efi_memmap; +EXPORT_SYMBOL(add_efi_memmap); + +static int __init setup_add_efi_memmap(char *arg) +{ + add_efi_memmap = 1; + return 0; +} +early_param("add_efi_memmap", setup_add_efi_memmap); + + +static efi_status_t virt_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc) +{ + return efi_call_virt2(get_time, tm, tc); +} + +static efi_status_t virt_efi_set_time(efi_time_t *tm) +{ + return efi_call_virt1(set_time, tm); +} + +static efi_status_t virt_efi_get_wakeup_time(efi_bool_t *enabled, + efi_bool_t *pending, + efi_time_t *tm) +{ + return efi_call_virt3(get_wakeup_time, + enabled, pending, tm); +} + +static efi_status_t virt_efi_set_wakeup_time(efi_bool_t enabled, efi_time_t *tm) +{ + return efi_call_virt2(set_wakeup_time, + enabled, tm); +} + +static efi_status_t virt_efi_get_variable(efi_char16_t *name, + efi_guid_t *vendor, + u32 *attr, + unsigned long *data_size, + void *data) +{ + return efi_call_virt5(get_variable, + name, vendor, attr, + data_size, data); +} + +static efi_status_t virt_efi_get_next_variable(unsigned long *name_size, + efi_char16_t *name, + efi_guid_t *vendor) +{ + return efi_call_virt3(get_next_variable, + name_size, name, vendor); +} + +static efi_status_t virt_efi_set_variable(efi_char16_t *name, + efi_guid_t *vendor, + unsigned long attr, + unsigned long data_size, + void *data) +{ + return efi_call_virt5(set_variable, + name, vendor, attr, + data_size, data); +} + +static efi_status_t virt_efi_get_next_high_mono_count(u32 *count) +{ + return efi_call_virt1(get_next_high_mono_count, count); +} + +static void virt_efi_reset_system(int reset_type, + efi_status_t status, + unsigned long data_size, + efi_char16_t *data) +{ + efi_call_virt4(reset_system, reset_type, status, + data_size, data); +} + +static efi_status_t virt_efi_set_virtual_address_map( + unsigned long memory_map_size, + unsigned long descriptor_size, + u32 descriptor_version, + efi_memory_desc_t *virtual_map) +{ + return efi_call_virt4(set_virtual_address_map, + memory_map_size, descriptor_size, + descriptor_version, virtual_map); +} + +static efi_status_t __init phys_efi_set_virtual_address_map( + unsigned long memory_map_size, + unsigned long descriptor_size, + u32 descriptor_version, + efi_memory_desc_t *virtual_map) +{ + efi_status_t status; + + efi_call_phys_prelog(); + status = efi_call_phys4(efi_phys.set_virtual_address_map, + memory_map_size, descriptor_size, + descriptor_version, virtual_map); + efi_call_phys_epilog(); + return status; +} + +static efi_status_t __init phys_efi_get_time(efi_time_t *tm, + efi_time_cap_t *tc) +{ + efi_status_t status; + + efi_call_phys_prelog(); + status = efi_call_phys2(efi_phys.get_time, tm, tc); + efi_call_phys_epilog(); + return status; +} + +int efi_set_rtc_mmss(unsigned long nowtime) +{ + int real_seconds, real_minutes; + efi_status_t status; + efi_time_t eft; + efi_time_cap_t cap; + + status = efi.get_time(&eft, &cap); + if (status != EFI_SUCCESS) { + printk(KERN_ERR "Oops: efitime: can't read 
time!\n"); + return -1; + } + + real_seconds = nowtime % 60; + real_minutes = nowtime / 60; + if (((abs(real_minutes - eft.minute) + 15)/30) & 1) + real_minutes += 30; + real_minutes %= 60; + eft.minute = real_minutes; + eft.second = real_seconds; + + status = efi.set_time(&eft); + if (status != EFI_SUCCESS) { + printk(KERN_ERR "Oops: efitime: can't write time!\n"); + return -1; + } + return 0; +} + +unsigned long efi_get_time(void) +{ + efi_status_t status; + efi_time_t eft; + efi_time_cap_t cap; + + status = efi.get_time(&eft, &cap); + if (status != EFI_SUCCESS) + printk(KERN_ERR "Oops: efitime: can't read time!\n"); + + return mktime(eft.year, eft.month, eft.day, eft.hour, + eft.minute, eft.second); +} + +/* + * Tell the kernel about the EFI memory map. This might include + * more than the max 128 entries that can fit in the e820 legacy + * (zeropage) memory map. + */ + +static void __init do_add_efi_memmap(void) +{ + void *p; + + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + efi_memory_desc_t *md = p; + unsigned long long start = md->phys_addr; + unsigned long long size = md->num_pages << EFI_PAGE_SHIFT; + int e820_type; + + switch (md->type) { + case EFI_LOADER_CODE: + case EFI_LOADER_DATA: + case EFI_BOOT_SERVICES_CODE: + case EFI_BOOT_SERVICES_DATA: + case EFI_CONVENTIONAL_MEMORY: + if (md->attribute & EFI_MEMORY_WB) + e820_type = E820_RAM; + else + e820_type = E820_RESERVED; + break; + case EFI_ACPI_RECLAIM_MEMORY: + e820_type = E820_ACPI; + break; + case EFI_ACPI_MEMORY_NVS: + e820_type = E820_NVS; + break; + case EFI_UNUSABLE_MEMORY: + e820_type = E820_UNUSABLE; + break; + default: + /* + * EFI_RESERVED_TYPE EFI_RUNTIME_SERVICES_CODE + * EFI_RUNTIME_SERVICES_DATA EFI_MEMORY_MAPPED_IO + * EFI_MEMORY_MAPPED_IO_PORT_SPACE EFI_PAL_CODE + */ + e820_type = E820_RESERVED; + break; + } + e820_add_region(start, size, e820_type); + } + sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); +} + +void __init efi_memblock_x86_reserve_range(void) +{ + unsigned long pmap; + +#ifdef CONFIG_X86_32 + pmap = boot_params.efi_info.efi_memmap; +#else + pmap = (boot_params.efi_info.efi_memmap | + ((__u64)boot_params.efi_info.efi_memmap_hi<<32)); +#endif + memmap.phys_map = (void *)pmap; + memmap.nr_map = boot_params.efi_info.efi_memmap_size / + boot_params.efi_info.efi_memdesc_size; + memmap.desc_version = boot_params.efi_info.efi_memdesc_version; + memmap.desc_size = boot_params.efi_info.efi_memdesc_size; + memblock_x86_reserve_range(pmap, pmap + memmap.nr_map * memmap.desc_size, + "EFI memmap"); +} + +#if EFI_DEBUG +static void __init print_efi_memmap(void) +{ + efi_memory_desc_t *md; + void *p; + int i; + + for (p = memmap.map, i = 0; + p < memmap.map_end; + p += memmap.desc_size, i++) { + md = p; + printk(KERN_INFO PFX "mem%02u: type=%u, attr=0x%llx, " + "range=[0x%016llx-0x%016llx) (%lluMB)\n", + i, md->type, md->attribute, md->phys_addr, + md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT), + (md->num_pages >> (20 - EFI_PAGE_SHIFT))); + } +} +#endif /* EFI_DEBUG */ + +void __init efi_init(void) +{ + efi_config_table_t *config_tables; + efi_runtime_services_t *runtime; + efi_char16_t *c16; + char vendor[100] = "unknown"; + int i = 0; + void *tmp; + +#ifdef CONFIG_X86_32 + efi_phys.systab = (efi_system_table_t *)boot_params.efi_info.efi_systab; +#else + efi_phys.systab = (efi_system_table_t *) + (boot_params.efi_info.efi_systab | + ((__u64)boot_params.efi_info.efi_systab_hi<<32)); +#endif + + efi.systab = early_ioremap((unsigned long)efi_phys.systab, + 
sizeof(efi_system_table_t)); + if (efi.systab == NULL) + printk(KERN_ERR "Couldn't map the EFI system table!\n"); + memcpy(&efi_systab, efi.systab, sizeof(efi_system_table_t)); + early_iounmap(efi.systab, sizeof(efi_system_table_t)); + efi.systab = &efi_systab; + + /* + * Verify the EFI Table + */ + if (efi.systab->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) + printk(KERN_ERR "EFI system table signature incorrect!\n"); + if ((efi.systab->hdr.revision >> 16) == 0) + printk(KERN_ERR "Warning: EFI system table version " + "%d.%02d, expected 1.00 or greater!\n", + efi.systab->hdr.revision >> 16, + efi.systab->hdr.revision & 0xffff); + + /* + * Show what we know for posterity + */ + c16 = tmp = early_ioremap(efi.systab->fw_vendor, 2); + if (c16) { + for (i = 0; i < sizeof(vendor) - 1 && *c16; ++i) + vendor[i] = *c16++; + vendor[i] = '\0'; + } else + printk(KERN_ERR PFX "Could not map the firmware vendor!\n"); + early_iounmap(tmp, 2); + + printk(KERN_INFO "EFI v%u.%.02u by %s\n", + efi.systab->hdr.revision >> 16, + efi.systab->hdr.revision & 0xffff, vendor); + + /* + * Let's see what config tables the firmware passed to us. + */ + config_tables = early_ioremap( + efi.systab->tables, + efi.systab->nr_tables * sizeof(efi_config_table_t)); + if (config_tables == NULL) + printk(KERN_ERR "Could not map EFI Configuration Table!\n"); + + printk(KERN_INFO); + for (i = 0; i < efi.systab->nr_tables; i++) { + if (!efi_guidcmp(config_tables[i].guid, MPS_TABLE_GUID)) { + efi.mps = config_tables[i].table; + printk(" MPS=0x%lx ", config_tables[i].table); + } else if (!efi_guidcmp(config_tables[i].guid, + ACPI_20_TABLE_GUID)) { + efi.acpi20 = config_tables[i].table; + printk(" ACPI 2.0=0x%lx ", config_tables[i].table); + } else if (!efi_guidcmp(config_tables[i].guid, + ACPI_TABLE_GUID)) { + efi.acpi = config_tables[i].table; + printk(" ACPI=0x%lx ", config_tables[i].table); + } else if (!efi_guidcmp(config_tables[i].guid, + SMBIOS_TABLE_GUID)) { + efi.smbios = config_tables[i].table; + printk(" SMBIOS=0x%lx ", config_tables[i].table); +#ifdef CONFIG_X86_UV + } else if (!efi_guidcmp(config_tables[i].guid, + UV_SYSTEM_TABLE_GUID)) { + efi.uv_systab = config_tables[i].table; + printk(" UVsystab=0x%lx ", config_tables[i].table); +#endif + } else if (!efi_guidcmp(config_tables[i].guid, + HCDP_TABLE_GUID)) { + efi.hcdp = config_tables[i].table; + printk(" HCDP=0x%lx ", config_tables[i].table); + } else if (!efi_guidcmp(config_tables[i].guid, + UGA_IO_PROTOCOL_GUID)) { + efi.uga = config_tables[i].table; + printk(" UGA=0x%lx ", config_tables[i].table); + } + } + printk("\n"); + early_iounmap(config_tables, + efi.systab->nr_tables * sizeof(efi_config_table_t)); + + /* + * Check out the runtime services table. We need to map + * the runtime services table so that we can grab the physical + * address of several of the EFI runtime functions, needed to + * set the firmware into virtual mode. + */ + runtime = early_ioremap((unsigned long)efi.systab->runtime, + sizeof(efi_runtime_services_t)); + if (runtime != NULL) { + /* + * We will only need *early* access to the following + * two EFI runtime services before set_virtual_address_map + * is invoked. + */ + efi_phys.get_time = (efi_get_time_t *)runtime->get_time; + efi_phys.set_virtual_address_map = + (efi_set_virtual_address_map_t *) + runtime->set_virtual_address_map; + /* + * Make efi_get_time can be called before entering + * virtual mode. 
+ */ + efi.get_time = phys_efi_get_time; + } else + printk(KERN_ERR "Could not map the EFI runtime service " + "table!\n"); + early_iounmap(runtime, sizeof(efi_runtime_services_t)); + + /* Map the EFI memory map */ + memmap.map = early_ioremap((unsigned long)memmap.phys_map, + memmap.nr_map * memmap.desc_size); + if (memmap.map == NULL) + printk(KERN_ERR "Could not map the EFI memory map!\n"); + memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size); + + if (memmap.desc_size != sizeof(efi_memory_desc_t)) + printk(KERN_WARNING + "Kernel-defined memdesc doesn't match the one from EFI!\n"); + + if (add_efi_memmap) + do_add_efi_memmap(); + +#ifdef CONFIG_X86_32 + x86_platform.get_wallclock = efi_get_time; + x86_platform.set_wallclock = efi_set_rtc_mmss; +#endif + + /* Setup for EFI runtime service */ + reboot_type = BOOT_EFI; + +#if EFI_DEBUG + print_efi_memmap(); +#endif +} + +static void __init runtime_code_page_mkexec(void) +{ + efi_memory_desc_t *md; + void *p; + u64 addr, npages; + + /* Make EFI runtime service code area executable */ + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + md = p; + + if (md->type != EFI_RUNTIME_SERVICES_CODE) + continue; + + addr = md->virt_addr; + npages = md->num_pages; + memrange_efi_to_native(&addr, &npages); + set_memory_x(addr, npages); + } +} + +/* + * This function will switch the EFI runtime services to virtual mode. + * Essentially, look through the EFI memmap and map every region that + * has the runtime attribute bit set in its memory descriptor and update + * that memory descriptor with the virtual address obtained from ioremap(). + * This enables the runtime services to be called without having to + * thunk back into physical mode for every invocation. + */ +void __init efi_enter_virtual_mode(void) +{ + efi_memory_desc_t *md; + efi_status_t status; + unsigned long size; + u64 end, systab, addr, npages, end_pfn; + void *p, *va; + + efi.systab = NULL; + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + md = p; + if (!(md->attribute & EFI_MEMORY_RUNTIME)) + continue; + + size = md->num_pages << EFI_PAGE_SHIFT; + end = md->phys_addr + size; + + end_pfn = PFN_UP(end); + if (end_pfn <= max_low_pfn_mapped + || (end_pfn > (1UL << (32 - PAGE_SHIFT)) + && end_pfn <= max_pfn_mapped)) + va = __va(md->phys_addr); + else + va = efi_ioremap(md->phys_addr, size, md->type); + + md->virt_addr = (u64) (unsigned long) va; + + if (!va) { + printk(KERN_ERR PFX "ioremap of 0x%llX failed!\n", + (unsigned long long)md->phys_addr); + continue; + } + + if (!(md->attribute & EFI_MEMORY_WB)) { + addr = md->virt_addr; + npages = md->num_pages; + memrange_efi_to_native(&addr, &npages); + set_memory_uc(addr, npages); + } + + systab = (u64) (unsigned long) efi_phys.systab; + if (md->phys_addr <= systab && systab < end) { + systab += md->virt_addr - md->phys_addr; + efi.systab = (efi_system_table_t *) (unsigned long) systab; + } + } + + BUG_ON(!efi.systab); + + status = phys_efi_set_virtual_address_map( + memmap.desc_size * memmap.nr_map, + memmap.desc_size, + memmap.desc_version, + memmap.phys_map); + + if (status != EFI_SUCCESS) { + printk(KERN_ALERT "Unable to switch EFI into virtual mode " + "(status=%lx)!\n", status); + panic("EFI call to SetVirtualAddressMap() failed!"); + } + + /* + * Now that EFI is in virtual mode, update the function + * pointers in the runtime service table to the new virtual addresses. + * + * Call EFI services through wrapper functions. 
+ */ + efi.get_time = virt_efi_get_time; + efi.set_time = virt_efi_set_time; + efi.get_wakeup_time = virt_efi_get_wakeup_time; + efi.set_wakeup_time = virt_efi_set_wakeup_time; + efi.get_variable = virt_efi_get_variable; + efi.get_next_variable = virt_efi_get_next_variable; + efi.set_variable = virt_efi_set_variable; + efi.get_next_high_mono_count = virt_efi_get_next_high_mono_count; + efi.reset_system = virt_efi_reset_system; + efi.set_virtual_address_map = virt_efi_set_virtual_address_map; + if (__supported_pte_mask & _PAGE_NX) + runtime_code_page_mkexec(); + early_iounmap(memmap.map, memmap.nr_map * memmap.desc_size); + memmap.map = NULL; +} + +/* + * Convenience functions to obtain memory types and attributes + */ +u32 efi_mem_type(unsigned long phys_addr) +{ + efi_memory_desc_t *md; + void *p; + + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + md = p; + if ((md->phys_addr <= phys_addr) && + (phys_addr < (md->phys_addr + + (md->num_pages << EFI_PAGE_SHIFT)))) + return md->type; + } + return 0; +} + +u64 efi_mem_attributes(unsigned long phys_addr) +{ + efi_memory_desc_t *md; + void *p; + + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + md = p; + if ((md->phys_addr <= phys_addr) && + (phys_addr < (md->phys_addr + + (md->num_pages << EFI_PAGE_SHIFT)))) + return md->attribute; + } + return 0; +} diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c new file mode 100644 index 00000000000..5cab48ee61a --- /dev/null +++ b/arch/x86/platform/efi/efi_32.c @@ -0,0 +1,112 @@ +/* + * Extensible Firmware Interface + * + * Based on Extensible Firmware Interface Specification version 1.0 + * + * Copyright (C) 1999 VA Linux Systems + * Copyright (C) 1999 Walt Drummond + * Copyright (C) 1999-2002 Hewlett-Packard Co. + * David Mosberger-Tang + * Stephane Eranian + * + * All EFI Runtime Services are not implemented yet as EFI only + * supports physical mode addressing on SoftSDV. This is to be fixed + * in a future version. --drummond 1999-07-20 + * + * Implemented EFI runtime services and virtual mode calls. --davidm + * + * Goutham Rao: + * Skip non-WB memory and ignore empty memory ranges. + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +/* + * To make EFI call EFI runtime service in physical addressing mode we need + * prelog/epilog before/after the invocation to disable interrupt, to + * claim EFI runtime service handler exclusively and to duplicate a memory in + * low memory space say 0 - 3G. + */ + +static unsigned long efi_rt_eflags; +static pgd_t efi_bak_pg_dir_pointer[2]; + +void efi_call_phys_prelog(void) +{ + unsigned long cr4; + unsigned long temp; + struct desc_ptr gdt_descr; + + local_irq_save(efi_rt_eflags); + + /* + * If I don't have PAE, I should just duplicate two entries in page + * directory. If I have PAE, I just need to duplicate one entry in + * page directory. 
+ */ + cr4 = read_cr4_safe(); + + if (cr4 & X86_CR4_PAE) { + efi_bak_pg_dir_pointer[0].pgd = + swapper_pg_dir[pgd_index(0)].pgd; + swapper_pg_dir[0].pgd = + swapper_pg_dir[pgd_index(PAGE_OFFSET)].pgd; + } else { + efi_bak_pg_dir_pointer[0].pgd = + swapper_pg_dir[pgd_index(0)].pgd; + efi_bak_pg_dir_pointer[1].pgd = + swapper_pg_dir[pgd_index(0x400000)].pgd; + swapper_pg_dir[pgd_index(0)].pgd = + swapper_pg_dir[pgd_index(PAGE_OFFSET)].pgd; + temp = PAGE_OFFSET + 0x400000; + swapper_pg_dir[pgd_index(0x400000)].pgd = + swapper_pg_dir[pgd_index(temp)].pgd; + } + + /* + * After the lock is released, the original page table is restored. + */ + __flush_tlb_all(); + + gdt_descr.address = __pa(get_cpu_gdt_table(0)); + gdt_descr.size = GDT_SIZE - 1; + load_gdt(&gdt_descr); +} + +void efi_call_phys_epilog(void) +{ + unsigned long cr4; + struct desc_ptr gdt_descr; + + gdt_descr.address = (unsigned long)get_cpu_gdt_table(0); + gdt_descr.size = GDT_SIZE - 1; + load_gdt(&gdt_descr); + + cr4 = read_cr4_safe(); + + if (cr4 & X86_CR4_PAE) { + swapper_pg_dir[pgd_index(0)].pgd = + efi_bak_pg_dir_pointer[0].pgd; + } else { + swapper_pg_dir[pgd_index(0)].pgd = + efi_bak_pg_dir_pointer[0].pgd; + swapper_pg_dir[pgd_index(0x400000)].pgd = + efi_bak_pg_dir_pointer[1].pgd; + } + + /* + * After the lock is released, the original page table is restored. + */ + __flush_tlb_all(); + + local_irq_restore(efi_rt_eflags); +} diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c new file mode 100644 index 00000000000..ac0621a7ac3 --- /dev/null +++ b/arch/x86/platform/efi/efi_64.c @@ -0,0 +1,114 @@ +/* + * x86_64 specific EFI support functions + * Based on Extensible Firmware Interface Specification version 1.0 + * + * Copyright (C) 2005-2008 Intel Co. + * Fenghua Yu + * Bibo Mao + * Chandramouli Narayanan + * Huang Ying + * + * Code to convert EFI to E820 map has been implemented in elilo bootloader + * based on a EFI patch by Edgar Hucek. Based on the E820 map, the page table + * is setup appropriately for EFI runtime code. + * - mouli 06/14/2007. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static pgd_t save_pgd __initdata; +static unsigned long efi_flags __initdata; + +static void __init early_mapping_set_exec(unsigned long start, + unsigned long end, + int executable) +{ + unsigned long num_pages; + + start &= PMD_MASK; + end = (end + PMD_SIZE - 1) & PMD_MASK; + num_pages = (end - start) >> PAGE_SHIFT; + if (executable) + set_memory_x((unsigned long)__va(start), num_pages); + else + set_memory_nx((unsigned long)__va(start), num_pages); +} + +static void __init early_runtime_code_mapping_set_exec(int executable) +{ + efi_memory_desc_t *md; + void *p; + + if (!(__supported_pte_mask & _PAGE_NX)) + return; + + /* Make EFI runtime service code area executable */ + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + md = p; + if (md->type == EFI_RUNTIME_SERVICES_CODE) { + unsigned long end; + end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT); + early_mapping_set_exec(md->phys_addr, end, executable); + } + } +} + +void __init efi_call_phys_prelog(void) +{ + unsigned long vaddress; + + early_runtime_code_mapping_set_exec(1); + local_irq_save(efi_flags); + vaddress = (unsigned long)__va(0x0UL); + save_pgd = *pgd_offset_k(0x0UL); + set_pgd(pgd_offset_k(0x0UL), *pgd_offset_k(vaddress)); + __flush_tlb_all(); +} + +void __init efi_call_phys_epilog(void) +{ + /* + * After the lock is released, the original page table is restored. + */ + set_pgd(pgd_offset_k(0x0UL), save_pgd); + __flush_tlb_all(); + local_irq_restore(efi_flags); + early_runtime_code_mapping_set_exec(0); +} + +void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size, + u32 type) +{ + unsigned long last_map_pfn; + + if (type == EFI_MEMORY_MAPPED_IO) + return ioremap(phys_addr, size); + + last_map_pfn = init_memory_mapping(phys_addr, phys_addr + size); + if ((last_map_pfn << PAGE_SHIFT) < phys_addr + size) + return NULL; + + return (void __iomem *)__va(phys_addr); +} diff --git a/arch/x86/platform/efi/efi_stub_32.S b/arch/x86/platform/efi/efi_stub_32.S new file mode 100644 index 00000000000..fbe66e626c0 --- /dev/null +++ b/arch/x86/platform/efi/efi_stub_32.S @@ -0,0 +1,123 @@ +/* + * EFI call stub for IA32. + * + * This stub allows us to make EFI calls in physical mode with interrupts + * turned off. + */ + +#include +#include + +/* + * efi_call_phys(void *, ...) is a function with variable parameters. + * All the callers of this function assure that all the parameters are 4-bytes. + */ + +/* + * In gcc calling convention, EBX, ESP, EBP, ESI and EDI are all callee save. + * So we'd better save all of them at the beginning of this function and restore + * at the end no matter how many we use, because we can not assure EFI runtime + * service functions will comply with gcc calling convention, too. + */ + +.text +ENTRY(efi_call_phys) + /* + * 0. The function can only be called in Linux kernel. So CS has been + * set to 0x0010, DS and SS have been set to 0x0018. In EFI, I found + * the values of these registers are the same. And, the corresponding + * GDT entries are identical. So I will do nothing about segment reg + * and GDT, but change GDT base register in prelog and epilog. + */ + + /* + * 1. Now I am running with EIP = + PAGE_OFFSET. + * But to make it smoothly switch from virtual mode to flat mode. 
+ * The mapping of lower virtual memory has been created in prelog and + * epilog. + */ + movl $1f, %edx + subl $__PAGE_OFFSET, %edx + jmp *%edx +1: + + /* + * 2. Now on the top of stack is the return + * address in the caller of efi_call_phys(), then parameter 1, + * parameter 2, ..., param n. To make things easy, we save the return + * address of efi_call_phys in a global variable. + */ + popl %edx + movl %edx, saved_return_addr + /* get the function pointer into ECX*/ + popl %ecx + movl %ecx, efi_rt_function_ptr + movl $2f, %edx + subl $__PAGE_OFFSET, %edx + pushl %edx + + /* + * 3. Clear PG bit in %CR0. + */ + movl %cr0, %edx + andl $0x7fffffff, %edx + movl %edx, %cr0 + jmp 1f +1: + + /* + * 4. Adjust stack pointer. + */ + subl $__PAGE_OFFSET, %esp + + /* + * 5. Call the physical function. + */ + jmp *%ecx + +2: + /* + * 6. After EFI runtime service returns, control will return to + * following instruction. We'd better readjust stack pointer first. + */ + addl $__PAGE_OFFSET, %esp + + /* + * 7. Restore PG bit + */ + movl %cr0, %edx + orl $0x80000000, %edx + movl %edx, %cr0 + jmp 1f +1: + /* + * 8. Now restore the virtual mode from flat mode by + * adding EIP with PAGE_OFFSET. + */ + movl $1f, %edx + jmp *%edx +1: + + /* + * 9. Balance the stack. And because EAX contain the return value, + * we'd better not clobber it. + */ + leal efi_rt_function_ptr, %edx + movl (%edx), %ecx + pushl %ecx + + /* + * 10. Push the saved return address onto the stack and return. + */ + leal saved_return_addr, %edx + movl (%edx), %ecx + pushl %ecx + ret +ENDPROC(efi_call_phys) +.previous + +.data +saved_return_addr: + .long 0 +efi_rt_function_ptr: + .long 0 diff --git a/arch/x86/platform/efi/efi_stub_64.S b/arch/x86/platform/efi/efi_stub_64.S new file mode 100644 index 00000000000..4c07ccab814 --- /dev/null +++ b/arch/x86/platform/efi/efi_stub_64.S @@ -0,0 +1,116 @@ +/* + * Function calling ABI conversion from Linux to EFI for x86_64 + * + * Copyright (C) 2007 Intel Corp + * Bibo Mao + * Huang Ying + */ + +#include + +#define SAVE_XMM \ + mov %rsp, %rax; \ + subq $0x70, %rsp; \ + and $~0xf, %rsp; \ + mov %rax, (%rsp); \ + mov %cr0, %rax; \ + clts; \ + mov %rax, 0x8(%rsp); \ + movaps %xmm0, 0x60(%rsp); \ + movaps %xmm1, 0x50(%rsp); \ + movaps %xmm2, 0x40(%rsp); \ + movaps %xmm3, 0x30(%rsp); \ + movaps %xmm4, 0x20(%rsp); \ + movaps %xmm5, 0x10(%rsp) + +#define RESTORE_XMM \ + movaps 0x60(%rsp), %xmm0; \ + movaps 0x50(%rsp), %xmm1; \ + movaps 0x40(%rsp), %xmm2; \ + movaps 0x30(%rsp), %xmm3; \ + movaps 0x20(%rsp), %xmm4; \ + movaps 0x10(%rsp), %xmm5; \ + mov 0x8(%rsp), %rsi; \ + mov %rsi, %cr0; \ + mov (%rsp), %rsp + +ENTRY(efi_call0) + SAVE_XMM + subq $32, %rsp + call *%rdi + addq $32, %rsp + RESTORE_XMM + ret +ENDPROC(efi_call0) + +ENTRY(efi_call1) + SAVE_XMM + subq $32, %rsp + mov %rsi, %rcx + call *%rdi + addq $32, %rsp + RESTORE_XMM + ret +ENDPROC(efi_call1) + +ENTRY(efi_call2) + SAVE_XMM + subq $32, %rsp + mov %rsi, %rcx + call *%rdi + addq $32, %rsp + RESTORE_XMM + ret +ENDPROC(efi_call2) + +ENTRY(efi_call3) + SAVE_XMM + subq $32, %rsp + mov %rcx, %r8 + mov %rsi, %rcx + call *%rdi + addq $32, %rsp + RESTORE_XMM + ret +ENDPROC(efi_call3) + +ENTRY(efi_call4) + SAVE_XMM + subq $32, %rsp + mov %r8, %r9 + mov %rcx, %r8 + mov %rsi, %rcx + call *%rdi + addq $32, %rsp + RESTORE_XMM + ret +ENDPROC(efi_call4) + +ENTRY(efi_call5) + SAVE_XMM + subq $48, %rsp + mov %r9, 32(%rsp) + mov %r8, %r9 + mov %rcx, %r8 + mov %rsi, %rcx + call *%rdi + addq $48, %rsp + RESTORE_XMM + ret +ENDPROC(efi_call5) + +ENTRY(efi_call6) + 
SAVE_XMM + mov (%rsp), %rax + mov 8(%rax), %rax + subq $48, %rsp + mov %r9, 32(%rsp) + mov %rax, 40(%rsp) + mov %r8, %r9 + mov %rcx, %r8 + mov %rsi, %rcx + call *%rdi + addq $48, %rsp + RESTORE_XMM + ret +ENDPROC(efi_call6) -- cgit v1.2.3-18-g5258 From c4e72ad6bbbbbf1f826df3a5d3e3c4af2f4d48c9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 16 Oct 2010 10:33:09 +0200 Subject: x86: Move visws to platform Signed-off-by: Thomas Gleixner --- arch/x86/kernel/Makefile | 1 - arch/x86/kernel/visws_quirks.c | 614 --------------------------------- arch/x86/platform/Makefile | 1 + arch/x86/platform/visws/Makefile | 1 + arch/x86/platform/visws/visws_quirks.c | 614 +++++++++++++++++++++++++++++++++ 5 files changed, 616 insertions(+), 615 deletions(-) delete mode 100644 arch/x86/kernel/visws_quirks.c create mode 100644 arch/x86/platform/visws/Makefile create mode 100644 arch/x86/platform/visws/visws_quirks.c (limited to 'arch/x86') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index b01c7b1ac1b..28c4f3f2e97 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -36,7 +36,6 @@ obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o obj-y += time.o ioport.o ldt.o dumpstack.o obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o obj-$(CONFIG_IRQ_WORK) += irq_work.o -obj-$(CONFIG_X86_VISWS) += visws_quirks.o obj-$(CONFIG_X86_32) += probe_roms_32.o obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c deleted file mode 100644 index 3371bd053b8..00000000000 --- a/arch/x86/kernel/visws_quirks.c +++ /dev/null @@ -1,614 +0,0 @@ -/* - * SGI Visual Workstation support and quirks, unmaintained. - * - * Split out from setup.c by davej@suse.de - * - * Copyright (C) 1999 Bent Hagemark, Ingo Molnar - * - * SGI Visual Workstation interrupt controller - * - * The Cobalt system ASIC in the Visual Workstation contains a "Cobalt" APIC - * which serves as the main interrupt controller in the system. Non-legacy - * hardware in the system uses this controller directly. Legacy devices - * are connected to the PIIX4 which in turn has its 8259(s) connected to - * a of the Cobalt APIC entry. - * - * 09/02/2000 - Updated for 2.4 by jbarnes@sgi.com - * - * 25/11/2002 - Updated for 2.5 by Andrey Panin - */ -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include -#include -#include - -#include -#include -#include -#include - -extern int no_broadcast; - -char visws_board_type = -1; -char visws_board_rev = -1; - -static void __init visws_time_init(void) -{ - printk(KERN_INFO "Starting Cobalt Timer system clock\n"); - - /* Set the countdown value */ - co_cpu_write(CO_CPU_TIMEVAL, CO_TIME_HZ/HZ); - - /* Start the timer */ - co_cpu_write(CO_CPU_CTRL, co_cpu_read(CO_CPU_CTRL) | CO_CTRL_TIMERUN); - - /* Enable (unmask) the timer interrupt */ - co_cpu_write(CO_CPU_CTRL, co_cpu_read(CO_CPU_CTRL) & ~CO_CTRL_TIMEMASK); - - setup_default_timer_irq(); -} - -/* Replaces the default init_ISA_irqs in the generic setup */ -static void __init visws_pre_intr_init(void); - -/* Quirk for machine specific memory setup. 
*/ - -#define MB (1024 * 1024) - -unsigned long sgivwfb_mem_phys; -unsigned long sgivwfb_mem_size; -EXPORT_SYMBOL(sgivwfb_mem_phys); -EXPORT_SYMBOL(sgivwfb_mem_size); - -long long mem_size __initdata = 0; - -static char * __init visws_memory_setup(void) -{ - long long gfx_mem_size = 8 * MB; - - mem_size = boot_params.alt_mem_k; - - if (!mem_size) { - printk(KERN_WARNING "Bootloader didn't set memory size, upgrade it !\n"); - mem_size = 128 * MB; - } - - /* - * this hardcodes the graphics memory to 8 MB - * it really should be sized dynamically (or at least - * set as a boot param) - */ - if (!sgivwfb_mem_size) { - printk(KERN_WARNING "Defaulting to 8 MB framebuffer size\n"); - sgivwfb_mem_size = 8 * MB; - } - - /* - * Trim to nearest MB - */ - sgivwfb_mem_size &= ~((1 << 20) - 1); - sgivwfb_mem_phys = mem_size - gfx_mem_size; - - e820_add_region(0, LOWMEMSIZE(), E820_RAM); - e820_add_region(HIGH_MEMORY, mem_size - sgivwfb_mem_size - HIGH_MEMORY, E820_RAM); - e820_add_region(sgivwfb_mem_phys, sgivwfb_mem_size, E820_RESERVED); - - return "PROM"; -} - -static void visws_machine_emergency_restart(void) -{ - /* - * Visual Workstations restart after this - * register is poked on the PIIX4 - */ - outb(PIIX4_RESET_VAL, PIIX4_RESET_PORT); -} - -static void visws_machine_power_off(void) -{ - unsigned short pm_status; -/* extern unsigned int pci_bus0; */ - - while ((pm_status = inw(PMSTS_PORT)) & 0x100) - outw(pm_status, PMSTS_PORT); - - outw(PM_SUSPEND_ENABLE, PMCNTRL_PORT); - - mdelay(10); - -#define PCI_CONF1_ADDRESS(bus, devfn, reg) \ - (0x80000000 | (bus << 16) | (devfn << 8) | (reg & ~3)) - -/* outl(PCI_CONF1_ADDRESS(pci_bus0, SPECIAL_DEV, SPECIAL_REG), 0xCF8); */ - outl(PIIX_SPECIAL_STOP, 0xCFC); -} - -static void __init visws_get_smp_config(unsigned int early) -{ -} - -/* - * The Visual Workstation is Intel MP compliant in the hardware - * sense, but it doesn't have a BIOS(-configuration table). - * No problem for Linux. - */ - -static void __init MP_processor_info(struct mpc_cpu *m) -{ - int ver, logical_apicid; - physid_mask_t apic_cpus; - - if (!(m->cpuflag & CPU_ENABLED)) - return; - - logical_apicid = m->apicid; - printk(KERN_INFO "%sCPU #%d %u:%u APIC version %d\n", - m->cpuflag & CPU_BOOTPROCESSOR ? "Bootup " : "", - m->apicid, (m->cpufeature & CPU_FAMILY_MASK) >> 8, - (m->cpufeature & CPU_MODEL_MASK) >> 4, m->apicver); - - if (m->cpuflag & CPU_BOOTPROCESSOR) - boot_cpu_physical_apicid = m->apicid; - - ver = m->apicver; - if ((ver >= 0x14 && m->apicid >= 0xff) || m->apicid >= 0xf) { - printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n", - m->apicid, MAX_APICS); - return; - } - - apic->apicid_to_cpu_present(m->apicid, &apic_cpus); - physids_or(phys_cpu_present_map, phys_cpu_present_map, apic_cpus); - /* - * Validate version - */ - if (ver == 0x0) { - printk(KERN_ERR "BIOS bug, APIC version is 0 for CPU#%d! " - "fixing up to 0x10. 
(tell your hw vendor)\n", - m->apicid); - ver = 0x10; - } - apic_version[m->apicid] = ver; -} - -static void __init visws_find_smp_config(void) -{ - struct mpc_cpu *mp = phys_to_virt(CO_CPU_TAB_PHYS); - unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS)); - - if (ncpus > CO_CPU_MAX) { - printk(KERN_WARNING "find_visws_smp: got cpu count of %d at %p\n", - ncpus, mp); - - ncpus = CO_CPU_MAX; - } - - if (ncpus > setup_max_cpus) - ncpus = setup_max_cpus; - -#ifdef CONFIG_X86_LOCAL_APIC - smp_found_config = 1; -#endif - while (ncpus--) - MP_processor_info(mp++); - - mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; -} - -static void visws_trap_init(void); - -void __init visws_early_detect(void) -{ - int raw; - - visws_board_type = (char)(inb_p(PIIX_GPI_BD_REG) & PIIX_GPI_BD_REG) - >> PIIX_GPI_BD_SHIFT; - - if (visws_board_type < 0) - return; - - /* - * Override the default platform setup functions - */ - x86_init.resources.memory_setup = visws_memory_setup; - x86_init.mpparse.get_smp_config = visws_get_smp_config; - x86_init.mpparse.find_smp_config = visws_find_smp_config; - x86_init.irqs.pre_vector_init = visws_pre_intr_init; - x86_init.irqs.trap_init = visws_trap_init; - x86_init.timers.timer_init = visws_time_init; - x86_init.pci.init = pci_visws_init; - x86_init.pci.init_irq = x86_init_noop; - - /* - * Install reboot quirks: - */ - pm_power_off = visws_machine_power_off; - machine_ops.emergency_restart = visws_machine_emergency_restart; - - /* - * Do not use broadcast IPIs: - */ - no_broadcast = 0; - -#ifdef CONFIG_X86_IO_APIC - /* - * Turn off IO-APIC detection and initialization: - */ - skip_ioapic_setup = 1; -#endif - - /* - * Get Board rev. - * First, we have to initialize the 307 part to allow us access - * to the GPIO registers. Let's map them at 0x0fc0 which is right - * after the PIIX4 PM section. - */ - outb_p(SIO_DEV_SEL, SIO_INDEX); - outb_p(SIO_GP_DEV, SIO_DATA); /* Talk to GPIO regs. */ - - outb_p(SIO_DEV_MSB, SIO_INDEX); - outb_p(SIO_GP_MSB, SIO_DATA); /* MSB of GPIO base address */ - - outb_p(SIO_DEV_LSB, SIO_INDEX); - outb_p(SIO_GP_LSB, SIO_DATA); /* LSB of GPIO base address */ - - outb_p(SIO_DEV_ENB, SIO_INDEX); - outb_p(1, SIO_DATA); /* Enable GPIO registers. */ - - /* - * Now, we have to map the power management section to write - * a bit which enables access to the GPIO registers. - * What lunatic came up with this shit? - */ - outb_p(SIO_DEV_SEL, SIO_INDEX); - outb_p(SIO_PM_DEV, SIO_DATA); /* Talk to GPIO regs. */ - - outb_p(SIO_DEV_MSB, SIO_INDEX); - outb_p(SIO_PM_MSB, SIO_DATA); /* MSB of PM base address */ - - outb_p(SIO_DEV_LSB, SIO_INDEX); - outb_p(SIO_PM_LSB, SIO_DATA); /* LSB of PM base address */ - - outb_p(SIO_DEV_ENB, SIO_INDEX); - outb_p(1, SIO_DATA); /* Enable PM registers. */ - - /* - * Now, write the PM register which enables the GPIO registers. - */ - outb_p(SIO_PM_FER2, SIO_PM_INDEX); - outb_p(SIO_PM_GP_EN, SIO_PM_DATA); - - /* - * Now, initialize the GPIO registers. - * We want them all to be inputs which is the - * power on default, so let's leave them alone. - * So, let's just read the board rev! - */ - raw = inb_p(SIO_GP_DATA1); - raw &= 0x7f; /* 7 bits of valid board revision ID. 
*/ - - if (visws_board_type == VISWS_320) { - if (raw < 0x6) { - visws_board_rev = 4; - } else if (raw < 0xc) { - visws_board_rev = 5; - } else { - visws_board_rev = 6; - } - } else if (visws_board_type == VISWS_540) { - visws_board_rev = 2; - } else { - visws_board_rev = raw; - } - - printk(KERN_INFO "Silicon Graphics Visual Workstation %s (rev %d) detected\n", - (visws_board_type == VISWS_320 ? "320" : - (visws_board_type == VISWS_540 ? "540" : - "unknown")), visws_board_rev); -} - -#define A01234 (LI_INTA_0 | LI_INTA_1 | LI_INTA_2 | LI_INTA_3 | LI_INTA_4) -#define BCD (LI_INTB | LI_INTC | LI_INTD) -#define ALLDEVS (A01234 | BCD) - -static __init void lithium_init(void) -{ - set_fixmap(FIX_LI_PCIA, LI_PCI_A_PHYS); - set_fixmap(FIX_LI_PCIB, LI_PCI_B_PHYS); - - if ((li_pcia_read16(PCI_VENDOR_ID) != PCI_VENDOR_ID_SGI) || - (li_pcia_read16(PCI_DEVICE_ID) != PCI_DEVICE_ID_SGI_LITHIUM)) { - printk(KERN_EMERG "Lithium hostbridge %c not found\n", 'A'); -/* panic("This machine is not SGI Visual Workstation 320/540"); */ - } - - if ((li_pcib_read16(PCI_VENDOR_ID) != PCI_VENDOR_ID_SGI) || - (li_pcib_read16(PCI_DEVICE_ID) != PCI_DEVICE_ID_SGI_LITHIUM)) { - printk(KERN_EMERG "Lithium hostbridge %c not found\n", 'B'); -/* panic("This machine is not SGI Visual Workstation 320/540"); */ - } - - li_pcia_write16(LI_PCI_INTEN, ALLDEVS); - li_pcib_write16(LI_PCI_INTEN, ALLDEVS); -} - -static __init void cobalt_init(void) -{ - /* - * On normal SMP PC this is used only with SMP, but we have to - * use it and set it up here to start the Cobalt clock - */ - set_fixmap(FIX_APIC_BASE, APIC_DEFAULT_PHYS_BASE); - setup_local_APIC(); - printk(KERN_INFO "Local APIC Version %#x, ID %#x\n", - (unsigned int)apic_read(APIC_LVR), - (unsigned int)apic_read(APIC_ID)); - - set_fixmap(FIX_CO_CPU, CO_CPU_PHYS); - set_fixmap(FIX_CO_APIC, CO_APIC_PHYS); - printk(KERN_INFO "Cobalt Revision %#lx, APIC ID %#lx\n", - co_cpu_read(CO_CPU_REV), co_apic_read(CO_APIC_ID)); - - /* Enable Cobalt APIC being careful to NOT change the ID! */ - co_apic_write(CO_APIC_ID, co_apic_read(CO_APIC_ID) | CO_APIC_ENABLE); - - printk(KERN_INFO "Cobalt APIC enabled: ID reg %#lx\n", - co_apic_read(CO_APIC_ID)); -} - -static void __init visws_trap_init(void) -{ - lithium_init(); - cobalt_init(); -} - -/* - * IRQ controller / APIC support: - */ - -static DEFINE_SPINLOCK(cobalt_lock); - -/* - * Set the given Cobalt APIC Redirection Table entry to point - * to the given IDT vector/index. - */ -static inline void co_apic_set(int entry, int irq) -{ - co_apic_write(CO_APIC_LO(entry), CO_APIC_LEVEL | (irq + FIRST_EXTERNAL_VECTOR)); - co_apic_write(CO_APIC_HI(entry), 0); -} - -/* - * Cobalt (IO)-APIC functions to handle PCI devices. 
- */ -static inline int co_apic_ide0_hack(void) -{ - extern char visws_board_type; - extern char visws_board_rev; - - if (visws_board_type == VISWS_320 && visws_board_rev == 5) - return 5; - return CO_APIC_IDE0; -} - -static int is_co_apic(unsigned int irq) -{ - if (IS_CO_APIC(irq)) - return CO_APIC(irq); - - switch (irq) { - case 0: return CO_APIC_CPU; - case CO_IRQ_IDE0: return co_apic_ide0_hack(); - case CO_IRQ_IDE1: return CO_APIC_IDE1; - default: return -1; - } -} - - -/* - * This is the SGI Cobalt (IO-)APIC: - */ -static void enable_cobalt_irq(struct irq_data *data) -{ - co_apic_set(is_co_apic(data->irq), data->irq); -} - -static void disable_cobalt_irq(struct irq_data *data) -{ - int entry = is_co_apic(data->irq); - - co_apic_write(CO_APIC_LO(entry), CO_APIC_MASK); - co_apic_read(CO_APIC_LO(entry)); -} - -static void ack_cobalt_irq(struct irq_data *data) -{ - unsigned long flags; - - spin_lock_irqsave(&cobalt_lock, flags); - disable_cobalt_irq(data); - apic_write(APIC_EOI, APIC_EIO_ACK); - spin_unlock_irqrestore(&cobalt_lock, flags); -} - -static struct irq_chip cobalt_irq_type = { - .name = "Cobalt-APIC", - .irq_enable = enable_cobalt_irq, - .irq_disable = disable_cobalt_irq, - .irq_ack = ack_cobalt_irq, -}; - - -/* - * This is the PIIX4-based 8259 that is wired up indirectly to Cobalt - * -- not the manner expected by the code in i8259.c. - * - * there is a 'master' physical interrupt source that gets sent to - * the CPU. But in the chipset there are various 'virtual' interrupts - * waiting to be handled. We represent this to Linux through a 'master' - * interrupt controller type, and through a special virtual interrupt- - * controller. Device drivers only see the virtual interrupt sources. - */ -static unsigned int startup_piix4_master_irq(struct irq_data *data) -{ - legacy_pic->init(0); - enable_cobalt_irq(data); -} - -static void end_piix4_master_irq(struct irq_data *data) -{ - unsigned long flags; - - spin_lock_irqsave(&cobalt_lock, flags); - enable_cobalt_irq(data); - spin_unlock_irqrestore(&cobalt_lock, flags); -} - -static struct irq_chip piix4_master_irq_type = { - .name = "PIIX4-master", - .irq_startup = startup_piix4_master_irq, - .irq_ack = ack_cobalt_irq, -}; - -static void pii4_mask(struct irq_data *data) { } - -static struct irq_chip piix4_virtual_irq_type = { - .name = "PIIX4-virtual", - .mask = pii4_mask, -}; - -/* - * PIIX4-8259 master/virtual functions to handle interrupt requests - * from legacy devices: floppy, parallel, serial, rtc. - * - * None of these get Cobalt APIC entries, neither do they have IDT - * entries. These interrupts are purely virtual and distributed from - * the 'master' interrupt source: CO_IRQ_8259. - * - * When the 8259 interrupts its handler figures out which of these - * devices is interrupting and dispatches to its handler. - * - * CAREFUL: devices see the 'virtual' interrupt only. Thus disable/ - * enable_irq gets the right irq. This 'master' irq is never directly - * manipulated by any driver. 
- */ -static irqreturn_t piix4_master_intr(int irq, void *dev_id) -{ - unsigned long flags; - int realirq; - - raw_spin_lock_irqsave(&i8259A_lock, flags); - - /* Find out what's interrupting in the PIIX4 master 8259 */ - outb(0x0c, 0x20); /* OCW3 Poll command */ - realirq = inb(0x20); - - /* - * Bit 7 == 0 means invalid/spurious - */ - if (unlikely(!(realirq & 0x80))) - goto out_unlock; - - realirq &= 7; - - if (unlikely(realirq == 2)) { - outb(0x0c, 0xa0); - realirq = inb(0xa0); - - if (unlikely(!(realirq & 0x80))) - goto out_unlock; - - realirq = (realirq & 7) + 8; - } - - /* mask and ack interrupt */ - cached_irq_mask |= 1 << realirq; - if (unlikely(realirq > 7)) { - inb(0xa1); - outb(cached_slave_mask, 0xa1); - outb(0x60 + (realirq & 7), 0xa0); - outb(0x60 + 2, 0x20); - } else { - inb(0x21); - outb(cached_master_mask, 0x21); - outb(0x60 + realirq, 0x20); - } - - raw_spin_unlock_irqrestore(&i8259A_lock, flags); - - /* - * handle this 'virtual interrupt' as a Cobalt one now. - */ - generic_handle_irq(realirq); - - return IRQ_HANDLED; - -out_unlock: - raw_spin_unlock_irqrestore(&i8259A_lock, flags); - return IRQ_NONE; -} - -static struct irqaction master_action = { - .handler = piix4_master_intr, - .name = "PIIX4-8259", -}; - -static struct irqaction cascade_action = { - .handler = no_action, - .name = "cascade", -}; - -static inline void set_piix4_virtual_irq_type(void) -{ - piix4_virtual_irq_type.enable = i8259A_chip.unmask; - piix4_virtual_irq_type.disable = i8259A_chip.mask; - piix4_virtual_irq_type.unmask = i8259A_chip.unmask; -} - -static void __init visws_pre_intr_init(void) -{ - int i; - - set_piix4_virtual_irq_type(); - - for (i = 0; i < CO_IRQ_APIC0 + CO_APIC_LAST + 1; i++) { - struct irq_chip *chip = NULL; - - if (i == 0) - chip = &cobalt_irq_type; - else if (i == CO_IRQ_IDE0) - chip = &cobalt_irq_type; - else if (i == CO_IRQ_IDE1) - >chip = &cobalt_irq_type; - else if (i == CO_IRQ_8259) - chip = &piix4_master_irq_type; - else if (i < CO_IRQ_APIC0) - chip = &piix4_virtual_irq_type; - else if (IS_CO_APIC(i)) - chip = &cobalt_irq_type; - - if (chip) - set_irq_chip(i, chip); - } - - setup_irq(CO_IRQ_8259, &master_action); - setup_irq(2, &cascade_action); -} diff --git a/arch/x86/platform/Makefile b/arch/x86/platform/Makefile index 99e95b32ae7..e629d7a428c 100644 --- a/arch/x86/platform/Makefile +++ b/arch/x86/platform/Makefile @@ -1,3 +1,4 @@ # Platform specific code goes here obj-y += efi/ obj-y += sfi/ +obj-y += visws/ diff --git a/arch/x86/platform/visws/Makefile b/arch/x86/platform/visws/Makefile new file mode 100644 index 00000000000..91bc17ab2fd --- /dev/null +++ b/arch/x86/platform/visws/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_X86_VISWS) += visws_quirks.o diff --git a/arch/x86/platform/visws/visws_quirks.c b/arch/x86/platform/visws/visws_quirks.c new file mode 100644 index 00000000000..3371bd053b8 --- /dev/null +++ b/arch/x86/platform/visws/visws_quirks.c @@ -0,0 +1,614 @@ +/* + * SGI Visual Workstation support and quirks, unmaintained. + * + * Split out from setup.c by davej@suse.de + * + * Copyright (C) 1999 Bent Hagemark, Ingo Molnar + * + * SGI Visual Workstation interrupt controller + * + * The Cobalt system ASIC in the Visual Workstation contains a "Cobalt" APIC + * which serves as the main interrupt controller in the system. Non-legacy + * hardware in the system uses this controller directly. Legacy devices + * are connected to the PIIX4 which in turn has its 8259(s) connected to + * a of the Cobalt APIC entry. 
+ * + * 09/02/2000 - Updated for 2.4 by jbarnes@sgi.com + * + * 25/11/2002 - Updated for 2.5 by Andrey Panin + */ +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include +#include +#include +#include + +extern int no_broadcast; + +char visws_board_type = -1; +char visws_board_rev = -1; + +static void __init visws_time_init(void) +{ + printk(KERN_INFO "Starting Cobalt Timer system clock\n"); + + /* Set the countdown value */ + co_cpu_write(CO_CPU_TIMEVAL, CO_TIME_HZ/HZ); + + /* Start the timer */ + co_cpu_write(CO_CPU_CTRL, co_cpu_read(CO_CPU_CTRL) | CO_CTRL_TIMERUN); + + /* Enable (unmask) the timer interrupt */ + co_cpu_write(CO_CPU_CTRL, co_cpu_read(CO_CPU_CTRL) & ~CO_CTRL_TIMEMASK); + + setup_default_timer_irq(); +} + +/* Replaces the default init_ISA_irqs in the generic setup */ +static void __init visws_pre_intr_init(void); + +/* Quirk for machine specific memory setup. */ + +#define MB (1024 * 1024) + +unsigned long sgivwfb_mem_phys; +unsigned long sgivwfb_mem_size; +EXPORT_SYMBOL(sgivwfb_mem_phys); +EXPORT_SYMBOL(sgivwfb_mem_size); + +long long mem_size __initdata = 0; + +static char * __init visws_memory_setup(void) +{ + long long gfx_mem_size = 8 * MB; + + mem_size = boot_params.alt_mem_k; + + if (!mem_size) { + printk(KERN_WARNING "Bootloader didn't set memory size, upgrade it !\n"); + mem_size = 128 * MB; + } + + /* + * this hardcodes the graphics memory to 8 MB + * it really should be sized dynamically (or at least + * set as a boot param) + */ + if (!sgivwfb_mem_size) { + printk(KERN_WARNING "Defaulting to 8 MB framebuffer size\n"); + sgivwfb_mem_size = 8 * MB; + } + + /* + * Trim to nearest MB + */ + sgivwfb_mem_size &= ~((1 << 20) - 1); + sgivwfb_mem_phys = mem_size - gfx_mem_size; + + e820_add_region(0, LOWMEMSIZE(), E820_RAM); + e820_add_region(HIGH_MEMORY, mem_size - sgivwfb_mem_size - HIGH_MEMORY, E820_RAM); + e820_add_region(sgivwfb_mem_phys, sgivwfb_mem_size, E820_RESERVED); + + return "PROM"; +} + +static void visws_machine_emergency_restart(void) +{ + /* + * Visual Workstations restart after this + * register is poked on the PIIX4 + */ + outb(PIIX4_RESET_VAL, PIIX4_RESET_PORT); +} + +static void visws_machine_power_off(void) +{ + unsigned short pm_status; +/* extern unsigned int pci_bus0; */ + + while ((pm_status = inw(PMSTS_PORT)) & 0x100) + outw(pm_status, PMSTS_PORT); + + outw(PM_SUSPEND_ENABLE, PMCNTRL_PORT); + + mdelay(10); + +#define PCI_CONF1_ADDRESS(bus, devfn, reg) \ + (0x80000000 | (bus << 16) | (devfn << 8) | (reg & ~3)) + +/* outl(PCI_CONF1_ADDRESS(pci_bus0, SPECIAL_DEV, SPECIAL_REG), 0xCF8); */ + outl(PIIX_SPECIAL_STOP, 0xCFC); +} + +static void __init visws_get_smp_config(unsigned int early) +{ +} + +/* + * The Visual Workstation is Intel MP compliant in the hardware + * sense, but it doesn't have a BIOS(-configuration table). + * No problem for Linux. + */ + +static void __init MP_processor_info(struct mpc_cpu *m) +{ + int ver, logical_apicid; + physid_mask_t apic_cpus; + + if (!(m->cpuflag & CPU_ENABLED)) + return; + + logical_apicid = m->apicid; + printk(KERN_INFO "%sCPU #%d %u:%u APIC version %d\n", + m->cpuflag & CPU_BOOTPROCESSOR ? 
"Bootup " : "", + m->apicid, (m->cpufeature & CPU_FAMILY_MASK) >> 8, + (m->cpufeature & CPU_MODEL_MASK) >> 4, m->apicver); + + if (m->cpuflag & CPU_BOOTPROCESSOR) + boot_cpu_physical_apicid = m->apicid; + + ver = m->apicver; + if ((ver >= 0x14 && m->apicid >= 0xff) || m->apicid >= 0xf) { + printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n", + m->apicid, MAX_APICS); + return; + } + + apic->apicid_to_cpu_present(m->apicid, &apic_cpus); + physids_or(phys_cpu_present_map, phys_cpu_present_map, apic_cpus); + /* + * Validate version + */ + if (ver == 0x0) { + printk(KERN_ERR "BIOS bug, APIC version is 0 for CPU#%d! " + "fixing up to 0x10. (tell your hw vendor)\n", + m->apicid); + ver = 0x10; + } + apic_version[m->apicid] = ver; +} + +static void __init visws_find_smp_config(void) +{ + struct mpc_cpu *mp = phys_to_virt(CO_CPU_TAB_PHYS); + unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS)); + + if (ncpus > CO_CPU_MAX) { + printk(KERN_WARNING "find_visws_smp: got cpu count of %d at %p\n", + ncpus, mp); + + ncpus = CO_CPU_MAX; + } + + if (ncpus > setup_max_cpus) + ncpus = setup_max_cpus; + +#ifdef CONFIG_X86_LOCAL_APIC + smp_found_config = 1; +#endif + while (ncpus--) + MP_processor_info(mp++); + + mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; +} + +static void visws_trap_init(void); + +void __init visws_early_detect(void) +{ + int raw; + + visws_board_type = (char)(inb_p(PIIX_GPI_BD_REG) & PIIX_GPI_BD_REG) + >> PIIX_GPI_BD_SHIFT; + + if (visws_board_type < 0) + return; + + /* + * Override the default platform setup functions + */ + x86_init.resources.memory_setup = visws_memory_setup; + x86_init.mpparse.get_smp_config = visws_get_smp_config; + x86_init.mpparse.find_smp_config = visws_find_smp_config; + x86_init.irqs.pre_vector_init = visws_pre_intr_init; + x86_init.irqs.trap_init = visws_trap_init; + x86_init.timers.timer_init = visws_time_init; + x86_init.pci.init = pci_visws_init; + x86_init.pci.init_irq = x86_init_noop; + + /* + * Install reboot quirks: + */ + pm_power_off = visws_machine_power_off; + machine_ops.emergency_restart = visws_machine_emergency_restart; + + /* + * Do not use broadcast IPIs: + */ + no_broadcast = 0; + +#ifdef CONFIG_X86_IO_APIC + /* + * Turn off IO-APIC detection and initialization: + */ + skip_ioapic_setup = 1; +#endif + + /* + * Get Board rev. + * First, we have to initialize the 307 part to allow us access + * to the GPIO registers. Let's map them at 0x0fc0 which is right + * after the PIIX4 PM section. + */ + outb_p(SIO_DEV_SEL, SIO_INDEX); + outb_p(SIO_GP_DEV, SIO_DATA); /* Talk to GPIO regs. */ + + outb_p(SIO_DEV_MSB, SIO_INDEX); + outb_p(SIO_GP_MSB, SIO_DATA); /* MSB of GPIO base address */ + + outb_p(SIO_DEV_LSB, SIO_INDEX); + outb_p(SIO_GP_LSB, SIO_DATA); /* LSB of GPIO base address */ + + outb_p(SIO_DEV_ENB, SIO_INDEX); + outb_p(1, SIO_DATA); /* Enable GPIO registers. */ + + /* + * Now, we have to map the power management section to write + * a bit which enables access to the GPIO registers. + * What lunatic came up with this shit? + */ + outb_p(SIO_DEV_SEL, SIO_INDEX); + outb_p(SIO_PM_DEV, SIO_DATA); /* Talk to GPIO regs. */ + + outb_p(SIO_DEV_MSB, SIO_INDEX); + outb_p(SIO_PM_MSB, SIO_DATA); /* MSB of PM base address */ + + outb_p(SIO_DEV_LSB, SIO_INDEX); + outb_p(SIO_PM_LSB, SIO_DATA); /* LSB of PM base address */ + + outb_p(SIO_DEV_ENB, SIO_INDEX); + outb_p(1, SIO_DATA); /* Enable PM registers. */ + + /* + * Now, write the PM register which enables the GPIO registers. 
+ */ + outb_p(SIO_PM_FER2, SIO_PM_INDEX); + outb_p(SIO_PM_GP_EN, SIO_PM_DATA); + + /* + * Now, initialize the GPIO registers. + * We want them all to be inputs which is the + * power on default, so let's leave them alone. + * So, let's just read the board rev! + */ + raw = inb_p(SIO_GP_DATA1); + raw &= 0x7f; /* 7 bits of valid board revision ID. */ + + if (visws_board_type == VISWS_320) { + if (raw < 0x6) { + visws_board_rev = 4; + } else if (raw < 0xc) { + visws_board_rev = 5; + } else { + visws_board_rev = 6; + } + } else if (visws_board_type == VISWS_540) { + visws_board_rev = 2; + } else { + visws_board_rev = raw; + } + + printk(KERN_INFO "Silicon Graphics Visual Workstation %s (rev %d) detected\n", + (visws_board_type == VISWS_320 ? "320" : + (visws_board_type == VISWS_540 ? "540" : + "unknown")), visws_board_rev); +} + +#define A01234 (LI_INTA_0 | LI_INTA_1 | LI_INTA_2 | LI_INTA_3 | LI_INTA_4) +#define BCD (LI_INTB | LI_INTC | LI_INTD) +#define ALLDEVS (A01234 | BCD) + +static __init void lithium_init(void) +{ + set_fixmap(FIX_LI_PCIA, LI_PCI_A_PHYS); + set_fixmap(FIX_LI_PCIB, LI_PCI_B_PHYS); + + if ((li_pcia_read16(PCI_VENDOR_ID) != PCI_VENDOR_ID_SGI) || + (li_pcia_read16(PCI_DEVICE_ID) != PCI_DEVICE_ID_SGI_LITHIUM)) { + printk(KERN_EMERG "Lithium hostbridge %c not found\n", 'A'); +/* panic("This machine is not SGI Visual Workstation 320/540"); */ + } + + if ((li_pcib_read16(PCI_VENDOR_ID) != PCI_VENDOR_ID_SGI) || + (li_pcib_read16(PCI_DEVICE_ID) != PCI_DEVICE_ID_SGI_LITHIUM)) { + printk(KERN_EMERG "Lithium hostbridge %c not found\n", 'B'); +/* panic("This machine is not SGI Visual Workstation 320/540"); */ + } + + li_pcia_write16(LI_PCI_INTEN, ALLDEVS); + li_pcib_write16(LI_PCI_INTEN, ALLDEVS); +} + +static __init void cobalt_init(void) +{ + /* + * On normal SMP PC this is used only with SMP, but we have to + * use it and set it up here to start the Cobalt clock + */ + set_fixmap(FIX_APIC_BASE, APIC_DEFAULT_PHYS_BASE); + setup_local_APIC(); + printk(KERN_INFO "Local APIC Version %#x, ID %#x\n", + (unsigned int)apic_read(APIC_LVR), + (unsigned int)apic_read(APIC_ID)); + + set_fixmap(FIX_CO_CPU, CO_CPU_PHYS); + set_fixmap(FIX_CO_APIC, CO_APIC_PHYS); + printk(KERN_INFO "Cobalt Revision %#lx, APIC ID %#lx\n", + co_cpu_read(CO_CPU_REV), co_apic_read(CO_APIC_ID)); + + /* Enable Cobalt APIC being careful to NOT change the ID! */ + co_apic_write(CO_APIC_ID, co_apic_read(CO_APIC_ID) | CO_APIC_ENABLE); + + printk(KERN_INFO "Cobalt APIC enabled: ID reg %#lx\n", + co_apic_read(CO_APIC_ID)); +} + +static void __init visws_trap_init(void) +{ + lithium_init(); + cobalt_init(); +} + +/* + * IRQ controller / APIC support: + */ + +static DEFINE_SPINLOCK(cobalt_lock); + +/* + * Set the given Cobalt APIC Redirection Table entry to point + * to the given IDT vector/index. + */ +static inline void co_apic_set(int entry, int irq) +{ + co_apic_write(CO_APIC_LO(entry), CO_APIC_LEVEL | (irq + FIRST_EXTERNAL_VECTOR)); + co_apic_write(CO_APIC_HI(entry), 0); +} + +/* + * Cobalt (IO)-APIC functions to handle PCI devices. 
+ */ +static inline int co_apic_ide0_hack(void) +{ + extern char visws_board_type; + extern char visws_board_rev; + + if (visws_board_type == VISWS_320 && visws_board_rev == 5) + return 5; + return CO_APIC_IDE0; +} + +static int is_co_apic(unsigned int irq) +{ + if (IS_CO_APIC(irq)) + return CO_APIC(irq); + + switch (irq) { + case 0: return CO_APIC_CPU; + case CO_IRQ_IDE0: return co_apic_ide0_hack(); + case CO_IRQ_IDE1: return CO_APIC_IDE1; + default: return -1; + } +} + + +/* + * This is the SGI Cobalt (IO-)APIC: + */ +static void enable_cobalt_irq(struct irq_data *data) +{ + co_apic_set(is_co_apic(data->irq), data->irq); +} + +static void disable_cobalt_irq(struct irq_data *data) +{ + int entry = is_co_apic(data->irq); + + co_apic_write(CO_APIC_LO(entry), CO_APIC_MASK); + co_apic_read(CO_APIC_LO(entry)); +} + +static void ack_cobalt_irq(struct irq_data *data) +{ + unsigned long flags; + + spin_lock_irqsave(&cobalt_lock, flags); + disable_cobalt_irq(data); + apic_write(APIC_EOI, APIC_EIO_ACK); + spin_unlock_irqrestore(&cobalt_lock, flags); +} + +static struct irq_chip cobalt_irq_type = { + .name = "Cobalt-APIC", + .irq_enable = enable_cobalt_irq, + .irq_disable = disable_cobalt_irq, + .irq_ack = ack_cobalt_irq, +}; + + +/* + * This is the PIIX4-based 8259 that is wired up indirectly to Cobalt + * -- not the manner expected by the code in i8259.c. + * + * there is a 'master' physical interrupt source that gets sent to + * the CPU. But in the chipset there are various 'virtual' interrupts + * waiting to be handled. We represent this to Linux through a 'master' + * interrupt controller type, and through a special virtual interrupt- + * controller. Device drivers only see the virtual interrupt sources. + */ +static unsigned int startup_piix4_master_irq(struct irq_data *data) +{ + legacy_pic->init(0); + enable_cobalt_irq(data); +} + +static void end_piix4_master_irq(struct irq_data *data) +{ + unsigned long flags; + + spin_lock_irqsave(&cobalt_lock, flags); + enable_cobalt_irq(data); + spin_unlock_irqrestore(&cobalt_lock, flags); +} + +static struct irq_chip piix4_master_irq_type = { + .name = "PIIX4-master", + .irq_startup = startup_piix4_master_irq, + .irq_ack = ack_cobalt_irq, +}; + +static void pii4_mask(struct irq_data *data) { } + +static struct irq_chip piix4_virtual_irq_type = { + .name = "PIIX4-virtual", + .mask = pii4_mask, +}; + +/* + * PIIX4-8259 master/virtual functions to handle interrupt requests + * from legacy devices: floppy, parallel, serial, rtc. + * + * None of these get Cobalt APIC entries, neither do they have IDT + * entries. These interrupts are purely virtual and distributed from + * the 'master' interrupt source: CO_IRQ_8259. + * + * When the 8259 interrupts its handler figures out which of these + * devices is interrupting and dispatches to its handler. + * + * CAREFUL: devices see the 'virtual' interrupt only. Thus disable/ + * enable_irq gets the right irq. This 'master' irq is never directly + * manipulated by any driver. 
+ */ +static irqreturn_t piix4_master_intr(int irq, void *dev_id) +{ + unsigned long flags; + int realirq; + + raw_spin_lock_irqsave(&i8259A_lock, flags); + + /* Find out what's interrupting in the PIIX4 master 8259 */ + outb(0x0c, 0x20); /* OCW3 Poll command */ + realirq = inb(0x20); + + /* + * Bit 7 == 0 means invalid/spurious + */ + if (unlikely(!(realirq & 0x80))) + goto out_unlock; + + realirq &= 7; + + if (unlikely(realirq == 2)) { + outb(0x0c, 0xa0); + realirq = inb(0xa0); + + if (unlikely(!(realirq & 0x80))) + goto out_unlock; + + realirq = (realirq & 7) + 8; + } + + /* mask and ack interrupt */ + cached_irq_mask |= 1 << realirq; + if (unlikely(realirq > 7)) { + inb(0xa1); + outb(cached_slave_mask, 0xa1); + outb(0x60 + (realirq & 7), 0xa0); + outb(0x60 + 2, 0x20); + } else { + inb(0x21); + outb(cached_master_mask, 0x21); + outb(0x60 + realirq, 0x20); + } + + raw_spin_unlock_irqrestore(&i8259A_lock, flags); + + /* + * handle this 'virtual interrupt' as a Cobalt one now. + */ + generic_handle_irq(realirq); + + return IRQ_HANDLED; + +out_unlock: + raw_spin_unlock_irqrestore(&i8259A_lock, flags); + return IRQ_NONE; +} + +static struct irqaction master_action = { + .handler = piix4_master_intr, + .name = "PIIX4-8259", +}; + +static struct irqaction cascade_action = { + .handler = no_action, + .name = "cascade", +}; + +static inline void set_piix4_virtual_irq_type(void) +{ + piix4_virtual_irq_type.enable = i8259A_chip.unmask; + piix4_virtual_irq_type.disable = i8259A_chip.mask; + piix4_virtual_irq_type.unmask = i8259A_chip.unmask; +} + +static void __init visws_pre_intr_init(void) +{ + int i; + + set_piix4_virtual_irq_type(); + + for (i = 0; i < CO_IRQ_APIC0 + CO_APIC_LAST + 1; i++) { + struct irq_chip *chip = NULL; + + if (i == 0) + chip = &cobalt_irq_type; + else if (i == CO_IRQ_IDE0) + chip = &cobalt_irq_type; + else if (i == CO_IRQ_IDE1) + >chip = &cobalt_irq_type; + else if (i == CO_IRQ_8259) + chip = &piix4_master_irq_type; + else if (i < CO_IRQ_APIC0) + chip = &piix4_virtual_irq_type; + else if (IS_CO_APIC(i)) + chip = &cobalt_irq_type; + + if (chip) + set_irq_chip(i, chip); + } + + setup_irq(CO_IRQ_8259, &master_action); + setup_irq(2, &cascade_action); +} -- cgit v1.2.3-18-g5258 From 3b3da9d25ae9d8cac99302ad66834499cf324d08 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 16 Oct 2010 10:35:51 +0200 Subject: x86: Move scx200 to platform Signed-off-by: Thomas Gleixner --- arch/x86/kernel/Makefile | 3 - arch/x86/kernel/scx200_32.c | 131 ----------------------------------- arch/x86/platform/Makefile | 1 + arch/x86/platform/scx200/Makefile | 2 + arch/x86/platform/scx200/scx200_32.c | 131 +++++++++++++++++++++++++++++++++++ 5 files changed, 134 insertions(+), 134 deletions(-) delete mode 100644 arch/x86/kernel/scx200_32.c create mode 100644 arch/x86/platform/scx200/Makefile create mode 100644 arch/x86/platform/scx200/scx200_32.c (limited to 'arch/x86') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 28c4f3f2e97..f57eeea4bc1 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -101,9 +101,6 @@ obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o -obj-$(CONFIG_SCx200) += scx200.o -scx200-y += scx200_32.o - obj-$(CONFIG_OLPC) += olpc.o obj-$(CONFIG_OLPC_XO1) += olpc-xo1.o obj-$(CONFIG_OLPC_OPENFIRMWARE) += olpc_ofw.o diff --git a/arch/x86/kernel/scx200_32.c b/arch/x86/kernel/scx200_32.c deleted file mode 100644 index 7e004acbe52..00000000000 --- a/arch/x86/kernel/scx200_32.c +++ /dev/null @@ -1,131 +0,0 
@@ -/* - * Copyright (c) 2001,2002 Christer Weinigel - * - * National Semiconductor SCx200 support. - */ - -#include -#include -#include -#include -#include -#include - -#include -#include - -/* Verify that the configuration block really is there */ -#define scx200_cb_probe(base) (inw((base) + SCx200_CBA) == (base)) - -#define NAME "scx200" - -MODULE_AUTHOR("Christer Weinigel "); -MODULE_DESCRIPTION("NatSemi SCx200 Driver"); -MODULE_LICENSE("GPL"); - -unsigned scx200_gpio_base = 0; -unsigned long scx200_gpio_shadow[2]; - -unsigned scx200_cb_base = 0; - -static struct pci_device_id scx200_tbl[] = { - { PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SCx200_BRIDGE) }, - { PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_BRIDGE) }, - { PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SCx200_XBUS) }, - { PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_XBUS) }, - { }, -}; -MODULE_DEVICE_TABLE(pci,scx200_tbl); - -static int __devinit scx200_probe(struct pci_dev *, const struct pci_device_id *); - -static struct pci_driver scx200_pci_driver = { - .name = "scx200", - .id_table = scx200_tbl, - .probe = scx200_probe, -}; - -static DEFINE_MUTEX(scx200_gpio_config_lock); - -static void __devinit scx200_init_shadow(void) -{ - int bank; - - /* read the current values driven on the GPIO signals */ - for (bank = 0; bank < 2; ++bank) - scx200_gpio_shadow[bank] = inl(scx200_gpio_base + 0x10 * bank); -} - -static int __devinit scx200_probe(struct pci_dev *pdev, const struct pci_device_id *ent) -{ - unsigned base; - - if (pdev->device == PCI_DEVICE_ID_NS_SCx200_BRIDGE || - pdev->device == PCI_DEVICE_ID_NS_SC1100_BRIDGE) { - base = pci_resource_start(pdev, 0); - printk(KERN_INFO NAME ": GPIO base 0x%x\n", base); - - if (!request_region(base, SCx200_GPIO_SIZE, "NatSemi SCx200 GPIO")) { - printk(KERN_ERR NAME ": can't allocate I/O for GPIOs\n"); - return -EBUSY; - } - - scx200_gpio_base = base; - scx200_init_shadow(); - - } else { - /* find the base of the Configuration Block */ - if (scx200_cb_probe(SCx200_CB_BASE_FIXED)) { - scx200_cb_base = SCx200_CB_BASE_FIXED; - } else { - pci_read_config_dword(pdev, SCx200_CBA_SCRATCH, &base); - if (scx200_cb_probe(base)) { - scx200_cb_base = base; - } else { - printk(KERN_WARNING NAME ": Configuration Block not found\n"); - return -ENODEV; - } - } - printk(KERN_INFO NAME ": Configuration Block base 0x%x\n", scx200_cb_base); - } - - return 0; -} - -u32 scx200_gpio_configure(unsigned index, u32 mask, u32 bits) -{ - u32 config, new_config; - - mutex_lock(&scx200_gpio_config_lock); - - outl(index, scx200_gpio_base + 0x20); - config = inl(scx200_gpio_base + 0x24); - - new_config = (config & mask) | bits; - outl(new_config, scx200_gpio_base + 0x24); - - mutex_unlock(&scx200_gpio_config_lock); - - return config; -} - -static int __init scx200_init(void) -{ - printk(KERN_INFO NAME ": NatSemi SCx200 Driver\n"); - - return pci_register_driver(&scx200_pci_driver); -} - -static void __exit scx200_cleanup(void) -{ - pci_unregister_driver(&scx200_pci_driver); - release_region(scx200_gpio_base, SCx200_GPIO_SIZE); -} - -module_init(scx200_init); -module_exit(scx200_cleanup); - -EXPORT_SYMBOL(scx200_gpio_base); -EXPORT_SYMBOL(scx200_gpio_shadow); -EXPORT_SYMBOL(scx200_gpio_configure); -EXPORT_SYMBOL(scx200_cb_base); diff --git a/arch/x86/platform/Makefile b/arch/x86/platform/Makefile index e629d7a428c..1191989e9f0 100644 --- a/arch/x86/platform/Makefile +++ b/arch/x86/platform/Makefile @@ -1,4 +1,5 @@ # Platform specific code goes here obj-y += efi/ +obj-y += scx200/ obj-y += sfi/ 
obj-y += visws/ diff --git a/arch/x86/platform/scx200/Makefile b/arch/x86/platform/scx200/Makefile new file mode 100644 index 00000000000..762b4c7f431 --- /dev/null +++ b/arch/x86/platform/scx200/Makefile @@ -0,0 +1,2 @@ +obj-$(CONFIG_SCx200) += scx200.o +scx200-y += scx200_32.o diff --git a/arch/x86/platform/scx200/scx200_32.c b/arch/x86/platform/scx200/scx200_32.c new file mode 100644 index 00000000000..7e004acbe52 --- /dev/null +++ b/arch/x86/platform/scx200/scx200_32.c @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2001,2002 Christer Weinigel + * + * National Semiconductor SCx200 support. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include + +/* Verify that the configuration block really is there */ +#define scx200_cb_probe(base) (inw((base) + SCx200_CBA) == (base)) + +#define NAME "scx200" + +MODULE_AUTHOR("Christer Weinigel "); +MODULE_DESCRIPTION("NatSemi SCx200 Driver"); +MODULE_LICENSE("GPL"); + +unsigned scx200_gpio_base = 0; +unsigned long scx200_gpio_shadow[2]; + +unsigned scx200_cb_base = 0; + +static struct pci_device_id scx200_tbl[] = { + { PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SCx200_BRIDGE) }, + { PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_BRIDGE) }, + { PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SCx200_XBUS) }, + { PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_XBUS) }, + { }, +}; +MODULE_DEVICE_TABLE(pci,scx200_tbl); + +static int __devinit scx200_probe(struct pci_dev *, const struct pci_device_id *); + +static struct pci_driver scx200_pci_driver = { + .name = "scx200", + .id_table = scx200_tbl, + .probe = scx200_probe, +}; + +static DEFINE_MUTEX(scx200_gpio_config_lock); + +static void __devinit scx200_init_shadow(void) +{ + int bank; + + /* read the current values driven on the GPIO signals */ + for (bank = 0; bank < 2; ++bank) + scx200_gpio_shadow[bank] = inl(scx200_gpio_base + 0x10 * bank); +} + +static int __devinit scx200_probe(struct pci_dev *pdev, const struct pci_device_id *ent) +{ + unsigned base; + + if (pdev->device == PCI_DEVICE_ID_NS_SCx200_BRIDGE || + pdev->device == PCI_DEVICE_ID_NS_SC1100_BRIDGE) { + base = pci_resource_start(pdev, 0); + printk(KERN_INFO NAME ": GPIO base 0x%x\n", base); + + if (!request_region(base, SCx200_GPIO_SIZE, "NatSemi SCx200 GPIO")) { + printk(KERN_ERR NAME ": can't allocate I/O for GPIOs\n"); + return -EBUSY; + } + + scx200_gpio_base = base; + scx200_init_shadow(); + + } else { + /* find the base of the Configuration Block */ + if (scx200_cb_probe(SCx200_CB_BASE_FIXED)) { + scx200_cb_base = SCx200_CB_BASE_FIXED; + } else { + pci_read_config_dword(pdev, SCx200_CBA_SCRATCH, &base); + if (scx200_cb_probe(base)) { + scx200_cb_base = base; + } else { + printk(KERN_WARNING NAME ": Configuration Block not found\n"); + return -ENODEV; + } + } + printk(KERN_INFO NAME ": Configuration Block base 0x%x\n", scx200_cb_base); + } + + return 0; +} + +u32 scx200_gpio_configure(unsigned index, u32 mask, u32 bits) +{ + u32 config, new_config; + + mutex_lock(&scx200_gpio_config_lock); + + outl(index, scx200_gpio_base + 0x20); + config = inl(scx200_gpio_base + 0x24); + + new_config = (config & mask) | bits; + outl(new_config, scx200_gpio_base + 0x24); + + mutex_unlock(&scx200_gpio_config_lock); + + return config; +} + +static int __init scx200_init(void) +{ + printk(KERN_INFO NAME ": NatSemi SCx200 Driver\n"); + + return pci_register_driver(&scx200_pci_driver); +} + +static void __exit scx200_cleanup(void) +{ + pci_unregister_driver(&scx200_pci_driver); + release_region(scx200_gpio_base, 
SCx200_GPIO_SIZE); +} + +module_init(scx200_init); +module_exit(scx200_cleanup); + +EXPORT_SYMBOL(scx200_gpio_base); +EXPORT_SYMBOL(scx200_gpio_shadow); +EXPORT_SYMBOL(scx200_gpio_configure); +EXPORT_SYMBOL(scx200_cb_base); -- cgit v1.2.3-18-g5258 From 9694d4afc1ebe1e46cacfb78b107cd8f9fb550ba Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 16 Oct 2010 10:38:13 +0200 Subject: x86: Move mrst to platform Signed-off-by: Thomas Gleixner Cc: Jacob Pan Cc: Alan Cox --- arch/x86/kernel/Makefile | 1 - arch/x86/kernel/mrst.c | 311 ---------------------------------------- arch/x86/platform/Makefile | 1 + arch/x86/platform/mrst/Makefile | 1 + arch/x86/platform/mrst/mrst.c | 311 ++++++++++++++++++++++++++++++++++++++++ 5 files changed, 313 insertions(+), 312 deletions(-) delete mode 100644 arch/x86/kernel/mrst.c create mode 100644 arch/x86/platform/mrst/Makefile create mode 100644 arch/x86/platform/mrst/mrst.c (limited to 'arch/x86') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index f57eeea4bc1..4e1f862dd68 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -104,7 +104,6 @@ obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o obj-$(CONFIG_OLPC) += olpc.o obj-$(CONFIG_OLPC_XO1) += olpc-xo1.o obj-$(CONFIG_OLPC_OPENFIRMWARE) += olpc_ofw.o -obj-$(CONFIG_X86_MRST) += mrst.o microcode-y := microcode_core.o microcode-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c deleted file mode 100644 index 79ae68154e8..00000000000 --- a/arch/x86/kernel/mrst.c +++ /dev/null @@ -1,311 +0,0 @@ -/* - * mrst.c: Intel Moorestown platform specific setup code - * - * (C) Copyright 2008 Intel Corporation - * Author: Jacob Pan (jacob.jun.pan@intel.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. - */ -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * the clockevent devices on Moorestown/Medfield can be APBT or LAPIC clock, - * cmdline option x86_mrst_timer can be used to override the configuration - * to prefer one or the other. - * at runtime, there are basically three timer configurations: - * 1. per cpu apbt clock only - * 2. per cpu always-on lapic clocks only, this is Penwell/Medfield only - * 3. per cpu lapic clock (C3STOP) and one apbt clock, with broadcast. - * - * by default (without cmdline option), platform code first detects cpu type - * to see if we are on lincroft or penwell, then set up both lapic or apbt - * clocks accordingly. - * i.e. by default, medfield uses configuration #2, moorestown uses #1. - * config #3 is supported but not recommended on medfield. 
- * - * rating and feature summary: - * lapic (with C3STOP) --------- 100 - * apbt (always-on) ------------ 110 - * lapic (always-on,ARAT) ------ 150 - */ - -__cpuinitdata enum mrst_timer_options mrst_timer_options; - -static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM]; -static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM]; -enum mrst_cpu_type __mrst_cpu_chip; -EXPORT_SYMBOL_GPL(__mrst_cpu_chip); - -int sfi_mtimer_num; - -struct sfi_rtc_table_entry sfi_mrtc_array[SFI_MRTC_MAX]; -EXPORT_SYMBOL_GPL(sfi_mrtc_array); -int sfi_mrtc_num; - -static inline void assign_to_mp_irq(struct mpc_intsrc *m, - struct mpc_intsrc *mp_irq) -{ - memcpy(mp_irq, m, sizeof(struct mpc_intsrc)); -} - -static inline int mp_irq_cmp(struct mpc_intsrc *mp_irq, - struct mpc_intsrc *m) -{ - return memcmp(mp_irq, m, sizeof(struct mpc_intsrc)); -} - -static void save_mp_irq(struct mpc_intsrc *m) -{ - int i; - - for (i = 0; i < mp_irq_entries; i++) { - if (!mp_irq_cmp(&mp_irqs[i], m)) - return; - } - - assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]); - if (++mp_irq_entries == MAX_IRQ_SOURCES) - panic("Max # of irq sources exceeded!!\n"); -} - -/* parse all the mtimer info to a static mtimer array */ -static int __init sfi_parse_mtmr(struct sfi_table_header *table) -{ - struct sfi_table_simple *sb; - struct sfi_timer_table_entry *pentry; - struct mpc_intsrc mp_irq; - int totallen; - - sb = (struct sfi_table_simple *)table; - if (!sfi_mtimer_num) { - sfi_mtimer_num = SFI_GET_NUM_ENTRIES(sb, - struct sfi_timer_table_entry); - pentry = (struct sfi_timer_table_entry *) sb->pentry; - totallen = sfi_mtimer_num * sizeof(*pentry); - memcpy(sfi_mtimer_array, pentry, totallen); - } - - printk(KERN_INFO "SFI: MTIMER info (num = %d):\n", sfi_mtimer_num); - pentry = sfi_mtimer_array; - for (totallen = 0; totallen < sfi_mtimer_num; totallen++, pentry++) { - printk(KERN_INFO "timer[%d]: paddr = 0x%08x, freq = %dHz," - " irq = %d\n", totallen, (u32)pentry->phys_addr, - pentry->freq_hz, pentry->irq); - if (!pentry->irq) - continue; - mp_irq.type = MP_IOAPIC; - mp_irq.irqtype = mp_INT; -/* triggering mode edge bit 2-3, active high polarity bit 0-1 */ - mp_irq.irqflag = 5; - mp_irq.srcbus = 0; - mp_irq.srcbusirq = pentry->irq; /* IRQ */ - mp_irq.dstapic = MP_APIC_ALL; - mp_irq.dstirq = pentry->irq; - save_mp_irq(&mp_irq); - } - - return 0; -} - -struct sfi_timer_table_entry *sfi_get_mtmr(int hint) -{ - int i; - if (hint < sfi_mtimer_num) { - if (!sfi_mtimer_usage[hint]) { - pr_debug("hint taken for timer %d irq %d\n",\ - hint, sfi_mtimer_array[hint].irq); - sfi_mtimer_usage[hint] = 1; - return &sfi_mtimer_array[hint]; - } - } - /* take the first timer available */ - for (i = 0; i < sfi_mtimer_num;) { - if (!sfi_mtimer_usage[i]) { - sfi_mtimer_usage[i] = 1; - return &sfi_mtimer_array[i]; - } - i++; - } - return NULL; -} - -void sfi_free_mtmr(struct sfi_timer_table_entry *mtmr) -{ - int i; - for (i = 0; i < sfi_mtimer_num;) { - if (mtmr->irq == sfi_mtimer_array[i].irq) { - sfi_mtimer_usage[i] = 0; - return; - } - i++; - } -} - -/* parse all the mrtc info to a global mrtc array */ -int __init sfi_parse_mrtc(struct sfi_table_header *table) -{ - struct sfi_table_simple *sb; - struct sfi_rtc_table_entry *pentry; - struct mpc_intsrc mp_irq; - - int totallen; - - sb = (struct sfi_table_simple *)table; - if (!sfi_mrtc_num) { - sfi_mrtc_num = SFI_GET_NUM_ENTRIES(sb, - struct sfi_rtc_table_entry); - pentry = (struct sfi_rtc_table_entry *)sb->pentry; - totallen = sfi_mrtc_num * sizeof(*pentry); - memcpy(sfi_mrtc_array, pentry, totallen); - } - 
- printk(KERN_INFO "SFI: RTC info (num = %d):\n", sfi_mrtc_num); - pentry = sfi_mrtc_array; - for (totallen = 0; totallen < sfi_mrtc_num; totallen++, pentry++) { - printk(KERN_INFO "RTC[%d]: paddr = 0x%08x, irq = %d\n", - totallen, (u32)pentry->phys_addr, pentry->irq); - mp_irq.type = MP_IOAPIC; - mp_irq.irqtype = mp_INT; - mp_irq.irqflag = 0; - mp_irq.srcbus = 0; - mp_irq.srcbusirq = pentry->irq; /* IRQ */ - mp_irq.dstapic = MP_APIC_ALL; - mp_irq.dstirq = pentry->irq; - save_mp_irq(&mp_irq); - } - return 0; -} - -static unsigned long __init mrst_calibrate_tsc(void) -{ - unsigned long flags, fast_calibrate; - - local_irq_save(flags); - fast_calibrate = apbt_quick_calibrate(); - local_irq_restore(flags); - - if (fast_calibrate) - return fast_calibrate; - - return 0; -} - -void __init mrst_time_init(void) -{ - switch (mrst_timer_options) { - case MRST_TIMER_APBT_ONLY: - break; - case MRST_TIMER_LAPIC_APBT: - x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock; - x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock; - break; - default: - if (!boot_cpu_has(X86_FEATURE_ARAT)) - break; - x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock; - x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock; - return; - } - /* we need at least one APB timer */ - sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr); - pre_init_apic_IRQ0(); - apbt_time_init(); -} - -void __init mrst_rtc_init(void) -{ - sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc); -} - -void __cpuinit mrst_arch_setup(void) -{ - if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x27) - __mrst_cpu_chip = MRST_CPU_CHIP_PENWELL; - else if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x26) - __mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT; - else { - pr_err("Unknown Moorestown CPU (%d:%d), default to Lincroft\n", - boot_cpu_data.x86, boot_cpu_data.x86_model); - __mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT; - } - pr_debug("Moorestown CPU %s identified\n", - (__mrst_cpu_chip == MRST_CPU_CHIP_LINCROFT) ? - "Lincroft" : "Penwell"); -} - -/* MID systems don't have i8042 controller */ -static int mrst_i8042_detect(void) -{ - return 0; -} - -/* - * Moorestown specific x86_init function overrides and early setup - * calls. - */ -void __init x86_mrst_early_setup(void) -{ - x86_init.resources.probe_roms = x86_init_noop; - x86_init.resources.reserve_resources = x86_init_noop; - - x86_init.timers.timer_init = mrst_time_init; - x86_init.timers.setup_percpu_clockev = x86_init_noop; - - x86_init.irqs.pre_vector_init = x86_init_noop; - - x86_init.oem.arch_setup = mrst_arch_setup; - - x86_cpuinit.setup_percpu_clockev = apbt_setup_secondary_clock; - - x86_platform.calibrate_tsc = mrst_calibrate_tsc; - x86_platform.i8042_detect = mrst_i8042_detect; - x86_init.pci.init = pci_mrst_init; - x86_init.pci.fixup_irqs = x86_init_noop; - - legacy_pic = &null_legacy_pic; - - /* Avoid searching for BIOS MP tables */ - x86_init.mpparse.find_smp_config = x86_init_noop; - x86_init.mpparse.get_smp_config = x86_init_uint_noop; - -} - -/* - * if user does not want to use per CPU apb timer, just give it a lower rating - * than local apic timer and skip the late per cpu timer init. 
- */ -static inline int __init setup_x86_mrst_timer(char *arg) -{ - if (!arg) - return -EINVAL; - - if (strcmp("apbt_only", arg) == 0) - mrst_timer_options = MRST_TIMER_APBT_ONLY; - else if (strcmp("lapic_and_apbt", arg) == 0) - mrst_timer_options = MRST_TIMER_LAPIC_APBT; - else { - pr_warning("X86 MRST timer option %s not recognised" - " use x86_mrst_timer=apbt_only or lapic_and_apbt\n", - arg); - return -EINVAL; - } - return 0; -} -__setup("x86_mrst_timer=", setup_x86_mrst_timer); diff --git a/arch/x86/platform/Makefile b/arch/x86/platform/Makefile index 1191989e9f0..06761ed5374 100644 --- a/arch/x86/platform/Makefile +++ b/arch/x86/platform/Makefile @@ -1,5 +1,6 @@ # Platform specific code goes here obj-y += efi/ +obj-y += mrst/ obj-y += scx200/ obj-y += sfi/ obj-y += visws/ diff --git a/arch/x86/platform/mrst/Makefile b/arch/x86/platform/mrst/Makefile new file mode 100644 index 00000000000..efbbc552fa9 --- /dev/null +++ b/arch/x86/platform/mrst/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_X86_MRST) += mrst.o diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c new file mode 100644 index 00000000000..79ae68154e8 --- /dev/null +++ b/arch/x86/platform/mrst/mrst.c @@ -0,0 +1,311 @@ +/* + * mrst.c: Intel Moorestown platform specific setup code + * + * (C) Copyright 2008 Intel Corporation + * Author: Jacob Pan (jacob.jun.pan@intel.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * the clockevent devices on Moorestown/Medfield can be APBT or LAPIC clock, + * cmdline option x86_mrst_timer can be used to override the configuration + * to prefer one or the other. + * at runtime, there are basically three timer configurations: + * 1. per cpu apbt clock only + * 2. per cpu always-on lapic clocks only, this is Penwell/Medfield only + * 3. per cpu lapic clock (C3STOP) and one apbt clock, with broadcast. + * + * by default (without cmdline option), platform code first detects cpu type + * to see if we are on lincroft or penwell, then set up both lapic or apbt + * clocks accordingly. + * i.e. by default, medfield uses configuration #2, moorestown uses #1. + * config #3 is supported but not recommended on medfield. 
+ * + * rating and feature summary: + * lapic (with C3STOP) --------- 100 + * apbt (always-on) ------------ 110 + * lapic (always-on,ARAT) ------ 150 + */ + +__cpuinitdata enum mrst_timer_options mrst_timer_options; + +static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM]; +static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM]; +enum mrst_cpu_type __mrst_cpu_chip; +EXPORT_SYMBOL_GPL(__mrst_cpu_chip); + +int sfi_mtimer_num; + +struct sfi_rtc_table_entry sfi_mrtc_array[SFI_MRTC_MAX]; +EXPORT_SYMBOL_GPL(sfi_mrtc_array); +int sfi_mrtc_num; + +static inline void assign_to_mp_irq(struct mpc_intsrc *m, + struct mpc_intsrc *mp_irq) +{ + memcpy(mp_irq, m, sizeof(struct mpc_intsrc)); +} + +static inline int mp_irq_cmp(struct mpc_intsrc *mp_irq, + struct mpc_intsrc *m) +{ + return memcmp(mp_irq, m, sizeof(struct mpc_intsrc)); +} + +static void save_mp_irq(struct mpc_intsrc *m) +{ + int i; + + for (i = 0; i < mp_irq_entries; i++) { + if (!mp_irq_cmp(&mp_irqs[i], m)) + return; + } + + assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]); + if (++mp_irq_entries == MAX_IRQ_SOURCES) + panic("Max # of irq sources exceeded!!\n"); +} + +/* parse all the mtimer info to a static mtimer array */ +static int __init sfi_parse_mtmr(struct sfi_table_header *table) +{ + struct sfi_table_simple *sb; + struct sfi_timer_table_entry *pentry; + struct mpc_intsrc mp_irq; + int totallen; + + sb = (struct sfi_table_simple *)table; + if (!sfi_mtimer_num) { + sfi_mtimer_num = SFI_GET_NUM_ENTRIES(sb, + struct sfi_timer_table_entry); + pentry = (struct sfi_timer_table_entry *) sb->pentry; + totallen = sfi_mtimer_num * sizeof(*pentry); + memcpy(sfi_mtimer_array, pentry, totallen); + } + + printk(KERN_INFO "SFI: MTIMER info (num = %d):\n", sfi_mtimer_num); + pentry = sfi_mtimer_array; + for (totallen = 0; totallen < sfi_mtimer_num; totallen++, pentry++) { + printk(KERN_INFO "timer[%d]: paddr = 0x%08x, freq = %dHz," + " irq = %d\n", totallen, (u32)pentry->phys_addr, + pentry->freq_hz, pentry->irq); + if (!pentry->irq) + continue; + mp_irq.type = MP_IOAPIC; + mp_irq.irqtype = mp_INT; +/* triggering mode edge bit 2-3, active high polarity bit 0-1 */ + mp_irq.irqflag = 5; + mp_irq.srcbus = 0; + mp_irq.srcbusirq = pentry->irq; /* IRQ */ + mp_irq.dstapic = MP_APIC_ALL; + mp_irq.dstirq = pentry->irq; + save_mp_irq(&mp_irq); + } + + return 0; +} + +struct sfi_timer_table_entry *sfi_get_mtmr(int hint) +{ + int i; + if (hint < sfi_mtimer_num) { + if (!sfi_mtimer_usage[hint]) { + pr_debug("hint taken for timer %d irq %d\n",\ + hint, sfi_mtimer_array[hint].irq); + sfi_mtimer_usage[hint] = 1; + return &sfi_mtimer_array[hint]; + } + } + /* take the first timer available */ + for (i = 0; i < sfi_mtimer_num;) { + if (!sfi_mtimer_usage[i]) { + sfi_mtimer_usage[i] = 1; + return &sfi_mtimer_array[i]; + } + i++; + } + return NULL; +} + +void sfi_free_mtmr(struct sfi_timer_table_entry *mtmr) +{ + int i; + for (i = 0; i < sfi_mtimer_num;) { + if (mtmr->irq == sfi_mtimer_array[i].irq) { + sfi_mtimer_usage[i] = 0; + return; + } + i++; + } +} + +/* parse all the mrtc info to a global mrtc array */ +int __init sfi_parse_mrtc(struct sfi_table_header *table) +{ + struct sfi_table_simple *sb; + struct sfi_rtc_table_entry *pentry; + struct mpc_intsrc mp_irq; + + int totallen; + + sb = (struct sfi_table_simple *)table; + if (!sfi_mrtc_num) { + sfi_mrtc_num = SFI_GET_NUM_ENTRIES(sb, + struct sfi_rtc_table_entry); + pentry = (struct sfi_rtc_table_entry *)sb->pentry; + totallen = sfi_mrtc_num * sizeof(*pentry); + memcpy(sfi_mrtc_array, pentry, totallen); + } + 
+ printk(KERN_INFO "SFI: RTC info (num = %d):\n", sfi_mrtc_num); + pentry = sfi_mrtc_array; + for (totallen = 0; totallen < sfi_mrtc_num; totallen++, pentry++) { + printk(KERN_INFO "RTC[%d]: paddr = 0x%08x, irq = %d\n", + totallen, (u32)pentry->phys_addr, pentry->irq); + mp_irq.type = MP_IOAPIC; + mp_irq.irqtype = mp_INT; + mp_irq.irqflag = 0; + mp_irq.srcbus = 0; + mp_irq.srcbusirq = pentry->irq; /* IRQ */ + mp_irq.dstapic = MP_APIC_ALL; + mp_irq.dstirq = pentry->irq; + save_mp_irq(&mp_irq); + } + return 0; +} + +static unsigned long __init mrst_calibrate_tsc(void) +{ + unsigned long flags, fast_calibrate; + + local_irq_save(flags); + fast_calibrate = apbt_quick_calibrate(); + local_irq_restore(flags); + + if (fast_calibrate) + return fast_calibrate; + + return 0; +} + +void __init mrst_time_init(void) +{ + switch (mrst_timer_options) { + case MRST_TIMER_APBT_ONLY: + break; + case MRST_TIMER_LAPIC_APBT: + x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock; + x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock; + break; + default: + if (!boot_cpu_has(X86_FEATURE_ARAT)) + break; + x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock; + x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock; + return; + } + /* we need at least one APB timer */ + sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr); + pre_init_apic_IRQ0(); + apbt_time_init(); +} + +void __init mrst_rtc_init(void) +{ + sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc); +} + +void __cpuinit mrst_arch_setup(void) +{ + if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x27) + __mrst_cpu_chip = MRST_CPU_CHIP_PENWELL; + else if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x26) + __mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT; + else { + pr_err("Unknown Moorestown CPU (%d:%d), default to Lincroft\n", + boot_cpu_data.x86, boot_cpu_data.x86_model); + __mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT; + } + pr_debug("Moorestown CPU %s identified\n", + (__mrst_cpu_chip == MRST_CPU_CHIP_LINCROFT) ? + "Lincroft" : "Penwell"); +} + +/* MID systems don't have i8042 controller */ +static int mrst_i8042_detect(void) +{ + return 0; +} + +/* + * Moorestown specific x86_init function overrides and early setup + * calls. + */ +void __init x86_mrst_early_setup(void) +{ + x86_init.resources.probe_roms = x86_init_noop; + x86_init.resources.reserve_resources = x86_init_noop; + + x86_init.timers.timer_init = mrst_time_init; + x86_init.timers.setup_percpu_clockev = x86_init_noop; + + x86_init.irqs.pre_vector_init = x86_init_noop; + + x86_init.oem.arch_setup = mrst_arch_setup; + + x86_cpuinit.setup_percpu_clockev = apbt_setup_secondary_clock; + + x86_platform.calibrate_tsc = mrst_calibrate_tsc; + x86_platform.i8042_detect = mrst_i8042_detect; + x86_init.pci.init = pci_mrst_init; + x86_init.pci.fixup_irqs = x86_init_noop; + + legacy_pic = &null_legacy_pic; + + /* Avoid searching for BIOS MP tables */ + x86_init.mpparse.find_smp_config = x86_init_noop; + x86_init.mpparse.get_smp_config = x86_init_uint_noop; + +} + +/* + * if user does not want to use per CPU apb timer, just give it a lower rating + * than local apic timer and skip the late per cpu timer init. 
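(The override described in this comment is given on the kernel command line; as the warning text in setup_x86_mrst_timer() below spells out, the recognised forms are x86_mrst_timer=apbt_only and x86_mrst_timer=lapic_and_apbt, and any other value is rejected so the CPU-type based default selection described above stays in effect.)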
+ */ +static inline int __init setup_x86_mrst_timer(char *arg) +{ + if (!arg) + return -EINVAL; + + if (strcmp("apbt_only", arg) == 0) + mrst_timer_options = MRST_TIMER_APBT_ONLY; + else if (strcmp("lapic_and_apbt", arg) == 0) + mrst_timer_options = MRST_TIMER_LAPIC_APBT; + else { + pr_warning("X86 MRST timer option %s not recognised" + " use x86_mrst_timer=apbt_only or lapic_and_apbt\n", + arg); + return -EINVAL; + } + return 0; +} +__setup("x86_mrst_timer=", setup_x86_mrst_timer); -- cgit v1.2.3-18-g5258 From 329b84e42e3ee348b114fd0bfe4b2421e6139257 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 23 Oct 2010 11:23:37 +0200 Subject: x86: Move uv to platform Signed-off-by: Thomas Gleixner Cc: Mike Travis --- arch/x86/kernel/Makefile | 1 - arch/x86/kernel/bios_uv.c | 215 ----- arch/x86/kernel/tlb_uv.c | 1661 --------------------------------------- arch/x86/kernel/uv_irq.c | 285 ------- arch/x86/kernel/uv_sysfs.c | 76 -- arch/x86/kernel/uv_time.c | 423 ---------- arch/x86/platform/Makefile | 1 + arch/x86/platform/uv/Makefile | 1 + arch/x86/platform/uv/bios_uv.c | 215 +++++ arch/x86/platform/uv/tlb_uv.c | 1661 +++++++++++++++++++++++++++++++++++++++ arch/x86/platform/uv/uv_irq.c | 285 +++++++ arch/x86/platform/uv/uv_sysfs.c | 76 ++ arch/x86/platform/uv/uv_time.c | 423 ++++++++++ 13 files changed, 2662 insertions(+), 2661 deletions(-) delete mode 100644 arch/x86/kernel/bios_uv.c delete mode 100644 arch/x86/kernel/tlb_uv.c delete mode 100644 arch/x86/kernel/uv_irq.c delete mode 100644 arch/x86/kernel/uv_sysfs.c delete mode 100644 arch/x86/kernel/uv_time.c create mode 100644 arch/x86/platform/uv/Makefile create mode 100644 arch/x86/platform/uv/bios_uv.c create mode 100644 arch/x86/platform/uv/tlb_uv.c create mode 100644 arch/x86/platform/uv/uv_irq.c create mode 100644 arch/x86/platform/uv/uv_sysfs.c create mode 100644 arch/x86/platform/uv/uv_time.c (limited to 'arch/x86') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 4e1f862dd68..08e2e4bf839 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -117,7 +117,6 @@ obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o ### # 64 bit specific files ifeq ($(CONFIG_X86_64),y) - obj-$(CONFIG_X86_UV) += tlb_uv.o bios_uv.o uv_irq.o uv_sysfs.o uv_time.o obj-$(CONFIG_AUDIT) += audit_64.o obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c deleted file mode 100644 index 8bc57baaa9a..00000000000 --- a/arch/x86/kernel/bios_uv.c +++ /dev/null @@ -1,215 +0,0 @@ -/* - * BIOS run time interface routines. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - * Copyright (c) 2008-2009 Silicon Graphics, Inc. All Rights Reserved. 
- * Copyright (c) Russ Anderson - */ - -#include -#include -#include -#include -#include - -static struct uv_systab uv_systab; - -s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5) -{ - struct uv_systab *tab = &uv_systab; - s64 ret; - - if (!tab->function) - /* - * BIOS does not support UV systab - */ - return BIOS_STATUS_UNIMPLEMENTED; - - ret = efi_call6((void *)__va(tab->function), (u64)which, - a1, a2, a3, a4, a5); - return ret; -} -EXPORT_SYMBOL_GPL(uv_bios_call); - -s64 uv_bios_call_irqsave(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, - u64 a4, u64 a5) -{ - unsigned long bios_flags; - s64 ret; - - local_irq_save(bios_flags); - ret = uv_bios_call(which, a1, a2, a3, a4, a5); - local_irq_restore(bios_flags); - - return ret; -} - -s64 uv_bios_call_reentrant(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, - u64 a4, u64 a5) -{ - s64 ret; - - preempt_disable(); - ret = uv_bios_call(which, a1, a2, a3, a4, a5); - preempt_enable(); - - return ret; -} - - -long sn_partition_id; -EXPORT_SYMBOL_GPL(sn_partition_id); -long sn_coherency_id; -EXPORT_SYMBOL_GPL(sn_coherency_id); -long sn_region_size; -EXPORT_SYMBOL_GPL(sn_region_size); -long system_serial_number; -EXPORT_SYMBOL_GPL(system_serial_number); -int uv_type; -EXPORT_SYMBOL_GPL(uv_type); - - -s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher, - long *region, long *ssn) -{ - s64 ret; - u64 v0, v1; - union partition_info_u part; - - ret = uv_bios_call_irqsave(UV_BIOS_GET_SN_INFO, fc, - (u64)(&v0), (u64)(&v1), 0, 0); - if (ret != BIOS_STATUS_SUCCESS) - return ret; - - part.val = v0; - if (uvtype) - *uvtype = part.hub_version; - if (partid) - *partid = part.partition_id; - if (coher) - *coher = part.coherence_id; - if (region) - *region = part.region_size; - if (ssn) - *ssn = v1; - return ret; -} -EXPORT_SYMBOL_GPL(uv_bios_get_sn_info); - -int -uv_bios_mq_watchlist_alloc(unsigned long addr, unsigned int mq_size, - unsigned long *intr_mmr_offset) -{ - u64 watchlist; - s64 ret; - - /* - * bios returns watchlist number or negative error number. 
- */ - ret = (int)uv_bios_call_irqsave(UV_BIOS_WATCHLIST_ALLOC, addr, - mq_size, (u64)intr_mmr_offset, - (u64)&watchlist, 0); - if (ret < BIOS_STATUS_SUCCESS) - return ret; - - return watchlist; -} -EXPORT_SYMBOL_GPL(uv_bios_mq_watchlist_alloc); - -int -uv_bios_mq_watchlist_free(int blade, int watchlist_num) -{ - return (int)uv_bios_call_irqsave(UV_BIOS_WATCHLIST_FREE, - blade, watchlist_num, 0, 0, 0); -} -EXPORT_SYMBOL_GPL(uv_bios_mq_watchlist_free); - -s64 -uv_bios_change_memprotect(u64 paddr, u64 len, enum uv_memprotect perms) -{ - return uv_bios_call_irqsave(UV_BIOS_MEMPROTECT, paddr, len, - perms, 0, 0); -} -EXPORT_SYMBOL_GPL(uv_bios_change_memprotect); - -s64 -uv_bios_reserved_page_pa(u64 buf, u64 *cookie, u64 *addr, u64 *len) -{ - s64 ret; - - ret = uv_bios_call_irqsave(UV_BIOS_GET_PARTITION_ADDR, (u64)cookie, - (u64)addr, buf, (u64)len, 0); - return ret; -} -EXPORT_SYMBOL_GPL(uv_bios_reserved_page_pa); - -s64 uv_bios_freq_base(u64 clock_type, u64 *ticks_per_second) -{ - return uv_bios_call(UV_BIOS_FREQ_BASE, clock_type, - (u64)ticks_per_second, 0, 0, 0); -} -EXPORT_SYMBOL_GPL(uv_bios_freq_base); - -/* - * uv_bios_set_legacy_vga_target - Set Legacy VGA I/O Target - * @decode: true to enable target, false to disable target - * @domain: PCI domain number - * @bus: PCI bus number - * - * Returns: - * 0: Success - * -EINVAL: Invalid domain or bus number - * -ENOSYS: Capability not available - * -EBUSY: Legacy VGA I/O cannot be retargeted at this time - */ -int uv_bios_set_legacy_vga_target(bool decode, int domain, int bus) -{ - return uv_bios_call(UV_BIOS_SET_LEGACY_VGA_TARGET, - (u64)decode, (u64)domain, (u64)bus, 0, 0); -} -EXPORT_SYMBOL_GPL(uv_bios_set_legacy_vga_target); - - -#ifdef CONFIG_EFI -void uv_bios_init(void) -{ - struct uv_systab *tab; - - if ((efi.uv_systab == EFI_INVALID_TABLE_ADDR) || - (efi.uv_systab == (unsigned long)NULL)) { - printk(KERN_CRIT "No EFI UV System Table.\n"); - uv_systab.function = (unsigned long)NULL; - return; - } - - tab = (struct uv_systab *)ioremap(efi.uv_systab, - sizeof(struct uv_systab)); - if (strncmp(tab->signature, "UVST", 4) != 0) - printk(KERN_ERR "bad signature in UV system table!"); - - /* - * Copy table to permanent spot for later use. - */ - memcpy(&uv_systab, tab, sizeof(struct uv_systab)); - iounmap(tab); - - printk(KERN_INFO "EFI UV System Table Revision %d\n", - uv_systab.revision); -} -#else /* !CONFIG_EFI */ - -void uv_bios_init(void) { } -#endif diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c deleted file mode 100644 index 20ea20a39e2..00000000000 --- a/arch/x86/kernel/tlb_uv.c +++ /dev/null @@ -1,1661 +0,0 @@ -/* - * SGI UltraViolet TLB flush routines. - * - * (c) 2008-2010 Cliff Wickman , SGI. - * - * This code is released under the GNU General Public License version 2 or - * later. 
- */ -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* timeouts in nanoseconds (indexed by UVH_AGING_PRESCALE_SEL urgency7 30:28) */ -static int timeout_base_ns[] = { - 20, - 160, - 1280, - 10240, - 81920, - 655360, - 5242880, - 167772160 -}; -static int timeout_us; -static int nobau; -static int baudisabled; -static spinlock_t disable_lock; -static cycles_t congested_cycles; - -/* tunables: */ -static int max_bau_concurrent = MAX_BAU_CONCURRENT; -static int max_bau_concurrent_constant = MAX_BAU_CONCURRENT; -static int plugged_delay = PLUGGED_DELAY; -static int plugsb4reset = PLUGSB4RESET; -static int timeoutsb4reset = TIMEOUTSB4RESET; -static int ipi_reset_limit = IPI_RESET_LIMIT; -static int complete_threshold = COMPLETE_THRESHOLD; -static int congested_response_us = CONGESTED_RESPONSE_US; -static int congested_reps = CONGESTED_REPS; -static int congested_period = CONGESTED_PERIOD; -static struct dentry *tunables_dir; -static struct dentry *tunables_file; - -static int __init setup_nobau(char *arg) -{ - nobau = 1; - return 0; -} -early_param("nobau", setup_nobau); - -/* base pnode in this partition */ -static int uv_partition_base_pnode __read_mostly; -/* position of pnode (which is nasid>>1): */ -static int uv_nshift __read_mostly; -static unsigned long uv_mmask __read_mostly; - -static DEFINE_PER_CPU(struct ptc_stats, ptcstats); -static DEFINE_PER_CPU(struct bau_control, bau_control); -static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask); - -/* - * Determine the first node on a uvhub. 'Nodes' are used for kernel - * memory allocation. - */ -static int __init uvhub_to_first_node(int uvhub) -{ - int node, b; - - for_each_online_node(node) { - b = uv_node_to_blade_id(node); - if (uvhub == b) - return node; - } - return -1; -} - -/* - * Determine the apicid of the first cpu on a uvhub. - */ -static int __init uvhub_to_first_apicid(int uvhub) -{ - int cpu; - - for_each_present_cpu(cpu) - if (uvhub == uv_cpu_to_blade_id(cpu)) - return per_cpu(x86_cpu_to_apicid, cpu); - return -1; -} - -/* - * Free a software acknowledge hardware resource by clearing its Pending - * bit. This will return a reply to the sender. - * If the message has timed out, a reply has already been sent by the - * hardware but the resource has not been released. In that case our - * clear of the Timeout bit (as well) will free the resource. No reply will - * be sent (the hardware will only do one reply per message). 
- */ -static inline void uv_reply_to_message(struct msg_desc *mdp, - struct bau_control *bcp) -{ - unsigned long dw; - struct bau_payload_queue_entry *msg; - - msg = mdp->msg; - if (!msg->canceled) { - dw = (msg->sw_ack_vector << UV_SW_ACK_NPENDING) | - msg->sw_ack_vector; - uv_write_local_mmr( - UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, dw); - } - msg->replied_to = 1; - msg->sw_ack_vector = 0; -} - -/* - * Process the receipt of a RETRY message - */ -static inline void uv_bau_process_retry_msg(struct msg_desc *mdp, - struct bau_control *bcp) -{ - int i; - int cancel_count = 0; - int slot2; - unsigned long msg_res; - unsigned long mmr = 0; - struct bau_payload_queue_entry *msg; - struct bau_payload_queue_entry *msg2; - struct ptc_stats *stat; - - msg = mdp->msg; - stat = bcp->statp; - stat->d_retries++; - /* - * cancel any message from msg+1 to the retry itself - */ - for (msg2 = msg+1, i = 0; i < DEST_Q_SIZE; msg2++, i++) { - if (msg2 > mdp->va_queue_last) - msg2 = mdp->va_queue_first; - if (msg2 == msg) - break; - - /* same conditions for cancellation as uv_do_reset */ - if ((msg2->replied_to == 0) && (msg2->canceled == 0) && - (msg2->sw_ack_vector) && ((msg2->sw_ack_vector & - msg->sw_ack_vector) == 0) && - (msg2->sending_cpu == msg->sending_cpu) && - (msg2->msg_type != MSG_NOOP)) { - slot2 = msg2 - mdp->va_queue_first; - mmr = uv_read_local_mmr - (UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE); - msg_res = msg2->sw_ack_vector; - /* - * This is a message retry; clear the resources held - * by the previous message only if they timed out. - * If it has not timed out we have an unexpected - * situation to report. - */ - if (mmr & (msg_res << UV_SW_ACK_NPENDING)) { - /* - * is the resource timed out? - * make everyone ignore the cancelled message. - */ - msg2->canceled = 1; - stat->d_canceled++; - cancel_count++; - uv_write_local_mmr( - UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, - (msg_res << UV_SW_ACK_NPENDING) | - msg_res); - } - } - } - if (!cancel_count) - stat->d_nocanceled++; -} - -/* - * Do all the things a cpu should do for a TLB shootdown message. - * Other cpu's may come here at the same time for this message. - */ -static void uv_bau_process_message(struct msg_desc *mdp, - struct bau_control *bcp) -{ - int msg_ack_count; - short socket_ack_count = 0; - struct ptc_stats *stat; - struct bau_payload_queue_entry *msg; - struct bau_control *smaster = bcp->socket_master; - - /* - * This must be a normal message, or retry of a normal message - */ - msg = mdp->msg; - stat = bcp->statp; - if (msg->address == TLB_FLUSH_ALL) { - local_flush_tlb(); - stat->d_alltlb++; - } else { - __flush_tlb_one(msg->address); - stat->d_onetlb++; - } - stat->d_requestee++; - - /* - * One cpu on each uvhub has the additional job on a RETRY - * of releasing the resource held by the message that is - * being retried. That message is identified by sending - * cpu number. - */ - if (msg->msg_type == MSG_RETRY && bcp == bcp->uvhub_master) - uv_bau_process_retry_msg(mdp, bcp); - - /* - * This is a sw_ack message, so we have to reply to it. - * Count each responding cpu on the socket. This avoids - * pinging the count's cache line back and forth between - * the sockets. - */ - socket_ack_count = atomic_add_short_return(1, (struct atomic_short *) - &smaster->socket_acknowledge_count[mdp->msg_slot]); - if (socket_ack_count == bcp->cpus_in_socket) { - /* - * Both sockets dump their completed count total into - * the message's count. 
- */ - smaster->socket_acknowledge_count[mdp->msg_slot] = 0; - msg_ack_count = atomic_add_short_return(socket_ack_count, - (struct atomic_short *)&msg->acknowledge_count); - - if (msg_ack_count == bcp->cpus_in_uvhub) { - /* - * All cpus in uvhub saw it; reply - */ - uv_reply_to_message(mdp, bcp); - } - } - - return; -} - -/* - * Determine the first cpu on a uvhub. - */ -static int uvhub_to_first_cpu(int uvhub) -{ - int cpu; - for_each_present_cpu(cpu) - if (uvhub == uv_cpu_to_blade_id(cpu)) - return cpu; - return -1; -} - -/* - * Last resort when we get a large number of destination timeouts is - * to clear resources held by a given cpu. - * Do this with IPI so that all messages in the BAU message queue - * can be identified by their nonzero sw_ack_vector field. - * - * This is entered for a single cpu on the uvhub. - * The sender want's this uvhub to free a specific message's - * sw_ack resources. - */ -static void -uv_do_reset(void *ptr) -{ - int i; - int slot; - int count = 0; - unsigned long mmr; - unsigned long msg_res; - struct bau_control *bcp; - struct reset_args *rap; - struct bau_payload_queue_entry *msg; - struct ptc_stats *stat; - - bcp = &per_cpu(bau_control, smp_processor_id()); - rap = (struct reset_args *)ptr; - stat = bcp->statp; - stat->d_resets++; - - /* - * We're looking for the given sender, and - * will free its sw_ack resource. - * If all cpu's finally responded after the timeout, its - * message 'replied_to' was set. - */ - for (msg = bcp->va_queue_first, i = 0; i < DEST_Q_SIZE; msg++, i++) { - /* uv_do_reset: same conditions for cancellation as - uv_bau_process_retry_msg() */ - if ((msg->replied_to == 0) && - (msg->canceled == 0) && - (msg->sending_cpu == rap->sender) && - (msg->sw_ack_vector) && - (msg->msg_type != MSG_NOOP)) { - /* - * make everyone else ignore this message - */ - msg->canceled = 1; - slot = msg - bcp->va_queue_first; - count++; - /* - * only reset the resource if it is still pending - */ - mmr = uv_read_local_mmr - (UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE); - msg_res = msg->sw_ack_vector; - if (mmr & msg_res) { - stat->d_rcanceled++; - uv_write_local_mmr( - UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, - (msg_res << UV_SW_ACK_NPENDING) | - msg_res); - } - } - } - return; -} - -/* - * Use IPI to get all target uvhubs to release resources held by - * a given sending cpu number. 
- */ -static void uv_reset_with_ipi(struct bau_target_uvhubmask *distribution, - int sender) -{ - int uvhub; - int cpu; - cpumask_t mask; - struct reset_args reset_args; - - reset_args.sender = sender; - - cpus_clear(mask); - /* find a single cpu for each uvhub in this distribution mask */ - for (uvhub = 0; - uvhub < sizeof(struct bau_target_uvhubmask) * BITSPERBYTE; - uvhub++) { - if (!bau_uvhub_isset(uvhub, distribution)) - continue; - /* find a cpu for this uvhub */ - cpu = uvhub_to_first_cpu(uvhub); - cpu_set(cpu, mask); - } - /* IPI all cpus; Preemption is already disabled */ - smp_call_function_many(&mask, uv_do_reset, (void *)&reset_args, 1); - return; -} - -static inline unsigned long -cycles_2_us(unsigned long long cyc) -{ - unsigned long long ns; - unsigned long us; - ns = (cyc * per_cpu(cyc2ns, smp_processor_id())) - >> CYC2NS_SCALE_FACTOR; - us = ns / 1000; - return us; -} - -/* - * wait for all cpus on this hub to finish their sends and go quiet - * leaves uvhub_quiesce set so that no new broadcasts are started by - * bau_flush_send_and_wait() - */ -static inline void -quiesce_local_uvhub(struct bau_control *hmaster) -{ - atomic_add_short_return(1, (struct atomic_short *) - &hmaster->uvhub_quiesce); -} - -/* - * mark this quiet-requestor as done - */ -static inline void -end_uvhub_quiesce(struct bau_control *hmaster) -{ - atomic_add_short_return(-1, (struct atomic_short *) - &hmaster->uvhub_quiesce); -} - -/* - * Wait for completion of a broadcast software ack message - * return COMPLETE, RETRY(PLUGGED or TIMEOUT) or GIVEUP - */ -static int uv_wait_completion(struct bau_desc *bau_desc, - unsigned long mmr_offset, int right_shift, int this_cpu, - struct bau_control *bcp, struct bau_control *smaster, long try) -{ - unsigned long descriptor_status; - cycles_t ttime; - struct ptc_stats *stat = bcp->statp; - struct bau_control *hmaster; - - hmaster = bcp->uvhub_master; - - /* spin on the status MMR, waiting for it to go idle */ - while ((descriptor_status = (((unsigned long) - uv_read_local_mmr(mmr_offset) >> - right_shift) & UV_ACT_STATUS_MASK)) != - DESC_STATUS_IDLE) { - /* - * Our software ack messages may be blocked because there are - * no swack resources available. As long as none of them - * has timed out hardware will NACK our message and its - * state will stay IDLE. - */ - if (descriptor_status == DESC_STATUS_SOURCE_TIMEOUT) { - stat->s_stimeout++; - return FLUSH_GIVEUP; - } else if (descriptor_status == - DESC_STATUS_DESTINATION_TIMEOUT) { - stat->s_dtimeout++; - ttime = get_cycles(); - - /* - * Our retries may be blocked by all destination - * swack resources being consumed, and a timeout - * pending. In that case hardware returns the - * ERROR that looks like a destination timeout. 
- */ - if (cycles_2_us(ttime - bcp->send_message) < - timeout_us) { - bcp->conseccompletes = 0; - return FLUSH_RETRY_PLUGGED; - } - - bcp->conseccompletes = 0; - return FLUSH_RETRY_TIMEOUT; - } else { - /* - * descriptor_status is still BUSY - */ - cpu_relax(); - } - } - bcp->conseccompletes++; - return FLUSH_COMPLETE; -} - -static inline cycles_t -sec_2_cycles(unsigned long sec) -{ - unsigned long ns; - cycles_t cyc; - - ns = sec * 1000000000; - cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id())); - return cyc; -} - -/* - * conditionally add 1 to *v, unless *v is >= u - * return 0 if we cannot add 1 to *v because it is >= u - * return 1 if we can add 1 to *v because it is < u - * the add is atomic - * - * This is close to atomic_add_unless(), but this allows the 'u' value - * to be lowered below the current 'v'. atomic_add_unless can only stop - * on equal. - */ -static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u) -{ - spin_lock(lock); - if (atomic_read(v) >= u) { - spin_unlock(lock); - return 0; - } - atomic_inc(v); - spin_unlock(lock); - return 1; -} - -/* - * Our retries are blocked by all destination swack resources being - * in use, and a timeout is pending. In that case hardware immediately - * returns the ERROR that looks like a destination timeout. - */ -static void -destination_plugged(struct bau_desc *bau_desc, struct bau_control *bcp, - struct bau_control *hmaster, struct ptc_stats *stat) -{ - udelay(bcp->plugged_delay); - bcp->plugged_tries++; - if (bcp->plugged_tries >= bcp->plugsb4reset) { - bcp->plugged_tries = 0; - quiesce_local_uvhub(hmaster); - spin_lock(&hmaster->queue_lock); - uv_reset_with_ipi(&bau_desc->distribution, bcp->cpu); - spin_unlock(&hmaster->queue_lock); - end_uvhub_quiesce(hmaster); - bcp->ipi_attempts++; - stat->s_resets_plug++; - } -} - -static void -destination_timeout(struct bau_desc *bau_desc, struct bau_control *bcp, - struct bau_control *hmaster, struct ptc_stats *stat) -{ - hmaster->max_bau_concurrent = 1; - bcp->timeout_tries++; - if (bcp->timeout_tries >= bcp->timeoutsb4reset) { - bcp->timeout_tries = 0; - quiesce_local_uvhub(hmaster); - spin_lock(&hmaster->queue_lock); - uv_reset_with_ipi(&bau_desc->distribution, bcp->cpu); - spin_unlock(&hmaster->queue_lock); - end_uvhub_quiesce(hmaster); - bcp->ipi_attempts++; - stat->s_resets_timeout++; - } -} - -/* - * Completions are taking a very long time due to a congested numalink - * network. - */ -static void -disable_for_congestion(struct bau_control *bcp, struct ptc_stats *stat) -{ - int tcpu; - struct bau_control *tbcp; - - /* let only one cpu do this disabling */ - spin_lock(&disable_lock); - if (!baudisabled && bcp->period_requests && - ((bcp->period_time / bcp->period_requests) > congested_cycles)) { - /* it becomes this cpu's job to turn on the use of the - BAU again */ - baudisabled = 1; - bcp->set_bau_off = 1; - bcp->set_bau_on_time = get_cycles() + - sec_2_cycles(bcp->congested_period); - stat->s_bau_disabled++; - for_each_present_cpu(tcpu) { - tbcp = &per_cpu(bau_control, tcpu); - tbcp->baudisabled = 1; - } - } - spin_unlock(&disable_lock); -} - -/** - * uv_flush_send_and_wait - * - * Send a broadcast and wait for it to complete. - * - * The flush_mask contains the cpus the broadcast is to be sent to including - * cpus that are on the local uvhub. - * - * Returns 0 if all flushing represented in the mask was done. - * Returns 1 if it gives up entirely and the original cpu mask is to be - * returned to the kernel. 
- */ -int uv_flush_send_and_wait(struct bau_desc *bau_desc, - struct cpumask *flush_mask, struct bau_control *bcp) -{ - int right_shift; - int completion_status = 0; - int seq_number = 0; - long try = 0; - int cpu = bcp->uvhub_cpu; - int this_cpu = bcp->cpu; - unsigned long mmr_offset; - unsigned long index; - cycles_t time1; - cycles_t time2; - cycles_t elapsed; - struct ptc_stats *stat = bcp->statp; - struct bau_control *smaster = bcp->socket_master; - struct bau_control *hmaster = bcp->uvhub_master; - - if (!atomic_inc_unless_ge(&hmaster->uvhub_lock, - &hmaster->active_descriptor_count, - hmaster->max_bau_concurrent)) { - stat->s_throttles++; - do { - cpu_relax(); - } while (!atomic_inc_unless_ge(&hmaster->uvhub_lock, - &hmaster->active_descriptor_count, - hmaster->max_bau_concurrent)); - } - while (hmaster->uvhub_quiesce) - cpu_relax(); - - if (cpu < UV_CPUS_PER_ACT_STATUS) { - mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0; - right_shift = cpu * UV_ACT_STATUS_SIZE; - } else { - mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1; - right_shift = - ((cpu - UV_CPUS_PER_ACT_STATUS) * UV_ACT_STATUS_SIZE); - } - time1 = get_cycles(); - do { - if (try == 0) { - bau_desc->header.msg_type = MSG_REGULAR; - seq_number = bcp->message_number++; - } else { - bau_desc->header.msg_type = MSG_RETRY; - stat->s_retry_messages++; - } - bau_desc->header.sequence = seq_number; - index = (1UL << UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT) | - bcp->uvhub_cpu; - bcp->send_message = get_cycles(); - uv_write_local_mmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index); - try++; - completion_status = uv_wait_completion(bau_desc, mmr_offset, - right_shift, this_cpu, bcp, smaster, try); - - if (completion_status == FLUSH_RETRY_PLUGGED) { - destination_plugged(bau_desc, bcp, hmaster, stat); - } else if (completion_status == FLUSH_RETRY_TIMEOUT) { - destination_timeout(bau_desc, bcp, hmaster, stat); - } - if (bcp->ipi_attempts >= bcp->ipi_reset_limit) { - bcp->ipi_attempts = 0; - completion_status = FLUSH_GIVEUP; - break; - } - cpu_relax(); - } while ((completion_status == FLUSH_RETRY_PLUGGED) || - (completion_status == FLUSH_RETRY_TIMEOUT)); - time2 = get_cycles(); - bcp->plugged_tries = 0; - bcp->timeout_tries = 0; - if ((completion_status == FLUSH_COMPLETE) && - (bcp->conseccompletes > bcp->complete_threshold) && - (hmaster->max_bau_concurrent < - hmaster->max_bau_concurrent_constant)) - hmaster->max_bau_concurrent++; - while (hmaster->uvhub_quiesce) - cpu_relax(); - atomic_dec(&hmaster->active_descriptor_count); - if (time2 > time1) { - elapsed = time2 - time1; - stat->s_time += elapsed; - if ((completion_status == FLUSH_COMPLETE) && (try == 1)) { - bcp->period_requests++; - bcp->period_time += elapsed; - if ((elapsed > congested_cycles) && - (bcp->period_requests > bcp->congested_reps)) { - disable_for_congestion(bcp, stat); - } - } - } else - stat->s_requestor--; - if (completion_status == FLUSH_COMPLETE && try > 1) - stat->s_retriesok++; - else if (completion_status == FLUSH_GIVEUP) { - stat->s_giveup++; - return 1; - } - return 0; -} - -/** - * uv_flush_tlb_others - globally purge translation cache of a virtual - * address or all TLB's - * @cpumask: mask of all cpu's in which the address is to be removed - * @mm: mm_struct containing virtual address range - * @va: virtual address to be removed (or TLB_FLUSH_ALL for all TLB's on cpu) - * @cpu: the current cpu - * - * This is the entry point for initiating any UV global TLB shootdown. 
- * - * Purges the translation caches of all specified processors of the given - * virtual address, or purges all TLB's on specified processors. - * - * The caller has derived the cpumask from the mm_struct. This function - * is called only if there are bits set in the mask. (e.g. flush_tlb_page()) - * - * The cpumask is converted into a uvhubmask of the uvhubs containing - * those cpus. - * - * Note that this function should be called with preemption disabled. - * - * Returns NULL if all remote flushing was done. - * Returns pointer to cpumask if some remote flushing remains to be - * done. The returned pointer is valid till preemption is re-enabled. - */ -const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, - struct mm_struct *mm, - unsigned long va, unsigned int cpu) -{ - int tcpu; - int uvhub; - int locals = 0; - int remotes = 0; - int hubs = 0; - struct bau_desc *bau_desc; - struct cpumask *flush_mask; - struct ptc_stats *stat; - struct bau_control *bcp; - struct bau_control *tbcp; - - /* kernel was booted 'nobau' */ - if (nobau) - return cpumask; - - bcp = &per_cpu(bau_control, cpu); - stat = bcp->statp; - - /* bau was disabled due to slow response */ - if (bcp->baudisabled) { - /* the cpu that disabled it must re-enable it */ - if (bcp->set_bau_off) { - if (get_cycles() >= bcp->set_bau_on_time) { - stat->s_bau_reenabled++; - baudisabled = 0; - for_each_present_cpu(tcpu) { - tbcp = &per_cpu(bau_control, tcpu); - tbcp->baudisabled = 0; - tbcp->period_requests = 0; - tbcp->period_time = 0; - } - } - } - return cpumask; - } - - /* - * Each sending cpu has a per-cpu mask which it fills from the caller's - * cpu mask. All cpus are converted to uvhubs and copied to the - * activation descriptor. - */ - flush_mask = (struct cpumask *)per_cpu(uv_flush_tlb_mask, cpu); - /* don't actually do a shootdown of the local cpu */ - cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu)); - if (cpu_isset(cpu, *cpumask)) - stat->s_ntargself++; - - bau_desc = bcp->descriptor_base; - bau_desc += UV_ITEMS_PER_DESCRIPTOR * bcp->uvhub_cpu; - bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE); - - /* cpu statistics */ - for_each_cpu(tcpu, flush_mask) { - uvhub = uv_cpu_to_blade_id(tcpu); - bau_uvhub_set(uvhub, &bau_desc->distribution); - if (uvhub == bcp->uvhub) - locals++; - else - remotes++; - } - if ((locals + remotes) == 0) - return NULL; - stat->s_requestor++; - stat->s_ntargcpu += remotes + locals; - stat->s_ntargremotes += remotes; - stat->s_ntarglocals += locals; - remotes = bau_uvhub_weight(&bau_desc->distribution); - - /* uvhub statistics */ - hubs = bau_uvhub_weight(&bau_desc->distribution); - if (locals) { - stat->s_ntarglocaluvhub++; - stat->s_ntargremoteuvhub += (hubs - 1); - } else - stat->s_ntargremoteuvhub += hubs; - stat->s_ntarguvhub += hubs; - if (hubs >= 16) - stat->s_ntarguvhub16++; - else if (hubs >= 8) - stat->s_ntarguvhub8++; - else if (hubs >= 4) - stat->s_ntarguvhub4++; - else if (hubs >= 2) - stat->s_ntarguvhub2++; - else - stat->s_ntarguvhub1++; - - bau_desc->payload.address = va; - bau_desc->payload.sending_cpu = cpu; - - /* - * uv_flush_send_and_wait returns 0 if all cpu's were messaged, - * or 1 if it gave up and the original cpumask should be returned. - */ - if (!uv_flush_send_and_wait(bau_desc, flush_mask, bcp)) - return NULL; - else - return cpumask; -} - -/* - * The BAU message interrupt comes here. (registered by set_intr_gate) - * See entry_64.S - * - * We received a broadcast assist message. 
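For context on how the contract documented above is consumed: the x86 TLB flush path hands its cpumask to uv_flush_tlb_others() with preemption disabled and falls back to conventional IPIs only for whatever the BAU did not cover (a NULL return means nothing is left to do). A rough sketch of such a caller, assuming a fallback helper named flush_tlb_others_ipi() -- this illustrates the calling convention, not the exact in-tree caller:

static void example_flush_tlb_others(const struct cpumask *cpumask,
				     struct mm_struct *mm, unsigned long va)
{
	unsigned int cpu = get_cpu();		/* holds off preemption */
	const struct cpumask *remaining;

	/* NULL means the BAU broadcast reached every remote cpu */
	remaining = uv_flush_tlb_others(cpumask, mm, va, cpu);
	if (remaining)
		flush_tlb_others_ipi(remaining, mm, va);  /* assumed IPI fallback */
	put_cpu();
}

Note that booting with 'nobau' makes uv_flush_tlb_others() return the incoming mask untouched, so in that case the fallback path handles every remote cpu.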
- * - * Interrupts are disabled; this interrupt could represent - * the receipt of several messages. - * - * All cores/threads on this hub get this interrupt. - * The last one to see it does the software ack. - * (the resource will not be freed until noninterruptable cpus see this - * interrupt; hardware may timeout the s/w ack and reply ERROR) - */ -void uv_bau_message_interrupt(struct pt_regs *regs) -{ - int count = 0; - cycles_t time_start; - struct bau_payload_queue_entry *msg; - struct bau_control *bcp; - struct ptc_stats *stat; - struct msg_desc msgdesc; - - time_start = get_cycles(); - bcp = &per_cpu(bau_control, smp_processor_id()); - stat = bcp->statp; - msgdesc.va_queue_first = bcp->va_queue_first; - msgdesc.va_queue_last = bcp->va_queue_last; - msg = bcp->bau_msg_head; - while (msg->sw_ack_vector) { - count++; - msgdesc.msg_slot = msg - msgdesc.va_queue_first; - msgdesc.sw_ack_slot = ffs(msg->sw_ack_vector) - 1; - msgdesc.msg = msg; - uv_bau_process_message(&msgdesc, bcp); - msg++; - if (msg > msgdesc.va_queue_last) - msg = msgdesc.va_queue_first; - bcp->bau_msg_head = msg; - } - stat->d_time += (get_cycles() - time_start); - if (!count) - stat->d_nomsg++; - else if (count > 1) - stat->d_multmsg++; - ack_APIC_irq(); -} - -/* - * uv_enable_timeouts - * - * Each target uvhub (i.e. a uvhub that has no cpu's) needs to have - * shootdown message timeouts enabled. The timeout does not cause - * an interrupt, but causes an error message to be returned to - * the sender. - */ -static void uv_enable_timeouts(void) -{ - int uvhub; - int nuvhubs; - int pnode; - unsigned long mmr_image; - - nuvhubs = uv_num_possible_blades(); - - for (uvhub = 0; uvhub < nuvhubs; uvhub++) { - if (!uv_blade_nr_possible_cpus(uvhub)) - continue; - - pnode = uv_blade_to_pnode(uvhub); - mmr_image = - uv_read_global_mmr64(pnode, UVH_LB_BAU_MISC_CONTROL); - /* - * Set the timeout period and then lock it in, in three - * steps; captures and locks in the period. - * - * To program the period, the SOFT_ACK_MODE must be off. - */ - mmr_image &= ~((unsigned long)1 << - UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT); - uv_write_global_mmr64 - (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image); - /* - * Set the 4-bit period. - */ - mmr_image &= ~((unsigned long)0xf << - UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT); - mmr_image |= (UV_INTD_SOFT_ACK_TIMEOUT_PERIOD << - UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT); - uv_write_global_mmr64 - (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image); - /* - * Subsequent reversals of the timebase bit (3) cause an - * immediate timeout of one or all INTD resources as - * indicated in bits 2:0 (7 causes all of them to timeout). 
- */ - mmr_image |= ((unsigned long)1 << - UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT); - uv_write_global_mmr64 - (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image); - } -} - -static void *uv_ptc_seq_start(struct seq_file *file, loff_t *offset) -{ - if (*offset < num_possible_cpus()) - return offset; - return NULL; -} - -static void *uv_ptc_seq_next(struct seq_file *file, void *data, loff_t *offset) -{ - (*offset)++; - if (*offset < num_possible_cpus()) - return offset; - return NULL; -} - -static void uv_ptc_seq_stop(struct seq_file *file, void *data) -{ -} - -static inline unsigned long long -microsec_2_cycles(unsigned long microsec) -{ - unsigned long ns; - unsigned long long cyc; - - ns = microsec * 1000; - cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id())); - return cyc; -} - -/* - * Display the statistics thru /proc. - * 'data' points to the cpu number - */ -static int uv_ptc_seq_show(struct seq_file *file, void *data) -{ - struct ptc_stats *stat; - int cpu; - - cpu = *(loff_t *)data; - - if (!cpu) { - seq_printf(file, - "# cpu sent stime self locals remotes ncpus localhub "); - seq_printf(file, - "remotehub numuvhubs numuvhubs16 numuvhubs8 "); - seq_printf(file, - "numuvhubs4 numuvhubs2 numuvhubs1 dto "); - seq_printf(file, - "retries rok resetp resett giveup sto bz throt "); - seq_printf(file, - "sw_ack recv rtime all "); - seq_printf(file, - "one mult none retry canc nocan reset rcan "); - seq_printf(file, - "disable enable\n"); - } - if (cpu < num_possible_cpus() && cpu_online(cpu)) { - stat = &per_cpu(ptcstats, cpu); - /* source side statistics */ - seq_printf(file, - "cpu %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ", - cpu, stat->s_requestor, cycles_2_us(stat->s_time), - stat->s_ntargself, stat->s_ntarglocals, - stat->s_ntargremotes, stat->s_ntargcpu, - stat->s_ntarglocaluvhub, stat->s_ntargremoteuvhub, - stat->s_ntarguvhub, stat->s_ntarguvhub16); - seq_printf(file, "%ld %ld %ld %ld %ld ", - stat->s_ntarguvhub8, stat->s_ntarguvhub4, - stat->s_ntarguvhub2, stat->s_ntarguvhub1, - stat->s_dtimeout); - seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld ", - stat->s_retry_messages, stat->s_retriesok, - stat->s_resets_plug, stat->s_resets_timeout, - stat->s_giveup, stat->s_stimeout, - stat->s_busy, stat->s_throttles); - - /* destination side statistics */ - seq_printf(file, - "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ", - uv_read_global_mmr64(uv_cpu_to_pnode(cpu), - UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE), - stat->d_requestee, cycles_2_us(stat->d_time), - stat->d_alltlb, stat->d_onetlb, stat->d_multmsg, - stat->d_nomsg, stat->d_retries, stat->d_canceled, - stat->d_nocanceled, stat->d_resets, - stat->d_rcanceled); - seq_printf(file, "%ld %ld\n", - stat->s_bau_disabled, stat->s_bau_reenabled); - } - - return 0; -} - -/* - * Display the tunables thru debugfs - */ -static ssize_t tunables_read(struct file *file, char __user *userbuf, - size_t count, loff_t *ppos) -{ - char *buf; - int ret; - - buf = kasprintf(GFP_KERNEL, "%s %s %s\n%d %d %d %d %d %d %d %d %d\n", - "max_bau_concurrent plugged_delay plugsb4reset", - "timeoutsb4reset ipi_reset_limit complete_threshold", - "congested_response_us congested_reps congested_period", - max_bau_concurrent, plugged_delay, plugsb4reset, - timeoutsb4reset, ipi_reset_limit, complete_threshold, - congested_response_us, congested_reps, congested_period); - - if (!buf) - return -ENOMEM; - - ret = simple_read_from_buffer(userbuf, count, ppos, buf, strlen(buf)); - kfree(buf); - return ret; -} - -/* - * -1: resetf the 
statistics - * 0: display meaning of the statistics - */ -static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user, - size_t count, loff_t *data) -{ - int cpu; - long input_arg; - char optstr[64]; - struct ptc_stats *stat; - - if (count == 0 || count > sizeof(optstr)) - return -EINVAL; - if (copy_from_user(optstr, user, count)) - return -EFAULT; - optstr[count - 1] = '\0'; - if (strict_strtol(optstr, 10, &input_arg) < 0) { - printk(KERN_DEBUG "%s is invalid\n", optstr); - return -EINVAL; - } - - if (input_arg == 0) { - printk(KERN_DEBUG "# cpu: cpu number\n"); - printk(KERN_DEBUG "Sender statistics:\n"); - printk(KERN_DEBUG - "sent: number of shootdown messages sent\n"); - printk(KERN_DEBUG - "stime: time spent sending messages\n"); - printk(KERN_DEBUG - "numuvhubs: number of hubs targeted with shootdown\n"); - printk(KERN_DEBUG - "numuvhubs16: number times 16 or more hubs targeted\n"); - printk(KERN_DEBUG - "numuvhubs8: number times 8 or more hubs targeted\n"); - printk(KERN_DEBUG - "numuvhubs4: number times 4 or more hubs targeted\n"); - printk(KERN_DEBUG - "numuvhubs2: number times 2 or more hubs targeted\n"); - printk(KERN_DEBUG - "numuvhubs1: number times 1 hub targeted\n"); - printk(KERN_DEBUG - "numcpus: number of cpus targeted with shootdown\n"); - printk(KERN_DEBUG - "dto: number of destination timeouts\n"); - printk(KERN_DEBUG - "retries: destination timeout retries sent\n"); - printk(KERN_DEBUG - "rok: : destination timeouts successfully retried\n"); - printk(KERN_DEBUG - "resetp: ipi-style resource resets for plugs\n"); - printk(KERN_DEBUG - "resett: ipi-style resource resets for timeouts\n"); - printk(KERN_DEBUG - "giveup: fall-backs to ipi-style shootdowns\n"); - printk(KERN_DEBUG - "sto: number of source timeouts\n"); - printk(KERN_DEBUG - "bz: number of stay-busy's\n"); - printk(KERN_DEBUG - "throt: number times spun in throttle\n"); - printk(KERN_DEBUG "Destination side statistics:\n"); - printk(KERN_DEBUG - "sw_ack: image of UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE\n"); - printk(KERN_DEBUG - "recv: shootdown messages received\n"); - printk(KERN_DEBUG - "rtime: time spent processing messages\n"); - printk(KERN_DEBUG - "all: shootdown all-tlb messages\n"); - printk(KERN_DEBUG - "one: shootdown one-tlb messages\n"); - printk(KERN_DEBUG - "mult: interrupts that found multiple messages\n"); - printk(KERN_DEBUG - "none: interrupts that found no messages\n"); - printk(KERN_DEBUG - "retry: number of retry messages processed\n"); - printk(KERN_DEBUG - "canc: number messages canceled by retries\n"); - printk(KERN_DEBUG - "nocan: number retries that found nothing to cancel\n"); - printk(KERN_DEBUG - "reset: number of ipi-style reset requests processed\n"); - printk(KERN_DEBUG - "rcan: number messages canceled by reset requests\n"); - printk(KERN_DEBUG - "disable: number times use of the BAU was disabled\n"); - printk(KERN_DEBUG - "enable: number times use of the BAU was re-enabled\n"); - } else if (input_arg == -1) { - for_each_present_cpu(cpu) { - stat = &per_cpu(ptcstats, cpu); - memset(stat, 0, sizeof(struct ptc_stats)); - } - } - - return count; -} - -static int local_atoi(const char *name) -{ - int val = 0; - - for (;; name++) { - switch (*name) { - case '0' ... 
'9': - val = 10*val+(*name-'0'); - break; - default: - return val; - } - } -} - -/* - * set the tunables - * 0 values reset them to defaults - */ -static ssize_t tunables_write(struct file *file, const char __user *user, - size_t count, loff_t *data) -{ - int cpu; - int cnt = 0; - int val; - char *p; - char *q; - char instr[64]; - struct bau_control *bcp; - - if (count == 0 || count > sizeof(instr)-1) - return -EINVAL; - if (copy_from_user(instr, user, count)) - return -EFAULT; - - instr[count] = '\0'; - /* count the fields */ - p = instr + strspn(instr, WHITESPACE); - q = p; - for (; *p; p = q + strspn(q, WHITESPACE)) { - q = p + strcspn(p, WHITESPACE); - cnt++; - if (q == p) - break; - } - if (cnt != 9) { - printk(KERN_INFO "bau tunable error: should be 9 numbers\n"); - return -EINVAL; - } - - p = instr + strspn(instr, WHITESPACE); - q = p; - for (cnt = 0; *p; p = q + strspn(q, WHITESPACE), cnt++) { - q = p + strcspn(p, WHITESPACE); - val = local_atoi(p); - switch (cnt) { - case 0: - if (val == 0) { - max_bau_concurrent = MAX_BAU_CONCURRENT; - max_bau_concurrent_constant = - MAX_BAU_CONCURRENT; - continue; - } - bcp = &per_cpu(bau_control, smp_processor_id()); - if (val < 1 || val > bcp->cpus_in_uvhub) { - printk(KERN_DEBUG - "Error: BAU max concurrent %d is invalid\n", - val); - return -EINVAL; - } - max_bau_concurrent = val; - max_bau_concurrent_constant = val; - continue; - case 1: - if (val == 0) - plugged_delay = PLUGGED_DELAY; - else - plugged_delay = val; - continue; - case 2: - if (val == 0) - plugsb4reset = PLUGSB4RESET; - else - plugsb4reset = val; - continue; - case 3: - if (val == 0) - timeoutsb4reset = TIMEOUTSB4RESET; - else - timeoutsb4reset = val; - continue; - case 4: - if (val == 0) - ipi_reset_limit = IPI_RESET_LIMIT; - else - ipi_reset_limit = val; - continue; - case 5: - if (val == 0) - complete_threshold = COMPLETE_THRESHOLD; - else - complete_threshold = val; - continue; - case 6: - if (val == 0) - congested_response_us = CONGESTED_RESPONSE_US; - else - congested_response_us = val; - continue; - case 7: - if (val == 0) - congested_reps = CONGESTED_REPS; - else - congested_reps = val; - continue; - case 8: - if (val == 0) - congested_period = CONGESTED_PERIOD; - else - congested_period = val; - continue; - } - if (q == p) - break; - } - for_each_present_cpu(cpu) { - bcp = &per_cpu(bau_control, cpu); - bcp->max_bau_concurrent = max_bau_concurrent; - bcp->max_bau_concurrent_constant = max_bau_concurrent; - bcp->plugged_delay = plugged_delay; - bcp->plugsb4reset = plugsb4reset; - bcp->timeoutsb4reset = timeoutsb4reset; - bcp->ipi_reset_limit = ipi_reset_limit; - bcp->complete_threshold = complete_threshold; - bcp->congested_response_us = congested_response_us; - bcp->congested_reps = congested_reps; - bcp->congested_period = congested_period; - } - return count; -} - -static const struct seq_operations uv_ptc_seq_ops = { - .start = uv_ptc_seq_start, - .next = uv_ptc_seq_next, - .stop = uv_ptc_seq_stop, - .show = uv_ptc_seq_show -}; - -static int uv_ptc_proc_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &uv_ptc_seq_ops); -} - -static int tunables_open(struct inode *inode, struct file *file) -{ - return 0; -} - -static const struct file_operations proc_uv_ptc_operations = { - .open = uv_ptc_proc_open, - .read = seq_read, - .write = uv_ptc_proc_write, - .llseek = seq_lseek, - .release = seq_release, -}; - -static const struct file_operations tunables_fops = { - .open = tunables_open, - .read = tunables_read, - .write = tunables_write, - .llseek = 
default_llseek, -}; - -static int __init uv_ptc_init(void) -{ - struct proc_dir_entry *proc_uv_ptc; - - if (!is_uv_system()) - return 0; - - proc_uv_ptc = proc_create(UV_PTC_BASENAME, 0444, NULL, - &proc_uv_ptc_operations); - if (!proc_uv_ptc) { - printk(KERN_ERR "unable to create %s proc entry\n", - UV_PTC_BASENAME); - return -EINVAL; - } - - tunables_dir = debugfs_create_dir(UV_BAU_TUNABLES_DIR, NULL); - if (!tunables_dir) { - printk(KERN_ERR "unable to create debugfs directory %s\n", - UV_BAU_TUNABLES_DIR); - return -EINVAL; - } - tunables_file = debugfs_create_file(UV_BAU_TUNABLES_FILE, 0600, - tunables_dir, NULL, &tunables_fops); - if (!tunables_file) { - printk(KERN_ERR "unable to create debugfs file %s\n", - UV_BAU_TUNABLES_FILE); - return -EINVAL; - } - return 0; -} - -/* - * initialize the sending side's sending buffers - */ -static void -uv_activation_descriptor_init(int node, int pnode) -{ - int i; - int cpu; - unsigned long pa; - unsigned long m; - unsigned long n; - struct bau_desc *bau_desc; - struct bau_desc *bd2; - struct bau_control *bcp; - - /* - * each bau_desc is 64 bytes; there are 8 (UV_ITEMS_PER_DESCRIPTOR) - * per cpu; and up to 32 (UV_ADP_SIZE) cpu's per uvhub - */ - bau_desc = (struct bau_desc *)kmalloc_node(sizeof(struct bau_desc)* - UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node); - BUG_ON(!bau_desc); - - pa = uv_gpa(bau_desc); /* need the real nasid*/ - n = pa >> uv_nshift; - m = pa & uv_mmask; - - uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE, - (n << UV_DESC_BASE_PNODE_SHIFT | m)); - - /* - * initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each - * cpu even though we only use the first one; one descriptor can - * describe a broadcast to 256 uv hubs. - */ - for (i = 0, bd2 = bau_desc; i < (UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR); - i++, bd2++) { - memset(bd2, 0, sizeof(struct bau_desc)); - bd2->header.sw_ack_flag = 1; - /* - * base_dest_nodeid is the nasid (pnode<<1) of the first uvhub - * in the partition. The bit map will indicate uvhub numbers, - * which are 0-N in a partition. Pnodes are unique system-wide. 
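(As a concrete illustration of the nasid encoding: if the lowest pnode in the partition is 3, the assignment in the hunk below programs base_dest_nodeid = 3 << 1 = 6.)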
- */ - bd2->header.base_dest_nodeid = uv_partition_base_pnode << 1; - bd2->header.dest_subnodeid = 0x10; /* the LB */ - bd2->header.command = UV_NET_ENDPOINT_INTD; - bd2->header.int_both = 1; - /* - * all others need to be set to zero: - * fairness chaining multilevel count replied_to - */ - } - for_each_present_cpu(cpu) { - if (pnode != uv_blade_to_pnode(uv_cpu_to_blade_id(cpu))) - continue; - bcp = &per_cpu(bau_control, cpu); - bcp->descriptor_base = bau_desc; - } -} - -/* - * initialize the destination side's receiving buffers - * entered for each uvhub in the partition - * - node is first node (kernel memory notion) on the uvhub - * - pnode is the uvhub's physical identifier - */ -static void -uv_payload_queue_init(int node, int pnode) -{ - int pn; - int cpu; - char *cp; - unsigned long pa; - struct bau_payload_queue_entry *pqp; - struct bau_payload_queue_entry *pqp_malloc; - struct bau_control *bcp; - - pqp = (struct bau_payload_queue_entry *) kmalloc_node( - (DEST_Q_SIZE + 1) * sizeof(struct bau_payload_queue_entry), - GFP_KERNEL, node); - BUG_ON(!pqp); - pqp_malloc = pqp; - - cp = (char *)pqp + 31; - pqp = (struct bau_payload_queue_entry *)(((unsigned long)cp >> 5) << 5); - - for_each_present_cpu(cpu) { - if (pnode != uv_cpu_to_pnode(cpu)) - continue; - /* for every cpu on this pnode: */ - bcp = &per_cpu(bau_control, cpu); - bcp->va_queue_first = pqp; - bcp->bau_msg_head = pqp; - bcp->va_queue_last = pqp + (DEST_Q_SIZE - 1); - } - /* - * need the pnode of where the memory was really allocated - */ - pa = uv_gpa(pqp); - pn = pa >> uv_nshift; - uv_write_global_mmr64(pnode, - UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST, - ((unsigned long)pn << UV_PAYLOADQ_PNODE_SHIFT) | - uv_physnodeaddr(pqp)); - uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL, - uv_physnodeaddr(pqp)); - uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST, - (unsigned long) - uv_physnodeaddr(pqp + (DEST_Q_SIZE - 1))); - /* in effect, all msg_type's are set to MSG_NOOP */ - memset(pqp, 0, sizeof(struct bau_payload_queue_entry) * DEST_Q_SIZE); -} - -/* - * Initialization of each UV hub's structures - */ -static void __init uv_init_uvhub(int uvhub, int vector) -{ - int node; - int pnode; - unsigned long apicid; - - node = uvhub_to_first_node(uvhub); - pnode = uv_blade_to_pnode(uvhub); - uv_activation_descriptor_init(node, pnode); - uv_payload_queue_init(node, pnode); - /* - * the below initialization can't be in firmware because the - * messaging IRQ will be determined by the OS - */ - apicid = uvhub_to_first_apicid(uvhub); - uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG, - ((apicid << 32) | vector)); -} - -/* - * We will set BAU_MISC_CONTROL with a timeout period. - * But the BIOS has set UVH_AGING_PRESCALE_SEL and UVH_TRANSACTION_TIMEOUT. - * So the destination timeout period has be be calculated from them. 
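As a worked illustration of the arithmetic done by calculate_destination_timeout() below -- the register field values here are purely hypothetical, not what any particular BIOS programs:

/*
 * Hypothetical example: urgency7 index 2 selects 1280 ns from
 * timeout_base_ns[], and both multipliers read back as 10.
 */
static int example_destination_timeout_us(void)
{
	int base = 1280;		/* timeout_base_ns[2] */
	int mult1 = 10;			/* assumed soft-ack timeout multiplier */
	int mult2 = 10;			/* assumed UVH_TRANSACTION_TIMEOUT multiplier */
	unsigned long ts_ns = base * mult1 * mult2;	/* 128000 ns */

	return ts_ns / 1000;		/* 128 microsecond destination timeout */
}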
- */ -static int -calculate_destination_timeout(void) -{ - unsigned long mmr_image; - int mult1; - int mult2; - int index; - int base; - int ret; - unsigned long ts_ns; - - mult1 = UV_INTD_SOFT_ACK_TIMEOUT_PERIOD & BAU_MISC_CONTROL_MULT_MASK; - mmr_image = uv_read_local_mmr(UVH_AGING_PRESCALE_SEL); - index = (mmr_image >> BAU_URGENCY_7_SHIFT) & BAU_URGENCY_7_MASK; - mmr_image = uv_read_local_mmr(UVH_TRANSACTION_TIMEOUT); - mult2 = (mmr_image >> BAU_TRANS_SHIFT) & BAU_TRANS_MASK; - base = timeout_base_ns[index]; - ts_ns = base * mult1 * mult2; - ret = ts_ns / 1000; - return ret; -} - -/* - * initialize the bau_control structure for each cpu - */ -static void __init uv_init_per_cpu(int nuvhubs) -{ - int i; - int cpu; - int pnode; - int uvhub; - int have_hmaster; - short socket = 0; - unsigned short socket_mask; - unsigned char *uvhub_mask; - struct bau_control *bcp; - struct uvhub_desc *bdp; - struct socket_desc *sdp; - struct bau_control *hmaster = NULL; - struct bau_control *smaster = NULL; - struct socket_desc { - short num_cpus; - short cpu_number[16]; - }; - struct uvhub_desc { - unsigned short socket_mask; - short num_cpus; - short uvhub; - short pnode; - struct socket_desc socket[2]; - }; - struct uvhub_desc *uvhub_descs; - - timeout_us = calculate_destination_timeout(); - - uvhub_descs = (struct uvhub_desc *) - kmalloc(nuvhubs * sizeof(struct uvhub_desc), GFP_KERNEL); - memset(uvhub_descs, 0, nuvhubs * sizeof(struct uvhub_desc)); - uvhub_mask = kzalloc((nuvhubs+7)/8, GFP_KERNEL); - for_each_present_cpu(cpu) { - bcp = &per_cpu(bau_control, cpu); - memset(bcp, 0, sizeof(struct bau_control)); - pnode = uv_cpu_hub_info(cpu)->pnode; - uvhub = uv_cpu_hub_info(cpu)->numa_blade_id; - *(uvhub_mask + (uvhub/8)) |= (1 << (uvhub%8)); - bdp = &uvhub_descs[uvhub]; - bdp->num_cpus++; - bdp->uvhub = uvhub; - bdp->pnode = pnode; - /* kludge: 'assuming' one node per socket, and assuming that - disabling a socket just leaves a gap in node numbers */ - socket = (cpu_to_node(cpu) & 1); - bdp->socket_mask |= (1 << socket); - sdp = &bdp->socket[socket]; - sdp->cpu_number[sdp->num_cpus] = cpu; - sdp->num_cpus++; - } - for (uvhub = 0; uvhub < nuvhubs; uvhub++) { - if (!(*(uvhub_mask + (uvhub/8)) & (1 << (uvhub%8)))) - continue; - have_hmaster = 0; - bdp = &uvhub_descs[uvhub]; - socket_mask = bdp->socket_mask; - socket = 0; - while (socket_mask) { - if (!(socket_mask & 1)) - goto nextsocket; - sdp = &bdp->socket[socket]; - for (i = 0; i < sdp->num_cpus; i++) { - cpu = sdp->cpu_number[i]; - bcp = &per_cpu(bau_control, cpu); - bcp->cpu = cpu; - if (i == 0) { - smaster = bcp; - if (!have_hmaster) { - have_hmaster++; - hmaster = bcp; - } - } - bcp->cpus_in_uvhub = bdp->num_cpus; - bcp->cpus_in_socket = sdp->num_cpus; - bcp->socket_master = smaster; - bcp->uvhub = bdp->uvhub; - bcp->uvhub_master = hmaster; - bcp->uvhub_cpu = uv_cpu_hub_info(cpu)-> - blade_processor_id; - } -nextsocket: - socket++; - socket_mask = (socket_mask >> 1); - } - } - kfree(uvhub_descs); - kfree(uvhub_mask); - for_each_present_cpu(cpu) { - bcp = &per_cpu(bau_control, cpu); - bcp->baudisabled = 0; - bcp->statp = &per_cpu(ptcstats, cpu); - /* time interval to catch a hardware stay-busy bug */ - bcp->timeout_interval = microsec_2_cycles(2*timeout_us); - bcp->max_bau_concurrent = max_bau_concurrent; - bcp->max_bau_concurrent_constant = max_bau_concurrent; - bcp->plugged_delay = plugged_delay; - bcp->plugsb4reset = plugsb4reset; - bcp->timeoutsb4reset = timeoutsb4reset; - bcp->ipi_reset_limit = ipi_reset_limit; - bcp->complete_threshold = 
complete_threshold; - bcp->congested_response_us = congested_response_us; - bcp->congested_reps = congested_reps; - bcp->congested_period = congested_period; - } -} - -/* - * Initialization of BAU-related structures - */ -static int __init uv_bau_init(void) -{ - int uvhub; - int pnode; - int nuvhubs; - int cur_cpu; - int vector; - unsigned long mmr; - - if (!is_uv_system()) - return 0; - - if (nobau) - return 0; - - for_each_possible_cpu(cur_cpu) - zalloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu), - GFP_KERNEL, cpu_to_node(cur_cpu)); - - uv_nshift = uv_hub_info->m_val; - uv_mmask = (1UL << uv_hub_info->m_val) - 1; - nuvhubs = uv_num_possible_blades(); - spin_lock_init(&disable_lock); - congested_cycles = microsec_2_cycles(congested_response_us); - - uv_init_per_cpu(nuvhubs); - - uv_partition_base_pnode = 0x7fffffff; - for (uvhub = 0; uvhub < nuvhubs; uvhub++) - if (uv_blade_nr_possible_cpus(uvhub) && - (uv_blade_to_pnode(uvhub) < uv_partition_base_pnode)) - uv_partition_base_pnode = uv_blade_to_pnode(uvhub); - - vector = UV_BAU_MESSAGE; - for_each_possible_blade(uvhub) - if (uv_blade_nr_possible_cpus(uvhub)) - uv_init_uvhub(uvhub, vector); - - uv_enable_timeouts(); - alloc_intr_gate(vector, uv_bau_message_intr1); - - for_each_possible_blade(uvhub) { - if (uv_blade_nr_possible_cpus(uvhub)) { - pnode = uv_blade_to_pnode(uvhub); - /* INIT the bau */ - uv_write_global_mmr64(pnode, - UVH_LB_BAU_SB_ACTIVATION_CONTROL, - ((unsigned long)1 << 63)); - mmr = 1; /* should be 1 to broadcast to both sockets */ - uv_write_global_mmr64(pnode, UVH_BAU_DATA_BROADCAST, - mmr); - } - } - - return 0; -} -core_initcall(uv_bau_init); -fs_initcall(uv_ptc_init); diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/kernel/uv_irq.c deleted file mode 100644 index 7b24460917d..00000000000 --- a/arch/x86/kernel/uv_irq.c +++ /dev/null @@ -1,285 +0,0 @@ -/* - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - * - * SGI UV IRQ functions - * - * Copyright (C) 2008 Silicon Graphics, Inc. All rights reserved. - */ - -#include -#include -#include -#include - -#include -#include -#include - -/* MMR offset and pnode of hub sourcing interrupts for a given irq */ -struct uv_irq_2_mmr_pnode{ - struct rb_node list; - unsigned long offset; - int pnode; - int irq; -}; - -static spinlock_t uv_irq_lock; -static struct rb_root uv_irq_root; - -static int uv_set_irq_affinity(struct irq_data *, const struct cpumask *, bool); - -static void uv_noop(struct irq_data *data) { } - -static void uv_ack_apic(struct irq_data *data) -{ - ack_APIC_irq(); -} - -static struct irq_chip uv_irq_chip = { - .name = "UV-CORE", - .irq_mask = uv_noop, - .irq_unmask = uv_noop, - .irq_eoi = uv_ack_apic, - .irq_set_affinity = uv_set_irq_affinity, -}; - -/* - * Add offset and pnode information of the hub sourcing interrupts to the - * rb tree for a specific irq. 
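uv_set_irq_2_mmr_info(), just below, keeps the irq -> (MMR offset, pnode) mapping in an rbtree keyed by irq, updating the entry in place when the irq is already present. The descend-then-link shape of that insert can be sketched in user space with an ordinary binary search tree standing in for the kernel rbtree primitives:

#include <stdlib.h>

/* Plain BST keyed by irq; the kernel code uses an rbtree, but the
 * descent and duplicate-update logic has the same shape. */
struct irq2mmr {
    struct irq2mmr *left, *right;
    int irq;
    unsigned long offset;
    int pnode;
};

static int irq2mmr_insert(struct irq2mmr **root, int irq,
                          unsigned long offset, int pnode)
{
    struct irq2mmr **link = root, *n;

    while (*link) {
        struct irq2mmr *e = *link;

        if (irq == e->irq) {            /* entry exists: update in place */
            e->offset = offset;
            e->pnode = pnode;
            return 0;
        }
        link = irq < e->irq ? &e->left : &e->right;
    }

    n = calloc(1, sizeof(*n));
    if (!n)
        return -1;
    n->irq = irq;
    n->offset = offset;
    n->pnode = pnode;
    *link = n;          /* kernel: rb_link_node() + rb_insert_color() */
    return 0;
}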
- */ -static int uv_set_irq_2_mmr_info(int irq, unsigned long offset, unsigned blade) -{ - struct rb_node **link = &uv_irq_root.rb_node; - struct rb_node *parent = NULL; - struct uv_irq_2_mmr_pnode *n; - struct uv_irq_2_mmr_pnode *e; - unsigned long irqflags; - - n = kmalloc_node(sizeof(struct uv_irq_2_mmr_pnode), GFP_KERNEL, - uv_blade_to_memory_nid(blade)); - if (!n) - return -ENOMEM; - - n->irq = irq; - n->offset = offset; - n->pnode = uv_blade_to_pnode(blade); - spin_lock_irqsave(&uv_irq_lock, irqflags); - /* Find the right place in the rbtree: */ - while (*link) { - parent = *link; - e = rb_entry(parent, struct uv_irq_2_mmr_pnode, list); - - if (unlikely(irq == e->irq)) { - /* irq entry exists */ - e->pnode = uv_blade_to_pnode(blade); - e->offset = offset; - spin_unlock_irqrestore(&uv_irq_lock, irqflags); - kfree(n); - return 0; - } - - if (irq < e->irq) - link = &(*link)->rb_left; - else - link = &(*link)->rb_right; - } - - /* Insert the node into the rbtree. */ - rb_link_node(&n->list, parent, link); - rb_insert_color(&n->list, &uv_irq_root); - - spin_unlock_irqrestore(&uv_irq_lock, irqflags); - return 0; -} - -/* Retrieve offset and pnode information from the rb tree for a specific irq */ -int uv_irq_2_mmr_info(int irq, unsigned long *offset, int *pnode) -{ - struct uv_irq_2_mmr_pnode *e; - struct rb_node *n; - unsigned long irqflags; - - spin_lock_irqsave(&uv_irq_lock, irqflags); - n = uv_irq_root.rb_node; - while (n) { - e = rb_entry(n, struct uv_irq_2_mmr_pnode, list); - - if (e->irq == irq) { - *offset = e->offset; - *pnode = e->pnode; - spin_unlock_irqrestore(&uv_irq_lock, irqflags); - return 0; - } - - if (irq < e->irq) - n = n->rb_left; - else - n = n->rb_right; - } - spin_unlock_irqrestore(&uv_irq_lock, irqflags); - return -1; -} - -/* - * Re-target the irq to the specified CPU and enable the specified MMR located - * on the specified blade to allow the sending of MSIs to the specified CPU. - */ -static int -arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, - unsigned long mmr_offset, int limit) -{ - const struct cpumask *eligible_cpu = cpumask_of(cpu); - struct irq_cfg *cfg = get_irq_chip_data(irq); - unsigned long mmr_value; - struct uv_IO_APIC_route_entry *entry; - int mmr_pnode, err; - - BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != - sizeof(unsigned long)); - - err = assign_irq_vector(irq, cfg, eligible_cpu); - if (err != 0) - return err; - - if (limit == UV_AFFINITY_CPU) - irq_set_status_flags(irq, IRQ_NO_BALANCING); - else - irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); - - set_irq_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq, - irq_name); - - mmr_value = 0; - entry = (struct uv_IO_APIC_route_entry *)&mmr_value; - entry->vector = cfg->vector; - entry->delivery_mode = apic->irq_delivery_mode; - entry->dest_mode = apic->irq_dest_mode; - entry->polarity = 0; - entry->trigger = 0; - entry->mask = 0; - entry->dest = apic->cpu_mask_to_apicid(eligible_cpu); - - mmr_pnode = uv_blade_to_pnode(mmr_blade); - uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); - - if (cfg->move_in_progress) - send_cleanup_vector(cfg); - - return irq; -} - -/* - * Disable the specified MMR located on the specified blade so that MSIs are - * longer allowed to be sent. 
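arch_disable_uv_irq(), just below, builds the 64-bit MMR image by overlaying a uv_IO_APIC_route_entry on an unsigned long and setting only the mask bit; the BUILD_BUG_ON guards that the overlay really is one register wide. A user-space sketch of the same overlay trick; the bitfield layout here is invented for illustration and is not the real route-entry format:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Invented layout for illustration -- not the real uv_IO_APIC_route_entry. */
struct route_entry {
    uint64_t vector        : 8;
    uint64_t delivery_mode : 3;
    uint64_t dest_mode     : 1;
    uint64_t polarity      : 1;
    uint64_t trigger       : 1;
    uint64_t mask          : 1;
    uint64_t reserved      : 17;
    uint64_t dest          : 32;
};

/* Mirror of the BUILD_BUG_ON(): the overlay must be one 64-bit MMR word. */
static_assert(sizeof(struct route_entry) == sizeof(uint64_t),
              "route entry must be exactly one MMR word");

union mmr_image {
    uint64_t raw;
    struct route_entry e;
};

int main(void)
{
    union mmr_image m = { .raw = 0 };

    m.e.mask = 1;       /* a masked entry means no MSI is delivered */
    printf("disable image: 0x%llx\n", (unsigned long long)m.raw);
    return 0;
}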
- */ -static void arch_disable_uv_irq(int mmr_pnode, unsigned long mmr_offset) -{ - unsigned long mmr_value; - struct uv_IO_APIC_route_entry *entry; - - BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != - sizeof(unsigned long)); - - mmr_value = 0; - entry = (struct uv_IO_APIC_route_entry *)&mmr_value; - entry->mask = 1; - - uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); -} - -static int -uv_set_irq_affinity(struct irq_data *data, const struct cpumask *mask, - bool force) -{ - struct irq_cfg *cfg = data->chip_data; - unsigned int dest; - unsigned long mmr_value, mmr_offset; - struct uv_IO_APIC_route_entry *entry; - int mmr_pnode; - - if (__ioapic_set_affinity(data, mask, &dest)) - return -1; - - mmr_value = 0; - entry = (struct uv_IO_APIC_route_entry *)&mmr_value; - - entry->vector = cfg->vector; - entry->delivery_mode = apic->irq_delivery_mode; - entry->dest_mode = apic->irq_dest_mode; - entry->polarity = 0; - entry->trigger = 0; - entry->mask = 0; - entry->dest = dest; - - /* Get previously stored MMR and pnode of hub sourcing interrupts */ - if (uv_irq_2_mmr_info(data->irq, &mmr_offset, &mmr_pnode)) - return -1; - - uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); - - if (cfg->move_in_progress) - send_cleanup_vector(cfg); - - return 0; -} - -/* - * Set up a mapping of an available irq and vector, and enable the specified - * MMR that defines the MSI that is to be sent to the specified CPU when an - * interrupt is raised. - */ -int uv_setup_irq(char *irq_name, int cpu, int mmr_blade, - unsigned long mmr_offset, int limit) -{ - int irq, ret; - - irq = create_irq_nr(NR_IRQS_LEGACY, uv_blade_to_memory_nid(mmr_blade)); - - if (irq <= 0) - return -EBUSY; - - ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset, - limit); - if (ret == irq) - uv_set_irq_2_mmr_info(irq, mmr_offset, mmr_blade); - else - destroy_irq(irq); - - return ret; -} -EXPORT_SYMBOL_GPL(uv_setup_irq); - -/* - * Tear down a mapping of an irq and vector, and disable the specified MMR that - * defined the MSI that was to be sent to the specified CPU when an interrupt - * was raised. - * - * Set mmr_blade and mmr_offset to what was passed in on uv_setup_irq(). - */ -void uv_teardown_irq(unsigned int irq) -{ - struct uv_irq_2_mmr_pnode *e; - struct rb_node *n; - unsigned long irqflags; - - spin_lock_irqsave(&uv_irq_lock, irqflags); - n = uv_irq_root.rb_node; - while (n) { - e = rb_entry(n, struct uv_irq_2_mmr_pnode, list); - if (e->irq == irq) { - arch_disable_uv_irq(e->pnode, e->offset); - rb_erase(n, &uv_irq_root); - kfree(e); - break; - } - if (irq < e->irq) - n = n->rb_left; - else - n = n->rb_right; - } - spin_unlock_irqrestore(&uv_irq_lock, irqflags); - destroy_irq(irq); -} -EXPORT_SYMBOL_GPL(uv_teardown_irq); diff --git a/arch/x86/kernel/uv_sysfs.c b/arch/x86/kernel/uv_sysfs.c deleted file mode 100644 index 309c70fb775..00000000000 --- a/arch/x86/kernel/uv_sysfs.c +++ /dev/null @@ -1,76 +0,0 @@ -/* - * This file supports the /sys/firmware/sgi_uv interfaces for SGI UV. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved. - * Copyright (c) Russ Anderson - */ - -#include -#include -#include - -struct kobject *sgi_uv_kobj; - -static ssize_t partition_id_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - return snprintf(buf, PAGE_SIZE, "%ld\n", sn_partition_id); -} - -static ssize_t coherence_id_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - return snprintf(buf, PAGE_SIZE, "%ld\n", partition_coherence_id()); -} - -static struct kobj_attribute partition_id_attr = - __ATTR(partition_id, S_IRUGO, partition_id_show, NULL); - -static struct kobj_attribute coherence_id_attr = - __ATTR(coherence_id, S_IRUGO, coherence_id_show, NULL); - - -static int __init sgi_uv_sysfs_init(void) -{ - unsigned long ret; - - if (!is_uv_system()) - return -ENODEV; - - if (!sgi_uv_kobj) - sgi_uv_kobj = kobject_create_and_add("sgi_uv", firmware_kobj); - if (!sgi_uv_kobj) { - printk(KERN_WARNING "kobject_create_and_add sgi_uv failed\n"); - return -EINVAL; - } - - ret = sysfs_create_file(sgi_uv_kobj, &partition_id_attr.attr); - if (ret) { - printk(KERN_WARNING "sysfs_create_file partition_id failed\n"); - return ret; - } - - ret = sysfs_create_file(sgi_uv_kobj, &coherence_id_attr.attr); - if (ret) { - printk(KERN_WARNING "sysfs_create_file coherence_id failed\n"); - return ret; - } - - return 0; -} - -device_initcall(sgi_uv_sysfs_init); diff --git a/arch/x86/kernel/uv_time.c b/arch/x86/kernel/uv_time.c deleted file mode 100644 index 56e421bc379..00000000000 --- a/arch/x86/kernel/uv_time.c +++ /dev/null @@ -1,423 +0,0 @@ -/* - * SGI RTC clock/timer routines. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - * Copyright (c) 2009 Silicon Graphics, Inc. All Rights Reserved. 
- * Copyright (c) Dimitri Sivanich - */ -#include -#include - -#include -#include -#include -#include -#include -#include - -#define RTC_NAME "sgi_rtc" - -static cycle_t uv_read_rtc(struct clocksource *cs); -static int uv_rtc_next_event(unsigned long, struct clock_event_device *); -static void uv_rtc_timer_setup(enum clock_event_mode, - struct clock_event_device *); - -static struct clocksource clocksource_uv = { - .name = RTC_NAME, - .rating = 400, - .read = uv_read_rtc, - .mask = (cycle_t)UVH_RTC_REAL_TIME_CLOCK_MASK, - .shift = 10, - .flags = CLOCK_SOURCE_IS_CONTINUOUS, -}; - -static struct clock_event_device clock_event_device_uv = { - .name = RTC_NAME, - .features = CLOCK_EVT_FEAT_ONESHOT, - .shift = 20, - .rating = 400, - .irq = -1, - .set_next_event = uv_rtc_next_event, - .set_mode = uv_rtc_timer_setup, - .event_handler = NULL, -}; - -static DEFINE_PER_CPU(struct clock_event_device, cpu_ced); - -/* There is one of these allocated per node */ -struct uv_rtc_timer_head { - spinlock_t lock; - /* next cpu waiting for timer, local node relative: */ - int next_cpu; - /* number of cpus on this node: */ - int ncpus; - struct { - int lcpu; /* systemwide logical cpu number */ - u64 expires; /* next timer expiration for this cpu */ - } cpu[1]; -}; - -/* - * Access to uv_rtc_timer_head via blade id. - */ -static struct uv_rtc_timer_head **blade_info __read_mostly; - -static int uv_rtc_evt_enable; - -/* - * Hardware interface routines - */ - -/* Send IPIs to another node */ -static void uv_rtc_send_IPI(int cpu) -{ - unsigned long apicid, val; - int pnode; - - apicid = cpu_physical_id(cpu); - pnode = uv_apicid_to_pnode(apicid); - val = (1UL << UVH_IPI_INT_SEND_SHFT) | - (apicid << UVH_IPI_INT_APIC_ID_SHFT) | - (X86_PLATFORM_IPI_VECTOR << UVH_IPI_INT_VECTOR_SHFT); - - uv_write_global_mmr64(pnode, UVH_IPI_INT, val); -} - -/* Check for an RTC interrupt pending */ -static int uv_intr_pending(int pnode) -{ - return uv_read_global_mmr64(pnode, UVH_EVENT_OCCURRED0) & - UVH_EVENT_OCCURRED0_RTC1_MASK; -} - -/* Setup interrupt and return non-zero if early expiration occurred. */ -static int uv_setup_intr(int cpu, u64 expires) -{ - u64 val; - int pnode = uv_cpu_to_pnode(cpu); - - uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG, - UVH_RTC1_INT_CONFIG_M_MASK); - uv_write_global_mmr64(pnode, UVH_INT_CMPB, -1L); - - uv_write_global_mmr64(pnode, UVH_EVENT_OCCURRED0_ALIAS, - UVH_EVENT_OCCURRED0_RTC1_MASK); - - val = (X86_PLATFORM_IPI_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) | - ((u64)cpu_physical_id(cpu) << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT); - - /* Set configuration */ - uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG, val); - /* Initialize comparator value */ - uv_write_global_mmr64(pnode, UVH_INT_CMPB, expires); - - if (uv_read_rtc(NULL) <= expires) - return 0; - - return !uv_intr_pending(pnode); -} - -/* - * Per-cpu timer tracking routines - */ - -static __init void uv_rtc_deallocate_timers(void) -{ - int bid; - - for_each_possible_blade(bid) { - kfree(blade_info[bid]); - } - kfree(blade_info); -} - -/* Allocate per-node list of cpu timer expiration times. 
*/ -static __init int uv_rtc_allocate_timers(void) -{ - int cpu; - - blade_info = kmalloc(uv_possible_blades * sizeof(void *), GFP_KERNEL); - if (!blade_info) - return -ENOMEM; - memset(blade_info, 0, uv_possible_blades * sizeof(void *)); - - for_each_present_cpu(cpu) { - int nid = cpu_to_node(cpu); - int bid = uv_cpu_to_blade_id(cpu); - int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id; - struct uv_rtc_timer_head *head = blade_info[bid]; - - if (!head) { - head = kmalloc_node(sizeof(struct uv_rtc_timer_head) + - (uv_blade_nr_possible_cpus(bid) * - 2 * sizeof(u64)), - GFP_KERNEL, nid); - if (!head) { - uv_rtc_deallocate_timers(); - return -ENOMEM; - } - spin_lock_init(&head->lock); - head->ncpus = uv_blade_nr_possible_cpus(bid); - head->next_cpu = -1; - blade_info[bid] = head; - } - - head->cpu[bcpu].lcpu = cpu; - head->cpu[bcpu].expires = ULLONG_MAX; - } - - return 0; -} - -/* Find and set the next expiring timer. */ -static void uv_rtc_find_next_timer(struct uv_rtc_timer_head *head, int pnode) -{ - u64 lowest = ULLONG_MAX; - int c, bcpu = -1; - - head->next_cpu = -1; - for (c = 0; c < head->ncpus; c++) { - u64 exp = head->cpu[c].expires; - if (exp < lowest) { - bcpu = c; - lowest = exp; - } - } - if (bcpu >= 0) { - head->next_cpu = bcpu; - c = head->cpu[bcpu].lcpu; - if (uv_setup_intr(c, lowest)) - /* If we didn't set it up in time, trigger */ - uv_rtc_send_IPI(c); - } else { - uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG, - UVH_RTC1_INT_CONFIG_M_MASK); - } -} - -/* - * Set expiration time for current cpu. - * - * Returns 1 if we missed the expiration time. - */ -static int uv_rtc_set_timer(int cpu, u64 expires) -{ - int pnode = uv_cpu_to_pnode(cpu); - int bid = uv_cpu_to_blade_id(cpu); - struct uv_rtc_timer_head *head = blade_info[bid]; - int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id; - u64 *t = &head->cpu[bcpu].expires; - unsigned long flags; - int next_cpu; - - spin_lock_irqsave(&head->lock, flags); - - next_cpu = head->next_cpu; - *t = expires; - - /* Will this one be next to go off? */ - if (next_cpu < 0 || bcpu == next_cpu || - expires < head->cpu[next_cpu].expires) { - head->next_cpu = bcpu; - if (uv_setup_intr(cpu, expires)) { - *t = ULLONG_MAX; - uv_rtc_find_next_timer(head, pnode); - spin_unlock_irqrestore(&head->lock, flags); - return -ETIME; - } - } - - spin_unlock_irqrestore(&head->lock, flags); - return 0; -} - -/* - * Unset expiration time for current cpu. - * - * Returns 1 if this timer was pending. - */ -static int uv_rtc_unset_timer(int cpu, int force) -{ - int pnode = uv_cpu_to_pnode(cpu); - int bid = uv_cpu_to_blade_id(cpu); - struct uv_rtc_timer_head *head = blade_info[bid]; - int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id; - u64 *t = &head->cpu[bcpu].expires; - unsigned long flags; - int rc = 0; - - spin_lock_irqsave(&head->lock, flags); - - if ((head->next_cpu == bcpu && uv_read_rtc(NULL) >= *t) || force) - rc = 1; - - if (rc) { - *t = ULLONG_MAX; - /* Was the hardware setup for this timer? */ - if (head->next_cpu == bcpu) - uv_rtc_find_next_timer(head, pnode); - } - - spin_unlock_irqrestore(&head->lock, flags); - - return rc; -} - - -/* - * Kernel interface routines. - */ - -/* - * Read the RTC. - * - * Starting with HUB rev 2.0, the UV RTC register is replicated across all - * cachelines of it's own page. This allows faster simultaneous reads - * from a given socket. 
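uv_read_rtc(), just below, spreads concurrent readers across the replicated register page by giving each cpu on a hub its own cacheline-sized offset. A small sketch of that offset calculation, assuming a 64-byte cacheline and a 4K page:

#include <stdio.h>

#define L1_CACHE_BYTES  64      /* typical x86 cacheline      */
#define PAGE_SIZE       4096    /* replicated register page   */

/* Each cpu reads its own cacheline-aligned copy of the replicated RTC
 * register, so simultaneous reads do not contend on one cache line. */
static unsigned long rtc_read_offset(int blade_processor_id)
{
    return (blade_processor_id * L1_CACHE_BYTES) % PAGE_SIZE;
}

int main(void)
{
    int cpu;

    for (cpu = 0; cpu < 4; cpu++)
        printf("cpu %d reads UVH_RTC + 0x%lx\n", cpu, rtc_read_offset(cpu));
    return 0;
}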
- */ -static cycle_t uv_read_rtc(struct clocksource *cs) -{ - unsigned long offset; - - if (uv_get_min_hub_revision_id() == 1) - offset = 0; - else - offset = (uv_blade_processor_id() * L1_CACHE_BYTES) % PAGE_SIZE; - - return (cycle_t)uv_read_local_mmr(UVH_RTC | offset); -} - -/* - * Program the next event, relative to now - */ -static int uv_rtc_next_event(unsigned long delta, - struct clock_event_device *ced) -{ - int ced_cpu = cpumask_first(ced->cpumask); - - return uv_rtc_set_timer(ced_cpu, delta + uv_read_rtc(NULL)); -} - -/* - * Setup the RTC timer in oneshot mode - */ -static void uv_rtc_timer_setup(enum clock_event_mode mode, - struct clock_event_device *evt) -{ - int ced_cpu = cpumask_first(evt->cpumask); - - switch (mode) { - case CLOCK_EVT_MODE_PERIODIC: - case CLOCK_EVT_MODE_ONESHOT: - case CLOCK_EVT_MODE_RESUME: - /* Nothing to do here yet */ - break; - case CLOCK_EVT_MODE_UNUSED: - case CLOCK_EVT_MODE_SHUTDOWN: - uv_rtc_unset_timer(ced_cpu, 1); - break; - } -} - -static void uv_rtc_interrupt(void) -{ - int cpu = smp_processor_id(); - struct clock_event_device *ced = &per_cpu(cpu_ced, cpu); - - if (!ced || !ced->event_handler) - return; - - if (uv_rtc_unset_timer(cpu, 0) != 1) - return; - - ced->event_handler(ced); -} - -static int __init uv_enable_evt_rtc(char *str) -{ - uv_rtc_evt_enable = 1; - - return 1; -} -__setup("uvrtcevt", uv_enable_evt_rtc); - -static __init void uv_rtc_register_clockevents(struct work_struct *dummy) -{ - struct clock_event_device *ced = &__get_cpu_var(cpu_ced); - - *ced = clock_event_device_uv; - ced->cpumask = cpumask_of(smp_processor_id()); - clockevents_register_device(ced); -} - -static __init int uv_rtc_setup_clock(void) -{ - int rc; - - if (!is_uv_system()) - return -ENODEV; - - clocksource_uv.mult = clocksource_hz2mult(sn_rtc_cycles_per_second, - clocksource_uv.shift); - - /* If single blade, prefer tsc */ - if (uv_num_possible_blades() == 1) - clocksource_uv.rating = 250; - - rc = clocksource_register(&clocksource_uv); - if (rc) - printk(KERN_INFO "UV RTC clocksource failed rc %d\n", rc); - else - printk(KERN_INFO "UV RTC clocksource registered freq %lu MHz\n", - sn_rtc_cycles_per_second/(unsigned long)1E6); - - if (rc || !uv_rtc_evt_enable || x86_platform_ipi_callback) - return rc; - - /* Setup and register clockevents */ - rc = uv_rtc_allocate_timers(); - if (rc) - goto error; - - x86_platform_ipi_callback = uv_rtc_interrupt; - - clock_event_device_uv.mult = div_sc(sn_rtc_cycles_per_second, - NSEC_PER_SEC, clock_event_device_uv.shift); - - clock_event_device_uv.min_delta_ns = NSEC_PER_SEC / - sn_rtc_cycles_per_second; - - clock_event_device_uv.max_delta_ns = clocksource_uv.mask * - (NSEC_PER_SEC / sn_rtc_cycles_per_second); - - rc = schedule_on_each_cpu(uv_rtc_register_clockevents); - if (rc) { - x86_platform_ipi_callback = NULL; - uv_rtc_deallocate_timers(); - goto error; - } - - printk(KERN_INFO "UV RTC clockevents registered\n"); - - return 0; - -error: - clocksource_unregister(&clocksource_uv); - printk(KERN_INFO "UV RTC clockevents failed rc %d\n", rc); - - return rc; -} -arch_initcall(uv_rtc_setup_clock); diff --git a/arch/x86/platform/Makefile b/arch/x86/platform/Makefile index 06761ed5374..8519b01f1ac 100644 --- a/arch/x86/platform/Makefile +++ b/arch/x86/platform/Makefile @@ -4,3 +4,4 @@ obj-y += mrst/ obj-y += scx200/ obj-y += sfi/ obj-y += visws/ +obj-y += uv/ diff --git a/arch/x86/platform/uv/Makefile b/arch/x86/platform/uv/Makefile new file mode 100644 index 00000000000..6c40995fefb --- /dev/null +++ 
b/arch/x86/platform/uv/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_X86_UV) += tlb_uv.o bios_uv.o uv_irq.o uv_sysfs.o uv_time.o diff --git a/arch/x86/platform/uv/bios_uv.c b/arch/x86/platform/uv/bios_uv.c new file mode 100644 index 00000000000..8bc57baaa9a --- /dev/null +++ b/arch/x86/platform/uv/bios_uv.c @@ -0,0 +1,215 @@ +/* + * BIOS run time interface routines. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Copyright (c) 2008-2009 Silicon Graphics, Inc. All Rights Reserved. + * Copyright (c) Russ Anderson + */ + +#include +#include +#include +#include +#include + +static struct uv_systab uv_systab; + +s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5) +{ + struct uv_systab *tab = &uv_systab; + s64 ret; + + if (!tab->function) + /* + * BIOS does not support UV systab + */ + return BIOS_STATUS_UNIMPLEMENTED; + + ret = efi_call6((void *)__va(tab->function), (u64)which, + a1, a2, a3, a4, a5); + return ret; +} +EXPORT_SYMBOL_GPL(uv_bios_call); + +s64 uv_bios_call_irqsave(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, + u64 a4, u64 a5) +{ + unsigned long bios_flags; + s64 ret; + + local_irq_save(bios_flags); + ret = uv_bios_call(which, a1, a2, a3, a4, a5); + local_irq_restore(bios_flags); + + return ret; +} + +s64 uv_bios_call_reentrant(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, + u64 a4, u64 a5) +{ + s64 ret; + + preempt_disable(); + ret = uv_bios_call(which, a1, a2, a3, a4, a5); + preempt_enable(); + + return ret; +} + + +long sn_partition_id; +EXPORT_SYMBOL_GPL(sn_partition_id); +long sn_coherency_id; +EXPORT_SYMBOL_GPL(sn_coherency_id); +long sn_region_size; +EXPORT_SYMBOL_GPL(sn_region_size); +long system_serial_number; +EXPORT_SYMBOL_GPL(system_serial_number); +int uv_type; +EXPORT_SYMBOL_GPL(uv_type); + + +s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher, + long *region, long *ssn) +{ + s64 ret; + u64 v0, v1; + union partition_info_u part; + + ret = uv_bios_call_irqsave(UV_BIOS_GET_SN_INFO, fc, + (u64)(&v0), (u64)(&v1), 0, 0); + if (ret != BIOS_STATUS_SUCCESS) + return ret; + + part.val = v0; + if (uvtype) + *uvtype = part.hub_version; + if (partid) + *partid = part.partition_id; + if (coher) + *coher = part.coherence_id; + if (region) + *region = part.region_size; + if (ssn) + *ssn = v1; + return ret; +} +EXPORT_SYMBOL_GPL(uv_bios_get_sn_info); + +int +uv_bios_mq_watchlist_alloc(unsigned long addr, unsigned int mq_size, + unsigned long *intr_mmr_offset) +{ + u64 watchlist; + s64 ret; + + /* + * bios returns watchlist number or negative error number. 
+ */ + ret = (int)uv_bios_call_irqsave(UV_BIOS_WATCHLIST_ALLOC, addr, + mq_size, (u64)intr_mmr_offset, + (u64)&watchlist, 0); + if (ret < BIOS_STATUS_SUCCESS) + return ret; + + return watchlist; +} +EXPORT_SYMBOL_GPL(uv_bios_mq_watchlist_alloc); + +int +uv_bios_mq_watchlist_free(int blade, int watchlist_num) +{ + return (int)uv_bios_call_irqsave(UV_BIOS_WATCHLIST_FREE, + blade, watchlist_num, 0, 0, 0); +} +EXPORT_SYMBOL_GPL(uv_bios_mq_watchlist_free); + +s64 +uv_bios_change_memprotect(u64 paddr, u64 len, enum uv_memprotect perms) +{ + return uv_bios_call_irqsave(UV_BIOS_MEMPROTECT, paddr, len, + perms, 0, 0); +} +EXPORT_SYMBOL_GPL(uv_bios_change_memprotect); + +s64 +uv_bios_reserved_page_pa(u64 buf, u64 *cookie, u64 *addr, u64 *len) +{ + s64 ret; + + ret = uv_bios_call_irqsave(UV_BIOS_GET_PARTITION_ADDR, (u64)cookie, + (u64)addr, buf, (u64)len, 0); + return ret; +} +EXPORT_SYMBOL_GPL(uv_bios_reserved_page_pa); + +s64 uv_bios_freq_base(u64 clock_type, u64 *ticks_per_second) +{ + return uv_bios_call(UV_BIOS_FREQ_BASE, clock_type, + (u64)ticks_per_second, 0, 0, 0); +} +EXPORT_SYMBOL_GPL(uv_bios_freq_base); + +/* + * uv_bios_set_legacy_vga_target - Set Legacy VGA I/O Target + * @decode: true to enable target, false to disable target + * @domain: PCI domain number + * @bus: PCI bus number + * + * Returns: + * 0: Success + * -EINVAL: Invalid domain or bus number + * -ENOSYS: Capability not available + * -EBUSY: Legacy VGA I/O cannot be retargeted at this time + */ +int uv_bios_set_legacy_vga_target(bool decode, int domain, int bus) +{ + return uv_bios_call(UV_BIOS_SET_LEGACY_VGA_TARGET, + (u64)decode, (u64)domain, (u64)bus, 0, 0); +} +EXPORT_SYMBOL_GPL(uv_bios_set_legacy_vga_target); + + +#ifdef CONFIG_EFI +void uv_bios_init(void) +{ + struct uv_systab *tab; + + if ((efi.uv_systab == EFI_INVALID_TABLE_ADDR) || + (efi.uv_systab == (unsigned long)NULL)) { + printk(KERN_CRIT "No EFI UV System Table.\n"); + uv_systab.function = (unsigned long)NULL; + return; + } + + tab = (struct uv_systab *)ioremap(efi.uv_systab, + sizeof(struct uv_systab)); + if (strncmp(tab->signature, "UVST", 4) != 0) + printk(KERN_ERR "bad signature in UV system table!"); + + /* + * Copy table to permanent spot for later use. + */ + memcpy(&uv_systab, tab, sizeof(struct uv_systab)); + iounmap(tab); + + printk(KERN_INFO "EFI UV System Table Revision %d\n", + uv_systab.revision); +} +#else /* !CONFIG_EFI */ + +void uv_bios_init(void) { } +#endif diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c new file mode 100644 index 00000000000..20ea20a39e2 --- /dev/null +++ b/arch/x86/platform/uv/tlb_uv.c @@ -0,0 +1,1661 @@ +/* + * SGI UltraViolet TLB flush routines. + * + * (c) 2008-2010 Cliff Wickman , SGI. + * + * This code is released under the GNU General Public License version 2 or + * later. 
+ */ +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* timeouts in nanoseconds (indexed by UVH_AGING_PRESCALE_SEL urgency7 30:28) */ +static int timeout_base_ns[] = { + 20, + 160, + 1280, + 10240, + 81920, + 655360, + 5242880, + 167772160 +}; +static int timeout_us; +static int nobau; +static int baudisabled; +static spinlock_t disable_lock; +static cycles_t congested_cycles; + +/* tunables: */ +static int max_bau_concurrent = MAX_BAU_CONCURRENT; +static int max_bau_concurrent_constant = MAX_BAU_CONCURRENT; +static int plugged_delay = PLUGGED_DELAY; +static int plugsb4reset = PLUGSB4RESET; +static int timeoutsb4reset = TIMEOUTSB4RESET; +static int ipi_reset_limit = IPI_RESET_LIMIT; +static int complete_threshold = COMPLETE_THRESHOLD; +static int congested_response_us = CONGESTED_RESPONSE_US; +static int congested_reps = CONGESTED_REPS; +static int congested_period = CONGESTED_PERIOD; +static struct dentry *tunables_dir; +static struct dentry *tunables_file; + +static int __init setup_nobau(char *arg) +{ + nobau = 1; + return 0; +} +early_param("nobau", setup_nobau); + +/* base pnode in this partition */ +static int uv_partition_base_pnode __read_mostly; +/* position of pnode (which is nasid>>1): */ +static int uv_nshift __read_mostly; +static unsigned long uv_mmask __read_mostly; + +static DEFINE_PER_CPU(struct ptc_stats, ptcstats); +static DEFINE_PER_CPU(struct bau_control, bau_control); +static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask); + +/* + * Determine the first node on a uvhub. 'Nodes' are used for kernel + * memory allocation. + */ +static int __init uvhub_to_first_node(int uvhub) +{ + int node, b; + + for_each_online_node(node) { + b = uv_node_to_blade_id(node); + if (uvhub == b) + return node; + } + return -1; +} + +/* + * Determine the apicid of the first cpu on a uvhub. + */ +static int __init uvhub_to_first_apicid(int uvhub) +{ + int cpu; + + for_each_present_cpu(cpu) + if (uvhub == uv_cpu_to_blade_id(cpu)) + return per_cpu(x86_cpu_to_apicid, cpu); + return -1; +} + +/* + * Free a software acknowledge hardware resource by clearing its Pending + * bit. This will return a reply to the sender. + * If the message has timed out, a reply has already been sent by the + * hardware but the resource has not been released. In that case our + * clear of the Timeout bit (as well) will free the resource. No reply will + * be sent (the hardware will only do one reply per message). 
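uv_reply_to_message(), just below, frees a software-ack resource with a single MMR write that covers both the Pending bit (low half) and the Timeout bit (the same bit shifted up by UV_SW_ACK_NPENDING). A tiny sketch of how that clear word is composed; UV_SW_ACK_NPENDING is assumed here to be 8:

#include <stdio.h>

#define UV_SW_ACK_NPENDING  8   /* assumed number of sw-ack resources */

/* Writing a resource's bit clears its Pending bit; the same bit shifted
 * up by NPENDING clears its Timeout bit.  Both are written so the
 * resource is released whether or not it had already timed out. */
static unsigned long swack_clear_word(unsigned long sw_ack_vector)
{
    return (sw_ack_vector << UV_SW_ACK_NPENDING) | sw_ack_vector;
}

int main(void)
{
    printf("clear word for vector 0x05: 0x%lx\n", swack_clear_word(0x05));
    return 0;
}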
+ */ +static inline void uv_reply_to_message(struct msg_desc *mdp, + struct bau_control *bcp) +{ + unsigned long dw; + struct bau_payload_queue_entry *msg; + + msg = mdp->msg; + if (!msg->canceled) { + dw = (msg->sw_ack_vector << UV_SW_ACK_NPENDING) | + msg->sw_ack_vector; + uv_write_local_mmr( + UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, dw); + } + msg->replied_to = 1; + msg->sw_ack_vector = 0; +} + +/* + * Process the receipt of a RETRY message + */ +static inline void uv_bau_process_retry_msg(struct msg_desc *mdp, + struct bau_control *bcp) +{ + int i; + int cancel_count = 0; + int slot2; + unsigned long msg_res; + unsigned long mmr = 0; + struct bau_payload_queue_entry *msg; + struct bau_payload_queue_entry *msg2; + struct ptc_stats *stat; + + msg = mdp->msg; + stat = bcp->statp; + stat->d_retries++; + /* + * cancel any message from msg+1 to the retry itself + */ + for (msg2 = msg+1, i = 0; i < DEST_Q_SIZE; msg2++, i++) { + if (msg2 > mdp->va_queue_last) + msg2 = mdp->va_queue_first; + if (msg2 == msg) + break; + + /* same conditions for cancellation as uv_do_reset */ + if ((msg2->replied_to == 0) && (msg2->canceled == 0) && + (msg2->sw_ack_vector) && ((msg2->sw_ack_vector & + msg->sw_ack_vector) == 0) && + (msg2->sending_cpu == msg->sending_cpu) && + (msg2->msg_type != MSG_NOOP)) { + slot2 = msg2 - mdp->va_queue_first; + mmr = uv_read_local_mmr + (UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE); + msg_res = msg2->sw_ack_vector; + /* + * This is a message retry; clear the resources held + * by the previous message only if they timed out. + * If it has not timed out we have an unexpected + * situation to report. + */ + if (mmr & (msg_res << UV_SW_ACK_NPENDING)) { + /* + * is the resource timed out? + * make everyone ignore the cancelled message. + */ + msg2->canceled = 1; + stat->d_canceled++; + cancel_count++; + uv_write_local_mmr( + UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, + (msg_res << UV_SW_ACK_NPENDING) | + msg_res); + } + } + } + if (!cancel_count) + stat->d_nocanceled++; +} + +/* + * Do all the things a cpu should do for a TLB shootdown message. + * Other cpu's may come here at the same time for this message. + */ +static void uv_bau_process_message(struct msg_desc *mdp, + struct bau_control *bcp) +{ + int msg_ack_count; + short socket_ack_count = 0; + struct ptc_stats *stat; + struct bau_payload_queue_entry *msg; + struct bau_control *smaster = bcp->socket_master; + + /* + * This must be a normal message, or retry of a normal message + */ + msg = mdp->msg; + stat = bcp->statp; + if (msg->address == TLB_FLUSH_ALL) { + local_flush_tlb(); + stat->d_alltlb++; + } else { + __flush_tlb_one(msg->address); + stat->d_onetlb++; + } + stat->d_requestee++; + + /* + * One cpu on each uvhub has the additional job on a RETRY + * of releasing the resource held by the message that is + * being retried. That message is identified by sending + * cpu number. + */ + if (msg->msg_type == MSG_RETRY && bcp == bcp->uvhub_master) + uv_bau_process_retry_msg(mdp, bcp); + + /* + * This is a sw_ack message, so we have to reply to it. + * Count each responding cpu on the socket. This avoids + * pinging the count's cache line back and forth between + * the sockets. + */ + socket_ack_count = atomic_add_short_return(1, (struct atomic_short *) + &smaster->socket_acknowledge_count[mdp->msg_slot]); + if (socket_ack_count == bcp->cpus_in_socket) { + /* + * Both sockets dump their completed count total into + * the message's count. 
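The two-level count described above can be modeled in user space: cpus bump a per-socket counter first, and only the cpu that completes its socket adds into the message's hub-wide counter, so the message's cache line does not bounce between sockets. A sketch with C11 atomics, using a toy topology of two sockets with two cpus each (the real driver keeps one socket counter per socket and per message slot):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct msg_state {
    atomic_int socket_acks;   /* per-socket counter in the driver       */
    atomic_int hub_acks;      /* lives in the message itself            */
};

static bool cpu_ack(struct msg_state *s, int cpus_in_socket, int cpus_in_hub)
{
    int socket_count = atomic_fetch_add(&s->socket_acks, 1) + 1;

    if (socket_count != cpus_in_socket)
        return false;                     /* not the last cpu on this socket */

    atomic_store(&s->socket_acks, 0);     /* reset for the next message      */
    return atomic_fetch_add(&s->hub_acks, socket_count) + socket_count
           == cpus_in_hub;                /* true: this cpu sends the reply  */
}

int main(void)
{
    struct msg_state s = { 0 };
    int cpu;

    for (cpu = 0; cpu < 4; cpu++)         /* 2 sockets x 2 cpus, socket-major */
        if (cpu_ack(&s, 2, 4))
            printf("cpu %d replies to the message\n", cpu);
    return 0;
}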
+ */ + smaster->socket_acknowledge_count[mdp->msg_slot] = 0; + msg_ack_count = atomic_add_short_return(socket_ack_count, + (struct atomic_short *)&msg->acknowledge_count); + + if (msg_ack_count == bcp->cpus_in_uvhub) { + /* + * All cpus in uvhub saw it; reply + */ + uv_reply_to_message(mdp, bcp); + } + } + + return; +} + +/* + * Determine the first cpu on a uvhub. + */ +static int uvhub_to_first_cpu(int uvhub) +{ + int cpu; + for_each_present_cpu(cpu) + if (uvhub == uv_cpu_to_blade_id(cpu)) + return cpu; + return -1; +} + +/* + * Last resort when we get a large number of destination timeouts is + * to clear resources held by a given cpu. + * Do this with IPI so that all messages in the BAU message queue + * can be identified by their nonzero sw_ack_vector field. + * + * This is entered for a single cpu on the uvhub. + * The sender want's this uvhub to free a specific message's + * sw_ack resources. + */ +static void +uv_do_reset(void *ptr) +{ + int i; + int slot; + int count = 0; + unsigned long mmr; + unsigned long msg_res; + struct bau_control *bcp; + struct reset_args *rap; + struct bau_payload_queue_entry *msg; + struct ptc_stats *stat; + + bcp = &per_cpu(bau_control, smp_processor_id()); + rap = (struct reset_args *)ptr; + stat = bcp->statp; + stat->d_resets++; + + /* + * We're looking for the given sender, and + * will free its sw_ack resource. + * If all cpu's finally responded after the timeout, its + * message 'replied_to' was set. + */ + for (msg = bcp->va_queue_first, i = 0; i < DEST_Q_SIZE; msg++, i++) { + /* uv_do_reset: same conditions for cancellation as + uv_bau_process_retry_msg() */ + if ((msg->replied_to == 0) && + (msg->canceled == 0) && + (msg->sending_cpu == rap->sender) && + (msg->sw_ack_vector) && + (msg->msg_type != MSG_NOOP)) { + /* + * make everyone else ignore this message + */ + msg->canceled = 1; + slot = msg - bcp->va_queue_first; + count++; + /* + * only reset the resource if it is still pending + */ + mmr = uv_read_local_mmr + (UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE); + msg_res = msg->sw_ack_vector; + if (mmr & msg_res) { + stat->d_rcanceled++; + uv_write_local_mmr( + UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, + (msg_res << UV_SW_ACK_NPENDING) | + msg_res); + } + } + } + return; +} + +/* + * Use IPI to get all target uvhubs to release resources held by + * a given sending cpu number. 
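uv_reset_with_ipi(), just below, picks one representative cpu for every uvhub set in the broadcast's distribution mask and IPIs only those cpus. A sketch of that selection with plain bitmasks and an illustrative fixed cpus-per-hub topology standing in for uvhub_to_first_cpu():

#include <stdio.h>

#define NUM_HUBS      8
#define CPUS_PER_HUB  4     /* made-up topology */

/* One representative cpu per hub marked in the distribution mask; those
 * cpus are then IPI'd to run the reset handler on their own hub. */
static unsigned int cpus_for_reset(unsigned int hub_mask)
{
    unsigned int cpu_mask = 0;
    int hub;

    for (hub = 0; hub < NUM_HUBS; hub++) {
        if (!(hub_mask & (1u << hub)))
            continue;
        cpu_mask |= 1u << (hub * CPUS_PER_HUB);  /* first cpu of the hub */
    }
    return cpu_mask;
}

int main(void)
{
    printf("cpu mask for hubs {0,2}: 0x%x\n", cpus_for_reset(0x5));
    return 0;
}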
+ */ +static void uv_reset_with_ipi(struct bau_target_uvhubmask *distribution, + int sender) +{ + int uvhub; + int cpu; + cpumask_t mask; + struct reset_args reset_args; + + reset_args.sender = sender; + + cpus_clear(mask); + /* find a single cpu for each uvhub in this distribution mask */ + for (uvhub = 0; + uvhub < sizeof(struct bau_target_uvhubmask) * BITSPERBYTE; + uvhub++) { + if (!bau_uvhub_isset(uvhub, distribution)) + continue; + /* find a cpu for this uvhub */ + cpu = uvhub_to_first_cpu(uvhub); + cpu_set(cpu, mask); + } + /* IPI all cpus; Preemption is already disabled */ + smp_call_function_many(&mask, uv_do_reset, (void *)&reset_args, 1); + return; +} + +static inline unsigned long +cycles_2_us(unsigned long long cyc) +{ + unsigned long long ns; + unsigned long us; + ns = (cyc * per_cpu(cyc2ns, smp_processor_id())) + >> CYC2NS_SCALE_FACTOR; + us = ns / 1000; + return us; +} + +/* + * wait for all cpus on this hub to finish their sends and go quiet + * leaves uvhub_quiesce set so that no new broadcasts are started by + * bau_flush_send_and_wait() + */ +static inline void +quiesce_local_uvhub(struct bau_control *hmaster) +{ + atomic_add_short_return(1, (struct atomic_short *) + &hmaster->uvhub_quiesce); +} + +/* + * mark this quiet-requestor as done + */ +static inline void +end_uvhub_quiesce(struct bau_control *hmaster) +{ + atomic_add_short_return(-1, (struct atomic_short *) + &hmaster->uvhub_quiesce); +} + +/* + * Wait for completion of a broadcast software ack message + * return COMPLETE, RETRY(PLUGGED or TIMEOUT) or GIVEUP + */ +static int uv_wait_completion(struct bau_desc *bau_desc, + unsigned long mmr_offset, int right_shift, int this_cpu, + struct bau_control *bcp, struct bau_control *smaster, long try) +{ + unsigned long descriptor_status; + cycles_t ttime; + struct ptc_stats *stat = bcp->statp; + struct bau_control *hmaster; + + hmaster = bcp->uvhub_master; + + /* spin on the status MMR, waiting for it to go idle */ + while ((descriptor_status = (((unsigned long) + uv_read_local_mmr(mmr_offset) >> + right_shift) & UV_ACT_STATUS_MASK)) != + DESC_STATUS_IDLE) { + /* + * Our software ack messages may be blocked because there are + * no swack resources available. As long as none of them + * has timed out hardware will NACK our message and its + * state will stay IDLE. + */ + if (descriptor_status == DESC_STATUS_SOURCE_TIMEOUT) { + stat->s_stimeout++; + return FLUSH_GIVEUP; + } else if (descriptor_status == + DESC_STATUS_DESTINATION_TIMEOUT) { + stat->s_dtimeout++; + ttime = get_cycles(); + + /* + * Our retries may be blocked by all destination + * swack resources being consumed, and a timeout + * pending. In that case hardware returns the + * ERROR that looks like a destination timeout. 
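The timeout test just below compares elapsed cycles against timeout_us via cycles_2_us(), a fixed-point multiply by the per-cpu cyc2ns factor followed by a right shift. A user-space sketch of that conversion; the 2^10 scale factor and the 2.5 GHz example value are assumptions for illustration, not values taken from this code:

#include <stdint.h>
#include <stdio.h>

#define CYC2NS_SCALE_FACTOR 10  /* assumed 2^10 fixed-point scale */

/* cyc2ns is a fixed-point factor: ns = cyc * cyc2ns / 2^10.  For an
 * illustrative 2.5 GHz cpu one cycle is 0.4 ns, so cyc2ns ~= 410. */
static unsigned long cycles_2_us(uint64_t cyc, uint64_t cyc2ns)
{
    uint64_t ns = (cyc * cyc2ns) >> CYC2NS_SCALE_FACTOR;

    return ns / 1000;
}

int main(void)
{
    /* 1,000,000 cycles at 2.5 GHz is roughly 400 us */
    printf("%lu us\n", cycles_2_us(1000000, 410));
    return 0;
}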
+ */ + if (cycles_2_us(ttime - bcp->send_message) < + timeout_us) { + bcp->conseccompletes = 0; + return FLUSH_RETRY_PLUGGED; + } + + bcp->conseccompletes = 0; + return FLUSH_RETRY_TIMEOUT; + } else { + /* + * descriptor_status is still BUSY + */ + cpu_relax(); + } + } + bcp->conseccompletes++; + return FLUSH_COMPLETE; +} + +static inline cycles_t +sec_2_cycles(unsigned long sec) +{ + unsigned long ns; + cycles_t cyc; + + ns = sec * 1000000000; + cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id())); + return cyc; +} + +/* + * conditionally add 1 to *v, unless *v is >= u + * return 0 if we cannot add 1 to *v because it is >= u + * return 1 if we can add 1 to *v because it is < u + * the add is atomic + * + * This is close to atomic_add_unless(), but this allows the 'u' value + * to be lowered below the current 'v'. atomic_add_unless can only stop + * on equal. + */ +static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u) +{ + spin_lock(lock); + if (atomic_read(v) >= u) { + spin_unlock(lock); + return 0; + } + atomic_inc(v); + spin_unlock(lock); + return 1; +} + +/* + * Our retries are blocked by all destination swack resources being + * in use, and a timeout is pending. In that case hardware immediately + * returns the ERROR that looks like a destination timeout. + */ +static void +destination_plugged(struct bau_desc *bau_desc, struct bau_control *bcp, + struct bau_control *hmaster, struct ptc_stats *stat) +{ + udelay(bcp->plugged_delay); + bcp->plugged_tries++; + if (bcp->plugged_tries >= bcp->plugsb4reset) { + bcp->plugged_tries = 0; + quiesce_local_uvhub(hmaster); + spin_lock(&hmaster->queue_lock); + uv_reset_with_ipi(&bau_desc->distribution, bcp->cpu); + spin_unlock(&hmaster->queue_lock); + end_uvhub_quiesce(hmaster); + bcp->ipi_attempts++; + stat->s_resets_plug++; + } +} + +static void +destination_timeout(struct bau_desc *bau_desc, struct bau_control *bcp, + struct bau_control *hmaster, struct ptc_stats *stat) +{ + hmaster->max_bau_concurrent = 1; + bcp->timeout_tries++; + if (bcp->timeout_tries >= bcp->timeoutsb4reset) { + bcp->timeout_tries = 0; + quiesce_local_uvhub(hmaster); + spin_lock(&hmaster->queue_lock); + uv_reset_with_ipi(&bau_desc->distribution, bcp->cpu); + spin_unlock(&hmaster->queue_lock); + end_uvhub_quiesce(hmaster); + bcp->ipi_attempts++; + stat->s_resets_timeout++; + } +} + +/* + * Completions are taking a very long time due to a congested numalink + * network. + */ +static void +disable_for_congestion(struct bau_control *bcp, struct ptc_stats *stat) +{ + int tcpu; + struct bau_control *tbcp; + + /* let only one cpu do this disabling */ + spin_lock(&disable_lock); + if (!baudisabled && bcp->period_requests && + ((bcp->period_time / bcp->period_requests) > congested_cycles)) { + /* it becomes this cpu's job to turn on the use of the + BAU again */ + baudisabled = 1; + bcp->set_bau_off = 1; + bcp->set_bau_on_time = get_cycles() + + sec_2_cycles(bcp->congested_period); + stat->s_bau_disabled++; + for_each_present_cpu(tcpu) { + tbcp = &per_cpu(bau_control, tcpu); + tbcp->baudisabled = 1; + } + } + spin_unlock(&disable_lock); +} + +/** + * uv_flush_send_and_wait + * + * Send a broadcast and wait for it to complete. + * + * The flush_mask contains the cpus the broadcast is to be sent to including + * cpus that are on the local uvhub. + * + * Returns 0 if all flushing represented in the mask was done. + * Returns 1 if it gives up entirely and the original cpu mask is to be + * returned to the kernel. 
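uv_flush_send_and_wait(), just below, limits the number of simultaneously active descriptors per hub by spinning on atomic_inc_unless_ge() against max_bau_concurrent. The same bounded-concurrency gate can be sketched in user space with a mutex-protected counter:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

/* Bounded-concurrency gate: take a slot only while the active count is
 * below the limit, the way atomic_inc_unless_ge() does under uvhub_lock. */
struct gate {
    pthread_mutex_t lock;
    int active;
};

static bool gate_try_enter(struct gate *g, int limit)
{
    bool ok;

    pthread_mutex_lock(&g->lock);
    ok = g->active < limit;
    if (ok)
        g->active++;                /* this descriptor is now in flight */
    pthread_mutex_unlock(&g->lock);
    return ok;
}

static void gate_exit(struct gate *g)
{
    pthread_mutex_lock(&g->lock);
    g->active--;                    /* kernel side: atomic_dec()        */
    pthread_mutex_unlock(&g->lock);
}

int main(void)
{
    struct gate g = { PTHREAD_MUTEX_INITIALIZER, 0 };
    int i, admitted = 0;

    for (i = 0; i < 5; i++)         /* limit of 3 concurrent senders */
        admitted += gate_try_enter(&g, 3);
    printf("admitted %d of 5\n", admitted);
    gate_exit(&g);
    return 0;
}

A real sender would spin with cpu_relax() until gate_try_enter() succeeds and call gate_exit() when its broadcast completes, mirroring the atomic_dec() of active_descriptor_count.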
+ */ +int uv_flush_send_and_wait(struct bau_desc *bau_desc, + struct cpumask *flush_mask, struct bau_control *bcp) +{ + int right_shift; + int completion_status = 0; + int seq_number = 0; + long try = 0; + int cpu = bcp->uvhub_cpu; + int this_cpu = bcp->cpu; + unsigned long mmr_offset; + unsigned long index; + cycles_t time1; + cycles_t time2; + cycles_t elapsed; + struct ptc_stats *stat = bcp->statp; + struct bau_control *smaster = bcp->socket_master; + struct bau_control *hmaster = bcp->uvhub_master; + + if (!atomic_inc_unless_ge(&hmaster->uvhub_lock, + &hmaster->active_descriptor_count, + hmaster->max_bau_concurrent)) { + stat->s_throttles++; + do { + cpu_relax(); + } while (!atomic_inc_unless_ge(&hmaster->uvhub_lock, + &hmaster->active_descriptor_count, + hmaster->max_bau_concurrent)); + } + while (hmaster->uvhub_quiesce) + cpu_relax(); + + if (cpu < UV_CPUS_PER_ACT_STATUS) { + mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0; + right_shift = cpu * UV_ACT_STATUS_SIZE; + } else { + mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1; + right_shift = + ((cpu - UV_CPUS_PER_ACT_STATUS) * UV_ACT_STATUS_SIZE); + } + time1 = get_cycles(); + do { + if (try == 0) { + bau_desc->header.msg_type = MSG_REGULAR; + seq_number = bcp->message_number++; + } else { + bau_desc->header.msg_type = MSG_RETRY; + stat->s_retry_messages++; + } + bau_desc->header.sequence = seq_number; + index = (1UL << UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT) | + bcp->uvhub_cpu; + bcp->send_message = get_cycles(); + uv_write_local_mmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index); + try++; + completion_status = uv_wait_completion(bau_desc, mmr_offset, + right_shift, this_cpu, bcp, smaster, try); + + if (completion_status == FLUSH_RETRY_PLUGGED) { + destination_plugged(bau_desc, bcp, hmaster, stat); + } else if (completion_status == FLUSH_RETRY_TIMEOUT) { + destination_timeout(bau_desc, bcp, hmaster, stat); + } + if (bcp->ipi_attempts >= bcp->ipi_reset_limit) { + bcp->ipi_attempts = 0; + completion_status = FLUSH_GIVEUP; + break; + } + cpu_relax(); + } while ((completion_status == FLUSH_RETRY_PLUGGED) || + (completion_status == FLUSH_RETRY_TIMEOUT)); + time2 = get_cycles(); + bcp->plugged_tries = 0; + bcp->timeout_tries = 0; + if ((completion_status == FLUSH_COMPLETE) && + (bcp->conseccompletes > bcp->complete_threshold) && + (hmaster->max_bau_concurrent < + hmaster->max_bau_concurrent_constant)) + hmaster->max_bau_concurrent++; + while (hmaster->uvhub_quiesce) + cpu_relax(); + atomic_dec(&hmaster->active_descriptor_count); + if (time2 > time1) { + elapsed = time2 - time1; + stat->s_time += elapsed; + if ((completion_status == FLUSH_COMPLETE) && (try == 1)) { + bcp->period_requests++; + bcp->period_time += elapsed; + if ((elapsed > congested_cycles) && + (bcp->period_requests > bcp->congested_reps)) { + disable_for_congestion(bcp, stat); + } + } + } else + stat->s_requestor--; + if (completion_status == FLUSH_COMPLETE && try > 1) + stat->s_retriesok++; + else if (completion_status == FLUSH_GIVEUP) { + stat->s_giveup++; + return 1; + } + return 0; +} + +/** + * uv_flush_tlb_others - globally purge translation cache of a virtual + * address or all TLB's + * @cpumask: mask of all cpu's in which the address is to be removed + * @mm: mm_struct containing virtual address range + * @va: virtual address to be removed (or TLB_FLUSH_ALL for all TLB's on cpu) + * @cpu: the current cpu + * + * This is the entry point for initiating any UV global TLB shootdown. 
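Before sending, uv_flush_tlb_others() folds the caller's cpumask into a mask of target uvhubs for the activation descriptor and counts local versus remote targets for the statistics. A rough user-space sketch of that folding, with a made-up cpus-per-hub topology standing in for uv_cpu_to_blade_id():

#include <stdio.h>

#define CPUS_PER_HUB 4      /* illustrative topology */

/* Fold a cpu mask into a hub mask, counting cpus on the sender's own
 * hub as 'locals' and everything else as 'remotes'. */
static unsigned int cpumask_to_hubmask(unsigned long cpu_mask, int my_hub,
                                       int *locals, int *remotes)
{
    unsigned int hub_mask = 0;
    int cpu;

    *locals = *remotes = 0;
    for (cpu = 0; cpu < (int)(8 * sizeof(cpu_mask)); cpu++) {
        int hub;

        if (!(cpu_mask & (1UL << cpu)))
            continue;
        hub = cpu / CPUS_PER_HUB;       /* uv_cpu_to_blade_id() */
        hub_mask |= 1u << hub;
        if (hub == my_hub)
            (*locals)++;
        else
            (*remotes)++;
    }
    return hub_mask;
}

int main(void)
{
    int l, r;
    unsigned int hubs = cpumask_to_hubmask(0x0f3UL, 0, &l, &r);

    printf("hubs 0x%x locals %d remotes %d\n", hubs, l, r);
    return 0;
}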
+ * + * Purges the translation caches of all specified processors of the given + * virtual address, or purges all TLB's on specified processors. + * + * The caller has derived the cpumask from the mm_struct. This function + * is called only if there are bits set in the mask. (e.g. flush_tlb_page()) + * + * The cpumask is converted into a uvhubmask of the uvhubs containing + * those cpus. + * + * Note that this function should be called with preemption disabled. + * + * Returns NULL if all remote flushing was done. + * Returns pointer to cpumask if some remote flushing remains to be + * done. The returned pointer is valid till preemption is re-enabled. + */ +const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, + struct mm_struct *mm, + unsigned long va, unsigned int cpu) +{ + int tcpu; + int uvhub; + int locals = 0; + int remotes = 0; + int hubs = 0; + struct bau_desc *bau_desc; + struct cpumask *flush_mask; + struct ptc_stats *stat; + struct bau_control *bcp; + struct bau_control *tbcp; + + /* kernel was booted 'nobau' */ + if (nobau) + return cpumask; + + bcp = &per_cpu(bau_control, cpu); + stat = bcp->statp; + + /* bau was disabled due to slow response */ + if (bcp->baudisabled) { + /* the cpu that disabled it must re-enable it */ + if (bcp->set_bau_off) { + if (get_cycles() >= bcp->set_bau_on_time) { + stat->s_bau_reenabled++; + baudisabled = 0; + for_each_present_cpu(tcpu) { + tbcp = &per_cpu(bau_control, tcpu); + tbcp->baudisabled = 0; + tbcp->period_requests = 0; + tbcp->period_time = 0; + } + } + } + return cpumask; + } + + /* + * Each sending cpu has a per-cpu mask which it fills from the caller's + * cpu mask. All cpus are converted to uvhubs and copied to the + * activation descriptor. + */ + flush_mask = (struct cpumask *)per_cpu(uv_flush_tlb_mask, cpu); + /* don't actually do a shootdown of the local cpu */ + cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu)); + if (cpu_isset(cpu, *cpumask)) + stat->s_ntargself++; + + bau_desc = bcp->descriptor_base; + bau_desc += UV_ITEMS_PER_DESCRIPTOR * bcp->uvhub_cpu; + bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE); + + /* cpu statistics */ + for_each_cpu(tcpu, flush_mask) { + uvhub = uv_cpu_to_blade_id(tcpu); + bau_uvhub_set(uvhub, &bau_desc->distribution); + if (uvhub == bcp->uvhub) + locals++; + else + remotes++; + } + if ((locals + remotes) == 0) + return NULL; + stat->s_requestor++; + stat->s_ntargcpu += remotes + locals; + stat->s_ntargremotes += remotes; + stat->s_ntarglocals += locals; + remotes = bau_uvhub_weight(&bau_desc->distribution); + + /* uvhub statistics */ + hubs = bau_uvhub_weight(&bau_desc->distribution); + if (locals) { + stat->s_ntarglocaluvhub++; + stat->s_ntargremoteuvhub += (hubs - 1); + } else + stat->s_ntargremoteuvhub += hubs; + stat->s_ntarguvhub += hubs; + if (hubs >= 16) + stat->s_ntarguvhub16++; + else if (hubs >= 8) + stat->s_ntarguvhub8++; + else if (hubs >= 4) + stat->s_ntarguvhub4++; + else if (hubs >= 2) + stat->s_ntarguvhub2++; + else + stat->s_ntarguvhub1++; + + bau_desc->payload.address = va; + bau_desc->payload.sending_cpu = cpu; + + /* + * uv_flush_send_and_wait returns 0 if all cpu's were messaged, + * or 1 if it gave up and the original cpumask should be returned. + */ + if (!uv_flush_send_and_wait(bau_desc, flush_mask, bcp)) + return NULL; + else + return cpumask; +} + +/* + * The BAU message interrupt comes here. (registered by set_intr_gate) + * See entry_64.S + * + * We received a broadcast assist message. 
+ * + * Interrupts are disabled; this interrupt could represent + * the receipt of several messages. + * + * All cores/threads on this hub get this interrupt. + * The last one to see it does the software ack. + * (the resource will not be freed until noninterruptable cpus see this + * interrupt; hardware may timeout the s/w ack and reply ERROR) + */ +void uv_bau_message_interrupt(struct pt_regs *regs) +{ + int count = 0; + cycles_t time_start; + struct bau_payload_queue_entry *msg; + struct bau_control *bcp; + struct ptc_stats *stat; + struct msg_desc msgdesc; + + time_start = get_cycles(); + bcp = &per_cpu(bau_control, smp_processor_id()); + stat = bcp->statp; + msgdesc.va_queue_first = bcp->va_queue_first; + msgdesc.va_queue_last = bcp->va_queue_last; + msg = bcp->bau_msg_head; + while (msg->sw_ack_vector) { + count++; + msgdesc.msg_slot = msg - msgdesc.va_queue_first; + msgdesc.sw_ack_slot = ffs(msg->sw_ack_vector) - 1; + msgdesc.msg = msg; + uv_bau_process_message(&msgdesc, bcp); + msg++; + if (msg > msgdesc.va_queue_last) + msg = msgdesc.va_queue_first; + bcp->bau_msg_head = msg; + } + stat->d_time += (get_cycles() - time_start); + if (!count) + stat->d_nomsg++; + else if (count > 1) + stat->d_multmsg++; + ack_APIC_irq(); +} + +/* + * uv_enable_timeouts + * + * Each target uvhub (i.e. a uvhub that has no cpu's) needs to have + * shootdown message timeouts enabled. The timeout does not cause + * an interrupt, but causes an error message to be returned to + * the sender. + */ +static void uv_enable_timeouts(void) +{ + int uvhub; + int nuvhubs; + int pnode; + unsigned long mmr_image; + + nuvhubs = uv_num_possible_blades(); + + for (uvhub = 0; uvhub < nuvhubs; uvhub++) { + if (!uv_blade_nr_possible_cpus(uvhub)) + continue; + + pnode = uv_blade_to_pnode(uvhub); + mmr_image = + uv_read_global_mmr64(pnode, UVH_LB_BAU_MISC_CONTROL); + /* + * Set the timeout period and then lock it in, in three + * steps; captures and locks in the period. + * + * To program the period, the SOFT_ACK_MODE must be off. + */ + mmr_image &= ~((unsigned long)1 << + UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT); + uv_write_global_mmr64 + (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image); + /* + * Set the 4-bit period. + */ + mmr_image &= ~((unsigned long)0xf << + UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT); + mmr_image |= (UV_INTD_SOFT_ACK_TIMEOUT_PERIOD << + UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT); + uv_write_global_mmr64 + (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image); + /* + * Subsequent reversals of the timebase bit (3) cause an + * immediate timeout of one or all INTD resources as + * indicated in bits 2:0 (7 causes all of them to timeout). 
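The period programming above is a plain read-modify-write of a 4-bit field in UVH_LB_BAU_MISC_CONTROL: clear the field, or in the new value, then write the image back. A generic sketch of that field update; the field position used here is illustrative, not the real *_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT:

#include <stdint.h>
#include <stdio.h>

/* Illustrative field position/width only. */
#define PERIOD_SHFT 12
#define PERIOD_MASK 0xfULL

/* Read-modify-write of a 4-bit field inside a control MMR image. */
static uint64_t set_soft_ack_period(uint64_t mmr_image, unsigned period)
{
    mmr_image &= ~(PERIOD_MASK << PERIOD_SHFT);
    mmr_image |= ((uint64_t)(period & PERIOD_MASK)) << PERIOD_SHFT;
    return mmr_image;
}

int main(void)
{
    printf("0x%llx\n",
           (unsigned long long)set_soft_ack_period(0xffffULL, 0xa));
    return 0;
}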
+ */ + mmr_image |= ((unsigned long)1 << + UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT); + uv_write_global_mmr64 + (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image); + } +} + +static void *uv_ptc_seq_start(struct seq_file *file, loff_t *offset) +{ + if (*offset < num_possible_cpus()) + return offset; + return NULL; +} + +static void *uv_ptc_seq_next(struct seq_file *file, void *data, loff_t *offset) +{ + (*offset)++; + if (*offset < num_possible_cpus()) + return offset; + return NULL; +} + +static void uv_ptc_seq_stop(struct seq_file *file, void *data) +{ +} + +static inline unsigned long long +microsec_2_cycles(unsigned long microsec) +{ + unsigned long ns; + unsigned long long cyc; + + ns = microsec * 1000; + cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id())); + return cyc; +} + +/* + * Display the statistics thru /proc. + * 'data' points to the cpu number + */ +static int uv_ptc_seq_show(struct seq_file *file, void *data) +{ + struct ptc_stats *stat; + int cpu; + + cpu = *(loff_t *)data; + + if (!cpu) { + seq_printf(file, + "# cpu sent stime self locals remotes ncpus localhub "); + seq_printf(file, + "remotehub numuvhubs numuvhubs16 numuvhubs8 "); + seq_printf(file, + "numuvhubs4 numuvhubs2 numuvhubs1 dto "); + seq_printf(file, + "retries rok resetp resett giveup sto bz throt "); + seq_printf(file, + "sw_ack recv rtime all "); + seq_printf(file, + "one mult none retry canc nocan reset rcan "); + seq_printf(file, + "disable enable\n"); + } + if (cpu < num_possible_cpus() && cpu_online(cpu)) { + stat = &per_cpu(ptcstats, cpu); + /* source side statistics */ + seq_printf(file, + "cpu %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ", + cpu, stat->s_requestor, cycles_2_us(stat->s_time), + stat->s_ntargself, stat->s_ntarglocals, + stat->s_ntargremotes, stat->s_ntargcpu, + stat->s_ntarglocaluvhub, stat->s_ntargremoteuvhub, + stat->s_ntarguvhub, stat->s_ntarguvhub16); + seq_printf(file, "%ld %ld %ld %ld %ld ", + stat->s_ntarguvhub8, stat->s_ntarguvhub4, + stat->s_ntarguvhub2, stat->s_ntarguvhub1, + stat->s_dtimeout); + seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld ", + stat->s_retry_messages, stat->s_retriesok, + stat->s_resets_plug, stat->s_resets_timeout, + stat->s_giveup, stat->s_stimeout, + stat->s_busy, stat->s_throttles); + + /* destination side statistics */ + seq_printf(file, + "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ", + uv_read_global_mmr64(uv_cpu_to_pnode(cpu), + UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE), + stat->d_requestee, cycles_2_us(stat->d_time), + stat->d_alltlb, stat->d_onetlb, stat->d_multmsg, + stat->d_nomsg, stat->d_retries, stat->d_canceled, + stat->d_nocanceled, stat->d_resets, + stat->d_rcanceled); + seq_printf(file, "%ld %ld\n", + stat->s_bau_disabled, stat->s_bau_reenabled); + } + + return 0; +} + +/* + * Display the tunables thru debugfs + */ +static ssize_t tunables_read(struct file *file, char __user *userbuf, + size_t count, loff_t *ppos) +{ + char *buf; + int ret; + + buf = kasprintf(GFP_KERNEL, "%s %s %s\n%d %d %d %d %d %d %d %d %d\n", + "max_bau_concurrent plugged_delay plugsb4reset", + "timeoutsb4reset ipi_reset_limit complete_threshold", + "congested_response_us congested_reps congested_period", + max_bau_concurrent, plugged_delay, plugsb4reset, + timeoutsb4reset, ipi_reset_limit, complete_threshold, + congested_response_us, congested_reps, congested_period); + + if (!buf) + return -ENOMEM; + + ret = simple_read_from_buffer(userbuf, count, ppos, buf, strlen(buf)); + kfree(buf); + return ret; +} + +/* + * -1: resetf the 
statistics + * 0: display meaning of the statistics + */ +static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user, + size_t count, loff_t *data) +{ + int cpu; + long input_arg; + char optstr[64]; + struct ptc_stats *stat; + + if (count == 0 || count > sizeof(optstr)) + return -EINVAL; + if (copy_from_user(optstr, user, count)) + return -EFAULT; + optstr[count - 1] = '\0'; + if (strict_strtol(optstr, 10, &input_arg) < 0) { + printk(KERN_DEBUG "%s is invalid\n", optstr); + return -EINVAL; + } + + if (input_arg == 0) { + printk(KERN_DEBUG "# cpu: cpu number\n"); + printk(KERN_DEBUG "Sender statistics:\n"); + printk(KERN_DEBUG + "sent: number of shootdown messages sent\n"); + printk(KERN_DEBUG + "stime: time spent sending messages\n"); + printk(KERN_DEBUG + "numuvhubs: number of hubs targeted with shootdown\n"); + printk(KERN_DEBUG + "numuvhubs16: number times 16 or more hubs targeted\n"); + printk(KERN_DEBUG + "numuvhubs8: number times 8 or more hubs targeted\n"); + printk(KERN_DEBUG + "numuvhubs4: number times 4 or more hubs targeted\n"); + printk(KERN_DEBUG + "numuvhubs2: number times 2 or more hubs targeted\n"); + printk(KERN_DEBUG + "numuvhubs1: number times 1 hub targeted\n"); + printk(KERN_DEBUG + "numcpus: number of cpus targeted with shootdown\n"); + printk(KERN_DEBUG + "dto: number of destination timeouts\n"); + printk(KERN_DEBUG + "retries: destination timeout retries sent\n"); + printk(KERN_DEBUG + "rok: : destination timeouts successfully retried\n"); + printk(KERN_DEBUG + "resetp: ipi-style resource resets for plugs\n"); + printk(KERN_DEBUG + "resett: ipi-style resource resets for timeouts\n"); + printk(KERN_DEBUG + "giveup: fall-backs to ipi-style shootdowns\n"); + printk(KERN_DEBUG + "sto: number of source timeouts\n"); + printk(KERN_DEBUG + "bz: number of stay-busy's\n"); + printk(KERN_DEBUG + "throt: number times spun in throttle\n"); + printk(KERN_DEBUG "Destination side statistics:\n"); + printk(KERN_DEBUG + "sw_ack: image of UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE\n"); + printk(KERN_DEBUG + "recv: shootdown messages received\n"); + printk(KERN_DEBUG + "rtime: time spent processing messages\n"); + printk(KERN_DEBUG + "all: shootdown all-tlb messages\n"); + printk(KERN_DEBUG + "one: shootdown one-tlb messages\n"); + printk(KERN_DEBUG + "mult: interrupts that found multiple messages\n"); + printk(KERN_DEBUG + "none: interrupts that found no messages\n"); + printk(KERN_DEBUG + "retry: number of retry messages processed\n"); + printk(KERN_DEBUG + "canc: number messages canceled by retries\n"); + printk(KERN_DEBUG + "nocan: number retries that found nothing to cancel\n"); + printk(KERN_DEBUG + "reset: number of ipi-style reset requests processed\n"); + printk(KERN_DEBUG + "rcan: number messages canceled by reset requests\n"); + printk(KERN_DEBUG + "disable: number times use of the BAU was disabled\n"); + printk(KERN_DEBUG + "enable: number times use of the BAU was re-enabled\n"); + } else if (input_arg == -1) { + for_each_present_cpu(cpu) { + stat = &per_cpu(ptcstats, cpu); + memset(stat, 0, sizeof(struct ptc_stats)); + } + } + + return count; +} + +static int local_atoi(const char *name) +{ + int val = 0; + + for (;; name++) { + switch (*name) { + case '0' ... 
'9': + val = 10*val+(*name-'0'); + break; + default: + return val; + } + } +} + +/* + * set the tunables + * 0 values reset them to defaults + */ +static ssize_t tunables_write(struct file *file, const char __user *user, + size_t count, loff_t *data) +{ + int cpu; + int cnt = 0; + int val; + char *p; + char *q; + char instr[64]; + struct bau_control *bcp; + + if (count == 0 || count > sizeof(instr)-1) + return -EINVAL; + if (copy_from_user(instr, user, count)) + return -EFAULT; + + instr[count] = '\0'; + /* count the fields */ + p = instr + strspn(instr, WHITESPACE); + q = p; + for (; *p; p = q + strspn(q, WHITESPACE)) { + q = p + strcspn(p, WHITESPACE); + cnt++; + if (q == p) + break; + } + if (cnt != 9) { + printk(KERN_INFO "bau tunable error: should be 9 numbers\n"); + return -EINVAL; + } + + p = instr + strspn(instr, WHITESPACE); + q = p; + for (cnt = 0; *p; p = q + strspn(q, WHITESPACE), cnt++) { + q = p + strcspn(p, WHITESPACE); + val = local_atoi(p); + switch (cnt) { + case 0: + if (val == 0) { + max_bau_concurrent = MAX_BAU_CONCURRENT; + max_bau_concurrent_constant = + MAX_BAU_CONCURRENT; + continue; + } + bcp = &per_cpu(bau_control, smp_processor_id()); + if (val < 1 || val > bcp->cpus_in_uvhub) { + printk(KERN_DEBUG + "Error: BAU max concurrent %d is invalid\n", + val); + return -EINVAL; + } + max_bau_concurrent = val; + max_bau_concurrent_constant = val; + continue; + case 1: + if (val == 0) + plugged_delay = PLUGGED_DELAY; + else + plugged_delay = val; + continue; + case 2: + if (val == 0) + plugsb4reset = PLUGSB4RESET; + else + plugsb4reset = val; + continue; + case 3: + if (val == 0) + timeoutsb4reset = TIMEOUTSB4RESET; + else + timeoutsb4reset = val; + continue; + case 4: + if (val == 0) + ipi_reset_limit = IPI_RESET_LIMIT; + else + ipi_reset_limit = val; + continue; + case 5: + if (val == 0) + complete_threshold = COMPLETE_THRESHOLD; + else + complete_threshold = val; + continue; + case 6: + if (val == 0) + congested_response_us = CONGESTED_RESPONSE_US; + else + congested_response_us = val; + continue; + case 7: + if (val == 0) + congested_reps = CONGESTED_REPS; + else + congested_reps = val; + continue; + case 8: + if (val == 0) + congested_period = CONGESTED_PERIOD; + else + congested_period = val; + continue; + } + if (q == p) + break; + } + for_each_present_cpu(cpu) { + bcp = &per_cpu(bau_control, cpu); + bcp->max_bau_concurrent = max_bau_concurrent; + bcp->max_bau_concurrent_constant = max_bau_concurrent; + bcp->plugged_delay = plugged_delay; + bcp->plugsb4reset = plugsb4reset; + bcp->timeoutsb4reset = timeoutsb4reset; + bcp->ipi_reset_limit = ipi_reset_limit; + bcp->complete_threshold = complete_threshold; + bcp->congested_response_us = congested_response_us; + bcp->congested_reps = congested_reps; + bcp->congested_period = congested_period; + } + return count; +} + +static const struct seq_operations uv_ptc_seq_ops = { + .start = uv_ptc_seq_start, + .next = uv_ptc_seq_next, + .stop = uv_ptc_seq_stop, + .show = uv_ptc_seq_show +}; + +static int uv_ptc_proc_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &uv_ptc_seq_ops); +} + +static int tunables_open(struct inode *inode, struct file *file) +{ + return 0; +} + +static const struct file_operations proc_uv_ptc_operations = { + .open = uv_ptc_proc_open, + .read = seq_read, + .write = uv_ptc_proc_write, + .llseek = seq_lseek, + .release = seq_release, +}; + +static const struct file_operations tunables_fops = { + .open = tunables_open, + .read = tunables_read, + .write = tunables_write, + .llseek = 
default_llseek, +}; + +static int __init uv_ptc_init(void) +{ + struct proc_dir_entry *proc_uv_ptc; + + if (!is_uv_system()) + return 0; + + proc_uv_ptc = proc_create(UV_PTC_BASENAME, 0444, NULL, + &proc_uv_ptc_operations); + if (!proc_uv_ptc) { + printk(KERN_ERR "unable to create %s proc entry\n", + UV_PTC_BASENAME); + return -EINVAL; + } + + tunables_dir = debugfs_create_dir(UV_BAU_TUNABLES_DIR, NULL); + if (!tunables_dir) { + printk(KERN_ERR "unable to create debugfs directory %s\n", + UV_BAU_TUNABLES_DIR); + return -EINVAL; + } + tunables_file = debugfs_create_file(UV_BAU_TUNABLES_FILE, 0600, + tunables_dir, NULL, &tunables_fops); + if (!tunables_file) { + printk(KERN_ERR "unable to create debugfs file %s\n", + UV_BAU_TUNABLES_FILE); + return -EINVAL; + } + return 0; +} + +/* + * initialize the sending side's sending buffers + */ +static void +uv_activation_descriptor_init(int node, int pnode) +{ + int i; + int cpu; + unsigned long pa; + unsigned long m; + unsigned long n; + struct bau_desc *bau_desc; + struct bau_desc *bd2; + struct bau_control *bcp; + + /* + * each bau_desc is 64 bytes; there are 8 (UV_ITEMS_PER_DESCRIPTOR) + * per cpu; and up to 32 (UV_ADP_SIZE) cpu's per uvhub + */ + bau_desc = (struct bau_desc *)kmalloc_node(sizeof(struct bau_desc)* + UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node); + BUG_ON(!bau_desc); + + pa = uv_gpa(bau_desc); /* need the real nasid*/ + n = pa >> uv_nshift; + m = pa & uv_mmask; + + uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE, + (n << UV_DESC_BASE_PNODE_SHIFT | m)); + + /* + * initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each + * cpu even though we only use the first one; one descriptor can + * describe a broadcast to 256 uv hubs. + */ + for (i = 0, bd2 = bau_desc; i < (UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR); + i++, bd2++) { + memset(bd2, 0, sizeof(struct bau_desc)); + bd2->header.sw_ack_flag = 1; + /* + * base_dest_nodeid is the nasid (pnode<<1) of the first uvhub + * in the partition. The bit map will indicate uvhub numbers, + * which are 0-N in a partition. Pnodes are unique system-wide. 
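A few lines up, the descriptor base is programmed by splitting the global address returned by uv_gpa() into node bits (pa >> uv_nshift) and an on-node offset (pa & uv_mmask), then packing both into one MMR value. A standalone sketch of that split, with illustrative constants standing in for the runtime uv_nshift/uv_mmask and for UV_DESC_BASE_PNODE_SHIFT:

#include <stdint.h>
#include <stdio.h>

/* Illustrative values; the real shift comes from the hub's m_val at runtime. */
#define NSHIFT			37
#define MMASK			(((uint64_t)1 << NSHIFT) - 1)
#define DESC_BASE_PNODE_SHIFT	49

static uint64_t pack_desc_base(uint64_t gpa)
{
	uint64_t n = gpa >> NSHIFT;	/* node (pnode) bits */
	uint64_t m = gpa & MMASK;	/* offset within that node's memory */

	return (n << DESC_BASE_PNODE_SHIFT) | m;
}

int main(void)
{
	uint64_t gpa = ((uint64_t)3 << NSHIFT) | 0x1000; /* node 3, offset 0x1000 */

	printf("descriptor base MMR = %#llx\n",
	       (unsigned long long)pack_desc_base(gpa));
	return 0;
}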
+ */ + bd2->header.base_dest_nodeid = uv_partition_base_pnode << 1; + bd2->header.dest_subnodeid = 0x10; /* the LB */ + bd2->header.command = UV_NET_ENDPOINT_INTD; + bd2->header.int_both = 1; + /* + * all others need to be set to zero: + * fairness chaining multilevel count replied_to + */ + } + for_each_present_cpu(cpu) { + if (pnode != uv_blade_to_pnode(uv_cpu_to_blade_id(cpu))) + continue; + bcp = &per_cpu(bau_control, cpu); + bcp->descriptor_base = bau_desc; + } +} + +/* + * initialize the destination side's receiving buffers + * entered for each uvhub in the partition + * - node is first node (kernel memory notion) on the uvhub + * - pnode is the uvhub's physical identifier + */ +static void +uv_payload_queue_init(int node, int pnode) +{ + int pn; + int cpu; + char *cp; + unsigned long pa; + struct bau_payload_queue_entry *pqp; + struct bau_payload_queue_entry *pqp_malloc; + struct bau_control *bcp; + + pqp = (struct bau_payload_queue_entry *) kmalloc_node( + (DEST_Q_SIZE + 1) * sizeof(struct bau_payload_queue_entry), + GFP_KERNEL, node); + BUG_ON(!pqp); + pqp_malloc = pqp; + + cp = (char *)pqp + 31; + pqp = (struct bau_payload_queue_entry *)(((unsigned long)cp >> 5) << 5); + + for_each_present_cpu(cpu) { + if (pnode != uv_cpu_to_pnode(cpu)) + continue; + /* for every cpu on this pnode: */ + bcp = &per_cpu(bau_control, cpu); + bcp->va_queue_first = pqp; + bcp->bau_msg_head = pqp; + bcp->va_queue_last = pqp + (DEST_Q_SIZE - 1); + } + /* + * need the pnode of where the memory was really allocated + */ + pa = uv_gpa(pqp); + pn = pa >> uv_nshift; + uv_write_global_mmr64(pnode, + UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST, + ((unsigned long)pn << UV_PAYLOADQ_PNODE_SHIFT) | + uv_physnodeaddr(pqp)); + uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL, + uv_physnodeaddr(pqp)); + uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST, + (unsigned long) + uv_physnodeaddr(pqp + (DEST_Q_SIZE - 1))); + /* in effect, all msg_type's are set to MSG_NOOP */ + memset(pqp, 0, sizeof(struct bau_payload_queue_entry) * DEST_Q_SIZE); +} + +/* + * Initialization of each UV hub's structures + */ +static void __init uv_init_uvhub(int uvhub, int vector) +{ + int node; + int pnode; + unsigned long apicid; + + node = uvhub_to_first_node(uvhub); + pnode = uv_blade_to_pnode(uvhub); + uv_activation_descriptor_init(node, pnode); + uv_payload_queue_init(node, pnode); + /* + * the below initialization can't be in firmware because the + * messaging IRQ will be determined by the OS + */ + apicid = uvhub_to_first_apicid(uvhub); + uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG, + ((apicid << 32) | vector)); +} + +/* + * We will set BAU_MISC_CONTROL with a timeout period. + * But the BIOS has set UVH_AGING_PRESCALE_SEL and UVH_TRANSACTION_TIMEOUT. + * So the destination timeout period has be be calculated from them. 
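calculate_destination_timeout(), which follows, multiplies a table-selected base period by two register-derived multipliers and converts the result from nanoseconds to microseconds. A sketch of the same arithmetic with made-up numbers (the real base and multipliers come from UVH_AGING_PRESCALE_SEL, UVH_TRANSACTION_TIMEOUT and the timeout_base_ns[] table):

#include <stdio.h>

int main(void)
{
	/* Hypothetical values; the real ones are read from UV MMRs. */
	unsigned long base_ns = 160;	/* timeout_base_ns[index] */
	unsigned long mult1 = 10;	/* low bits of the soft-ack period */
	unsigned long mult2 = 16;	/* field from the transaction timeout MMR */
	unsigned long ts_ns = base_ns * mult1 * mult2;

	printf("destination timeout = %lu ns = %lu us\n", ts_ns, ts_ns / 1000);
	return 0;
}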
+ */ +static int +calculate_destination_timeout(void) +{ + unsigned long mmr_image; + int mult1; + int mult2; + int index; + int base; + int ret; + unsigned long ts_ns; + + mult1 = UV_INTD_SOFT_ACK_TIMEOUT_PERIOD & BAU_MISC_CONTROL_MULT_MASK; + mmr_image = uv_read_local_mmr(UVH_AGING_PRESCALE_SEL); + index = (mmr_image >> BAU_URGENCY_7_SHIFT) & BAU_URGENCY_7_MASK; + mmr_image = uv_read_local_mmr(UVH_TRANSACTION_TIMEOUT); + mult2 = (mmr_image >> BAU_TRANS_SHIFT) & BAU_TRANS_MASK; + base = timeout_base_ns[index]; + ts_ns = base * mult1 * mult2; + ret = ts_ns / 1000; + return ret; +} + +/* + * initialize the bau_control structure for each cpu + */ +static void __init uv_init_per_cpu(int nuvhubs) +{ + int i; + int cpu; + int pnode; + int uvhub; + int have_hmaster; + short socket = 0; + unsigned short socket_mask; + unsigned char *uvhub_mask; + struct bau_control *bcp; + struct uvhub_desc *bdp; + struct socket_desc *sdp; + struct bau_control *hmaster = NULL; + struct bau_control *smaster = NULL; + struct socket_desc { + short num_cpus; + short cpu_number[16]; + }; + struct uvhub_desc { + unsigned short socket_mask; + short num_cpus; + short uvhub; + short pnode; + struct socket_desc socket[2]; + }; + struct uvhub_desc *uvhub_descs; + + timeout_us = calculate_destination_timeout(); + + uvhub_descs = (struct uvhub_desc *) + kmalloc(nuvhubs * sizeof(struct uvhub_desc), GFP_KERNEL); + memset(uvhub_descs, 0, nuvhubs * sizeof(struct uvhub_desc)); + uvhub_mask = kzalloc((nuvhubs+7)/8, GFP_KERNEL); + for_each_present_cpu(cpu) { + bcp = &per_cpu(bau_control, cpu); + memset(bcp, 0, sizeof(struct bau_control)); + pnode = uv_cpu_hub_info(cpu)->pnode; + uvhub = uv_cpu_hub_info(cpu)->numa_blade_id; + *(uvhub_mask + (uvhub/8)) |= (1 << (uvhub%8)); + bdp = &uvhub_descs[uvhub]; + bdp->num_cpus++; + bdp->uvhub = uvhub; + bdp->pnode = pnode; + /* kludge: 'assuming' one node per socket, and assuming that + disabling a socket just leaves a gap in node numbers */ + socket = (cpu_to_node(cpu) & 1); + bdp->socket_mask |= (1 << socket); + sdp = &bdp->socket[socket]; + sdp->cpu_number[sdp->num_cpus] = cpu; + sdp->num_cpus++; + } + for (uvhub = 0; uvhub < nuvhubs; uvhub++) { + if (!(*(uvhub_mask + (uvhub/8)) & (1 << (uvhub%8)))) + continue; + have_hmaster = 0; + bdp = &uvhub_descs[uvhub]; + socket_mask = bdp->socket_mask; + socket = 0; + while (socket_mask) { + if (!(socket_mask & 1)) + goto nextsocket; + sdp = &bdp->socket[socket]; + for (i = 0; i < sdp->num_cpus; i++) { + cpu = sdp->cpu_number[i]; + bcp = &per_cpu(bau_control, cpu); + bcp->cpu = cpu; + if (i == 0) { + smaster = bcp; + if (!have_hmaster) { + have_hmaster++; + hmaster = bcp; + } + } + bcp->cpus_in_uvhub = bdp->num_cpus; + bcp->cpus_in_socket = sdp->num_cpus; + bcp->socket_master = smaster; + bcp->uvhub = bdp->uvhub; + bcp->uvhub_master = hmaster; + bcp->uvhub_cpu = uv_cpu_hub_info(cpu)-> + blade_processor_id; + } +nextsocket: + socket++; + socket_mask = (socket_mask >> 1); + } + } + kfree(uvhub_descs); + kfree(uvhub_mask); + for_each_present_cpu(cpu) { + bcp = &per_cpu(bau_control, cpu); + bcp->baudisabled = 0; + bcp->statp = &per_cpu(ptcstats, cpu); + /* time interval to catch a hardware stay-busy bug */ + bcp->timeout_interval = microsec_2_cycles(2*timeout_us); + bcp->max_bau_concurrent = max_bau_concurrent; + bcp->max_bau_concurrent_constant = max_bau_concurrent; + bcp->plugged_delay = plugged_delay; + bcp->plugsb4reset = plugsb4reset; + bcp->timeoutsb4reset = timeoutsb4reset; + bcp->ipi_reset_limit = ipi_reset_limit; + bcp->complete_threshold = 
complete_threshold; + bcp->congested_response_us = congested_response_us; + bcp->congested_reps = congested_reps; + bcp->congested_period = congested_period; + } +} + +/* + * Initialization of BAU-related structures + */ +static int __init uv_bau_init(void) +{ + int uvhub; + int pnode; + int nuvhubs; + int cur_cpu; + int vector; + unsigned long mmr; + + if (!is_uv_system()) + return 0; + + if (nobau) + return 0; + + for_each_possible_cpu(cur_cpu) + zalloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu), + GFP_KERNEL, cpu_to_node(cur_cpu)); + + uv_nshift = uv_hub_info->m_val; + uv_mmask = (1UL << uv_hub_info->m_val) - 1; + nuvhubs = uv_num_possible_blades(); + spin_lock_init(&disable_lock); + congested_cycles = microsec_2_cycles(congested_response_us); + + uv_init_per_cpu(nuvhubs); + + uv_partition_base_pnode = 0x7fffffff; + for (uvhub = 0; uvhub < nuvhubs; uvhub++) + if (uv_blade_nr_possible_cpus(uvhub) && + (uv_blade_to_pnode(uvhub) < uv_partition_base_pnode)) + uv_partition_base_pnode = uv_blade_to_pnode(uvhub); + + vector = UV_BAU_MESSAGE; + for_each_possible_blade(uvhub) + if (uv_blade_nr_possible_cpus(uvhub)) + uv_init_uvhub(uvhub, vector); + + uv_enable_timeouts(); + alloc_intr_gate(vector, uv_bau_message_intr1); + + for_each_possible_blade(uvhub) { + if (uv_blade_nr_possible_cpus(uvhub)) { + pnode = uv_blade_to_pnode(uvhub); + /* INIT the bau */ + uv_write_global_mmr64(pnode, + UVH_LB_BAU_SB_ACTIVATION_CONTROL, + ((unsigned long)1 << 63)); + mmr = 1; /* should be 1 to broadcast to both sockets */ + uv_write_global_mmr64(pnode, UVH_BAU_DATA_BROADCAST, + mmr); + } + } + + return 0; +} +core_initcall(uv_bau_init); +fs_initcall(uv_ptc_init); diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c new file mode 100644 index 00000000000..7b24460917d --- /dev/null +++ b/arch/x86/platform/uv/uv_irq.c @@ -0,0 +1,285 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * SGI UV IRQ functions + * + * Copyright (C) 2008 Silicon Graphics, Inc. All rights reserved. + */ + +#include +#include +#include +#include + +#include +#include +#include + +/* MMR offset and pnode of hub sourcing interrupts for a given irq */ +struct uv_irq_2_mmr_pnode{ + struct rb_node list; + unsigned long offset; + int pnode; + int irq; +}; + +static spinlock_t uv_irq_lock; +static struct rb_root uv_irq_root; + +static int uv_set_irq_affinity(struct irq_data *, const struct cpumask *, bool); + +static void uv_noop(struct irq_data *data) { } + +static void uv_ack_apic(struct irq_data *data) +{ + ack_APIC_irq(); +} + +static struct irq_chip uv_irq_chip = { + .name = "UV-CORE", + .irq_mask = uv_noop, + .irq_unmask = uv_noop, + .irq_eoi = uv_ack_apic, + .irq_set_affinity = uv_set_irq_affinity, +}; + +/* + * Add offset and pnode information of the hub sourcing interrupts to the + * rb tree for a specific irq. 
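Conceptually, the rb tree built below is just an ordered map from irq number to the (MMR offset, pnode) pair that sources it, so the affinity and teardown paths can find the right hub register later. A simplified model of that mapping using a flat table instead of an rbtree (illustrative only; the field names are invented):

#include <stddef.h>

struct irq_mmr_rec {
	int irq;
	unsigned long offset;	/* hub MMR that generates this irq */
	int pnode;		/* hub (pnode) owning that MMR */
};

/* Return the record for an irq, or NULL if it was never registered. */
static struct irq_mmr_rec *lookup(struct irq_mmr_rec *tbl, size_t n, int irq)
{
	size_t i;

	for (i = 0; i < n; i++)
		if (tbl[i].irq == irq)
			return &tbl[i];
	return NULL;
}

int main(void)
{
	struct irq_mmr_rec table[] = {
		{ .irq = 48, .offset = 0x61b00, .pnode = 2 },	/* hypothetical */
	};

	return lookup(table, 1, 48) ? 0 : 1;
}

The rbtree version does the same lookup in O(log n) and under uv_irq_lock, which matters once many GRU/BAU interrupts are registered.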
+ */ +static int uv_set_irq_2_mmr_info(int irq, unsigned long offset, unsigned blade) +{ + struct rb_node **link = &uv_irq_root.rb_node; + struct rb_node *parent = NULL; + struct uv_irq_2_mmr_pnode *n; + struct uv_irq_2_mmr_pnode *e; + unsigned long irqflags; + + n = kmalloc_node(sizeof(struct uv_irq_2_mmr_pnode), GFP_KERNEL, + uv_blade_to_memory_nid(blade)); + if (!n) + return -ENOMEM; + + n->irq = irq; + n->offset = offset; + n->pnode = uv_blade_to_pnode(blade); + spin_lock_irqsave(&uv_irq_lock, irqflags); + /* Find the right place in the rbtree: */ + while (*link) { + parent = *link; + e = rb_entry(parent, struct uv_irq_2_mmr_pnode, list); + + if (unlikely(irq == e->irq)) { + /* irq entry exists */ + e->pnode = uv_blade_to_pnode(blade); + e->offset = offset; + spin_unlock_irqrestore(&uv_irq_lock, irqflags); + kfree(n); + return 0; + } + + if (irq < e->irq) + link = &(*link)->rb_left; + else + link = &(*link)->rb_right; + } + + /* Insert the node into the rbtree. */ + rb_link_node(&n->list, parent, link); + rb_insert_color(&n->list, &uv_irq_root); + + spin_unlock_irqrestore(&uv_irq_lock, irqflags); + return 0; +} + +/* Retrieve offset and pnode information from the rb tree for a specific irq */ +int uv_irq_2_mmr_info(int irq, unsigned long *offset, int *pnode) +{ + struct uv_irq_2_mmr_pnode *e; + struct rb_node *n; + unsigned long irqflags; + + spin_lock_irqsave(&uv_irq_lock, irqflags); + n = uv_irq_root.rb_node; + while (n) { + e = rb_entry(n, struct uv_irq_2_mmr_pnode, list); + + if (e->irq == irq) { + *offset = e->offset; + *pnode = e->pnode; + spin_unlock_irqrestore(&uv_irq_lock, irqflags); + return 0; + } + + if (irq < e->irq) + n = n->rb_left; + else + n = n->rb_right; + } + spin_unlock_irqrestore(&uv_irq_lock, irqflags); + return -1; +} + +/* + * Re-target the irq to the specified CPU and enable the specified MMR located + * on the specified blade to allow the sending of MSIs to the specified CPU. + */ +static int +arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, + unsigned long mmr_offset, int limit) +{ + const struct cpumask *eligible_cpu = cpumask_of(cpu); + struct irq_cfg *cfg = get_irq_chip_data(irq); + unsigned long mmr_value; + struct uv_IO_APIC_route_entry *entry; + int mmr_pnode, err; + + BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != + sizeof(unsigned long)); + + err = assign_irq_vector(irq, cfg, eligible_cpu); + if (err != 0) + return err; + + if (limit == UV_AFFINITY_CPU) + irq_set_status_flags(irq, IRQ_NO_BALANCING); + else + irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); + + set_irq_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq, + irq_name); + + mmr_value = 0; + entry = (struct uv_IO_APIC_route_entry *)&mmr_value; + entry->vector = cfg->vector; + entry->delivery_mode = apic->irq_delivery_mode; + entry->dest_mode = apic->irq_dest_mode; + entry->polarity = 0; + entry->trigger = 0; + entry->mask = 0; + entry->dest = apic->cpu_mask_to_apicid(eligible_cpu); + + mmr_pnode = uv_blade_to_pnode(mmr_blade); + uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); + + if (cfg->move_in_progress) + send_cleanup_vector(cfg); + + return irq; +} + +/* + * Disable the specified MMR located on the specified blade so that MSIs are + * longer allowed to be sent. 
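Both the enable path above and the disable/affinity paths below compose the interrupt MMR value by laying struct uv_IO_APIC_route_entry over an unsigned long, which is what the BUILD_BUG_ON on the two sizes guards. A userspace sketch of the same overlay idea, written with a union and an invented field layout rather than the real route-entry bits:

#include <stdint.h>
#include <stdio.h>

/* Invented layout -- the real uv_IO_APIC_route_entry has different fields. */
struct route_entry {
	unsigned long long vector	: 8;
	unsigned long long delivery	: 3;
	unsigned long long mask		: 1;
	unsigned long long reserved	: 20;
	unsigned long long dest		: 32;
};

union route_mmr {
	unsigned long long raw;		/* value written to the MMR */
	struct route_entry entry;	/* field-by-field view of the same bits */
};

int main(void)
{
	union route_mmr m = { .raw = 0 };

	m.entry.vector = 0xf4;
	m.entry.dest = 0x12;
	printf("mmr value = %#llx\n", m.raw);
	return 0;
}

The kernel code uses a pointer cast instead of a union; the effect is the same as long as the struct and the word are exactly the same size.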
+ */ +static void arch_disable_uv_irq(int mmr_pnode, unsigned long mmr_offset) +{ + unsigned long mmr_value; + struct uv_IO_APIC_route_entry *entry; + + BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != + sizeof(unsigned long)); + + mmr_value = 0; + entry = (struct uv_IO_APIC_route_entry *)&mmr_value; + entry->mask = 1; + + uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); +} + +static int +uv_set_irq_affinity(struct irq_data *data, const struct cpumask *mask, + bool force) +{ + struct irq_cfg *cfg = data->chip_data; + unsigned int dest; + unsigned long mmr_value, mmr_offset; + struct uv_IO_APIC_route_entry *entry; + int mmr_pnode; + + if (__ioapic_set_affinity(data, mask, &dest)) + return -1; + + mmr_value = 0; + entry = (struct uv_IO_APIC_route_entry *)&mmr_value; + + entry->vector = cfg->vector; + entry->delivery_mode = apic->irq_delivery_mode; + entry->dest_mode = apic->irq_dest_mode; + entry->polarity = 0; + entry->trigger = 0; + entry->mask = 0; + entry->dest = dest; + + /* Get previously stored MMR and pnode of hub sourcing interrupts */ + if (uv_irq_2_mmr_info(data->irq, &mmr_offset, &mmr_pnode)) + return -1; + + uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); + + if (cfg->move_in_progress) + send_cleanup_vector(cfg); + + return 0; +} + +/* + * Set up a mapping of an available irq and vector, and enable the specified + * MMR that defines the MSI that is to be sent to the specified CPU when an + * interrupt is raised. + */ +int uv_setup_irq(char *irq_name, int cpu, int mmr_blade, + unsigned long mmr_offset, int limit) +{ + int irq, ret; + + irq = create_irq_nr(NR_IRQS_LEGACY, uv_blade_to_memory_nid(mmr_blade)); + + if (irq <= 0) + return -EBUSY; + + ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset, + limit); + if (ret == irq) + uv_set_irq_2_mmr_info(irq, mmr_offset, mmr_blade); + else + destroy_irq(irq); + + return ret; +} +EXPORT_SYMBOL_GPL(uv_setup_irq); + +/* + * Tear down a mapping of an irq and vector, and disable the specified MMR that + * defined the MSI that was to be sent to the specified CPU when an interrupt + * was raised. + * + * Set mmr_blade and mmr_offset to what was passed in on uv_setup_irq(). + */ +void uv_teardown_irq(unsigned int irq) +{ + struct uv_irq_2_mmr_pnode *e; + struct rb_node *n; + unsigned long irqflags; + + spin_lock_irqsave(&uv_irq_lock, irqflags); + n = uv_irq_root.rb_node; + while (n) { + e = rb_entry(n, struct uv_irq_2_mmr_pnode, list); + if (e->irq == irq) { + arch_disable_uv_irq(e->pnode, e->offset); + rb_erase(n, &uv_irq_root); + kfree(e); + break; + } + if (irq < e->irq) + n = n->rb_left; + else + n = n->rb_right; + } + spin_unlock_irqrestore(&uv_irq_lock, irqflags); + destroy_irq(irq); +} +EXPORT_SYMBOL_GPL(uv_teardown_irq); diff --git a/arch/x86/platform/uv/uv_sysfs.c b/arch/x86/platform/uv/uv_sysfs.c new file mode 100644 index 00000000000..309c70fb775 --- /dev/null +++ b/arch/x86/platform/uv/uv_sysfs.c @@ -0,0 +1,76 @@ +/* + * This file supports the /sys/firmware/sgi_uv interfaces for SGI UV. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved. + * Copyright (c) Russ Anderson + */ + +#include +#include +#include + +struct kobject *sgi_uv_kobj; + +static ssize_t partition_id_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%ld\n", sn_partition_id); +} + +static ssize_t coherence_id_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%ld\n", partition_coherence_id()); +} + +static struct kobj_attribute partition_id_attr = + __ATTR(partition_id, S_IRUGO, partition_id_show, NULL); + +static struct kobj_attribute coherence_id_attr = + __ATTR(coherence_id, S_IRUGO, coherence_id_show, NULL); + + +static int __init sgi_uv_sysfs_init(void) +{ + unsigned long ret; + + if (!is_uv_system()) + return -ENODEV; + + if (!sgi_uv_kobj) + sgi_uv_kobj = kobject_create_and_add("sgi_uv", firmware_kobj); + if (!sgi_uv_kobj) { + printk(KERN_WARNING "kobject_create_and_add sgi_uv failed\n"); + return -EINVAL; + } + + ret = sysfs_create_file(sgi_uv_kobj, &partition_id_attr.attr); + if (ret) { + printk(KERN_WARNING "sysfs_create_file partition_id failed\n"); + return ret; + } + + ret = sysfs_create_file(sgi_uv_kobj, &coherence_id_attr.attr); + if (ret) { + printk(KERN_WARNING "sysfs_create_file coherence_id failed\n"); + return ret; + } + + return 0; +} + +device_initcall(sgi_uv_sysfs_init); diff --git a/arch/x86/platform/uv/uv_time.c b/arch/x86/platform/uv/uv_time.c new file mode 100644 index 00000000000..56e421bc379 --- /dev/null +++ b/arch/x86/platform/uv/uv_time.c @@ -0,0 +1,423 @@ +/* + * SGI RTC clock/timer routines. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Copyright (c) 2009 Silicon Graphics, Inc. All Rights Reserved. 
+ * Copyright (c) Dimitri Sivanich + */ +#include +#include + +#include +#include +#include +#include +#include +#include + +#define RTC_NAME "sgi_rtc" + +static cycle_t uv_read_rtc(struct clocksource *cs); +static int uv_rtc_next_event(unsigned long, struct clock_event_device *); +static void uv_rtc_timer_setup(enum clock_event_mode, + struct clock_event_device *); + +static struct clocksource clocksource_uv = { + .name = RTC_NAME, + .rating = 400, + .read = uv_read_rtc, + .mask = (cycle_t)UVH_RTC_REAL_TIME_CLOCK_MASK, + .shift = 10, + .flags = CLOCK_SOURCE_IS_CONTINUOUS, +}; + +static struct clock_event_device clock_event_device_uv = { + .name = RTC_NAME, + .features = CLOCK_EVT_FEAT_ONESHOT, + .shift = 20, + .rating = 400, + .irq = -1, + .set_next_event = uv_rtc_next_event, + .set_mode = uv_rtc_timer_setup, + .event_handler = NULL, +}; + +static DEFINE_PER_CPU(struct clock_event_device, cpu_ced); + +/* There is one of these allocated per node */ +struct uv_rtc_timer_head { + spinlock_t lock; + /* next cpu waiting for timer, local node relative: */ + int next_cpu; + /* number of cpus on this node: */ + int ncpus; + struct { + int lcpu; /* systemwide logical cpu number */ + u64 expires; /* next timer expiration for this cpu */ + } cpu[1]; +}; + +/* + * Access to uv_rtc_timer_head via blade id. + */ +static struct uv_rtc_timer_head **blade_info __read_mostly; + +static int uv_rtc_evt_enable; + +/* + * Hardware interface routines + */ + +/* Send IPIs to another node */ +static void uv_rtc_send_IPI(int cpu) +{ + unsigned long apicid, val; + int pnode; + + apicid = cpu_physical_id(cpu); + pnode = uv_apicid_to_pnode(apicid); + val = (1UL << UVH_IPI_INT_SEND_SHFT) | + (apicid << UVH_IPI_INT_APIC_ID_SHFT) | + (X86_PLATFORM_IPI_VECTOR << UVH_IPI_INT_VECTOR_SHFT); + + uv_write_global_mmr64(pnode, UVH_IPI_INT, val); +} + +/* Check for an RTC interrupt pending */ +static int uv_intr_pending(int pnode) +{ + return uv_read_global_mmr64(pnode, UVH_EVENT_OCCURRED0) & + UVH_EVENT_OCCURRED0_RTC1_MASK; +} + +/* Setup interrupt and return non-zero if early expiration occurred. */ +static int uv_setup_intr(int cpu, u64 expires) +{ + u64 val; + int pnode = uv_cpu_to_pnode(cpu); + + uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG, + UVH_RTC1_INT_CONFIG_M_MASK); + uv_write_global_mmr64(pnode, UVH_INT_CMPB, -1L); + + uv_write_global_mmr64(pnode, UVH_EVENT_OCCURRED0_ALIAS, + UVH_EVENT_OCCURRED0_RTC1_MASK); + + val = (X86_PLATFORM_IPI_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) | + ((u64)cpu_physical_id(cpu) << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT); + + /* Set configuration */ + uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG, val); + /* Initialize comparator value */ + uv_write_global_mmr64(pnode, UVH_INT_CMPB, expires); + + if (uv_read_rtc(NULL) <= expires) + return 0; + + return !uv_intr_pending(pnode); +} + +/* + * Per-cpu timer tracking routines + */ + +static __init void uv_rtc_deallocate_timers(void) +{ + int bid; + + for_each_possible_blade(bid) { + kfree(blade_info[bid]); + } + kfree(blade_info); +} + +/* Allocate per-node list of cpu timer expiration times. 
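struct uv_rtc_timer_head declares a one-element cpu[] array, and uv_rtc_allocate_timers() (below) allocates the header plus extra space for every cpu on the blade in a single kmalloc_node(). A userspace sketch of the same header-plus-trailing-array allocation, using a C99 flexible array member instead of the cpu[1] idiom:

#include <stdint.h>
#include <stdlib.h>

struct timer_head {
	int next_cpu;
	int ncpus;
	struct {
		int lcpu;
		uint64_t expires;
	} cpu[];			/* trailing per-cpu array */
};

static struct timer_head *alloc_head(int ncpus)
{
	struct timer_head *h;

	h = calloc(1, sizeof(*h) + ncpus * sizeof(h->cpu[0]));
	if (!h)
		return NULL;
	h->ncpus = ncpus;
	h->next_cpu = -1;		/* no timer armed yet */
	return h;
}

int main(void)
{
	struct timer_head *h = alloc_head(8);

	if (!h)
		return 1;
	h->cpu[3].expires = ~0ULL;	/* ULLONG_MAX-style "no expiry" sentinel */
	free(h);
	return 0;
}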
*/ +static __init int uv_rtc_allocate_timers(void) +{ + int cpu; + + blade_info = kmalloc(uv_possible_blades * sizeof(void *), GFP_KERNEL); + if (!blade_info) + return -ENOMEM; + memset(blade_info, 0, uv_possible_blades * sizeof(void *)); + + for_each_present_cpu(cpu) { + int nid = cpu_to_node(cpu); + int bid = uv_cpu_to_blade_id(cpu); + int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id; + struct uv_rtc_timer_head *head = blade_info[bid]; + + if (!head) { + head = kmalloc_node(sizeof(struct uv_rtc_timer_head) + + (uv_blade_nr_possible_cpus(bid) * + 2 * sizeof(u64)), + GFP_KERNEL, nid); + if (!head) { + uv_rtc_deallocate_timers(); + return -ENOMEM; + } + spin_lock_init(&head->lock); + head->ncpus = uv_blade_nr_possible_cpus(bid); + head->next_cpu = -1; + blade_info[bid] = head; + } + + head->cpu[bcpu].lcpu = cpu; + head->cpu[bcpu].expires = ULLONG_MAX; + } + + return 0; +} + +/* Find and set the next expiring timer. */ +static void uv_rtc_find_next_timer(struct uv_rtc_timer_head *head, int pnode) +{ + u64 lowest = ULLONG_MAX; + int c, bcpu = -1; + + head->next_cpu = -1; + for (c = 0; c < head->ncpus; c++) { + u64 exp = head->cpu[c].expires; + if (exp < lowest) { + bcpu = c; + lowest = exp; + } + } + if (bcpu >= 0) { + head->next_cpu = bcpu; + c = head->cpu[bcpu].lcpu; + if (uv_setup_intr(c, lowest)) + /* If we didn't set it up in time, trigger */ + uv_rtc_send_IPI(c); + } else { + uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG, + UVH_RTC1_INT_CONFIG_M_MASK); + } +} + +/* + * Set expiration time for current cpu. + * + * Returns 1 if we missed the expiration time. + */ +static int uv_rtc_set_timer(int cpu, u64 expires) +{ + int pnode = uv_cpu_to_pnode(cpu); + int bid = uv_cpu_to_blade_id(cpu); + struct uv_rtc_timer_head *head = blade_info[bid]; + int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id; + u64 *t = &head->cpu[bcpu].expires; + unsigned long flags; + int next_cpu; + + spin_lock_irqsave(&head->lock, flags); + + next_cpu = head->next_cpu; + *t = expires; + + /* Will this one be next to go off? */ + if (next_cpu < 0 || bcpu == next_cpu || + expires < head->cpu[next_cpu].expires) { + head->next_cpu = bcpu; + if (uv_setup_intr(cpu, expires)) { + *t = ULLONG_MAX; + uv_rtc_find_next_timer(head, pnode); + spin_unlock_irqrestore(&head->lock, flags); + return -ETIME; + } + } + + spin_unlock_irqrestore(&head->lock, flags); + return 0; +} + +/* + * Unset expiration time for current cpu. + * + * Returns 1 if this timer was pending. + */ +static int uv_rtc_unset_timer(int cpu, int force) +{ + int pnode = uv_cpu_to_pnode(cpu); + int bid = uv_cpu_to_blade_id(cpu); + struct uv_rtc_timer_head *head = blade_info[bid]; + int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id; + u64 *t = &head->cpu[bcpu].expires; + unsigned long flags; + int rc = 0; + + spin_lock_irqsave(&head->lock, flags); + + if ((head->next_cpu == bcpu && uv_read_rtc(NULL) >= *t) || force) + rc = 1; + + if (rc) { + *t = ULLONG_MAX; + /* Was the hardware setup for this timer? */ + if (head->next_cpu == bcpu) + uv_rtc_find_next_timer(head, pnode); + } + + spin_unlock_irqrestore(&head->lock, flags); + + return rc; +} + + +/* + * Kernel interface routines. + */ + +/* + * Read the RTC. + * + * Starting with HUB rev 2.0, the UV RTC register is replicated across all + * cachelines of it's own page. This allows faster simultaneous reads + * from a given socket. 
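On newer hubs each cpu reads its own cacheline-aligned copy of the RTC, so concurrent readers on one socket do not contend on a single line. A sketch of the offset computation used by uv_read_rtc() below, assuming a 64-byte cacheline and 4 KB page purely for illustration:

#include <stdio.h>

#define L1_CACHE_BYTES	64	/* illustrative */
#define PAGE_SIZE	4096	/* illustrative */

int main(void)
{
	int bcpu;

	/* Each hub-local cpu lands on its own cacheline within the RTC page. */
	for (bcpu = 0; bcpu < 4; bcpu++)
		printf("cpu %d reads UVH_RTC + %#x\n", bcpu,
		       (bcpu * L1_CACHE_BYTES) % PAGE_SIZE);
	return 0;
}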
+ */ +static cycle_t uv_read_rtc(struct clocksource *cs) +{ + unsigned long offset; + + if (uv_get_min_hub_revision_id() == 1) + offset = 0; + else + offset = (uv_blade_processor_id() * L1_CACHE_BYTES) % PAGE_SIZE; + + return (cycle_t)uv_read_local_mmr(UVH_RTC | offset); +} + +/* + * Program the next event, relative to now + */ +static int uv_rtc_next_event(unsigned long delta, + struct clock_event_device *ced) +{ + int ced_cpu = cpumask_first(ced->cpumask); + + return uv_rtc_set_timer(ced_cpu, delta + uv_read_rtc(NULL)); +} + +/* + * Setup the RTC timer in oneshot mode + */ +static void uv_rtc_timer_setup(enum clock_event_mode mode, + struct clock_event_device *evt) +{ + int ced_cpu = cpumask_first(evt->cpumask); + + switch (mode) { + case CLOCK_EVT_MODE_PERIODIC: + case CLOCK_EVT_MODE_ONESHOT: + case CLOCK_EVT_MODE_RESUME: + /* Nothing to do here yet */ + break; + case CLOCK_EVT_MODE_UNUSED: + case CLOCK_EVT_MODE_SHUTDOWN: + uv_rtc_unset_timer(ced_cpu, 1); + break; + } +} + +static void uv_rtc_interrupt(void) +{ + int cpu = smp_processor_id(); + struct clock_event_device *ced = &per_cpu(cpu_ced, cpu); + + if (!ced || !ced->event_handler) + return; + + if (uv_rtc_unset_timer(cpu, 0) != 1) + return; + + ced->event_handler(ced); +} + +static int __init uv_enable_evt_rtc(char *str) +{ + uv_rtc_evt_enable = 1; + + return 1; +} +__setup("uvrtcevt", uv_enable_evt_rtc); + +static __init void uv_rtc_register_clockevents(struct work_struct *dummy) +{ + struct clock_event_device *ced = &__get_cpu_var(cpu_ced); + + *ced = clock_event_device_uv; + ced->cpumask = cpumask_of(smp_processor_id()); + clockevents_register_device(ced); +} + +static __init int uv_rtc_setup_clock(void) +{ + int rc; + + if (!is_uv_system()) + return -ENODEV; + + clocksource_uv.mult = clocksource_hz2mult(sn_rtc_cycles_per_second, + clocksource_uv.shift); + + /* If single blade, prefer tsc */ + if (uv_num_possible_blades() == 1) + clocksource_uv.rating = 250; + + rc = clocksource_register(&clocksource_uv); + if (rc) + printk(KERN_INFO "UV RTC clocksource failed rc %d\n", rc); + else + printk(KERN_INFO "UV RTC clocksource registered freq %lu MHz\n", + sn_rtc_cycles_per_second/(unsigned long)1E6); + + if (rc || !uv_rtc_evt_enable || x86_platform_ipi_callback) + return rc; + + /* Setup and register clockevents */ + rc = uv_rtc_allocate_timers(); + if (rc) + goto error; + + x86_platform_ipi_callback = uv_rtc_interrupt; + + clock_event_device_uv.mult = div_sc(sn_rtc_cycles_per_second, + NSEC_PER_SEC, clock_event_device_uv.shift); + + clock_event_device_uv.min_delta_ns = NSEC_PER_SEC / + sn_rtc_cycles_per_second; + + clock_event_device_uv.max_delta_ns = clocksource_uv.mask * + (NSEC_PER_SEC / sn_rtc_cycles_per_second); + + rc = schedule_on_each_cpu(uv_rtc_register_clockevents); + if (rc) { + x86_platform_ipi_callback = NULL; + uv_rtc_deallocate_timers(); + goto error; + } + + printk(KERN_INFO "UV RTC clockevents registered\n"); + + return 0; + +error: + clocksource_unregister(&clocksource_uv); + printk(KERN_INFO "UV RTC clockevents failed rc %d\n", rc); + + return rc; +} +arch_initcall(uv_rtc_setup_clock); -- cgit v1.2.3-18-g5258 From 8654b1c2de1465120974899fc1c8aa00e91d4b7e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 23 Oct 2010 11:28:42 +0200 Subject: x86: Move olpc to platform Signed-off-by: Thomas Gleixner Cc: Andres Salomon --- arch/x86/kernel/Makefile | 4 - arch/x86/kernel/olpc-xo1.c | 140 ------------------- arch/x86/kernel/olpc.c | 281 -------------------------------------- arch/x86/kernel/olpc_ofw.c | 112 
--------------- arch/x86/platform/Makefile | 1 + arch/x86/platform/olpc/Makefile | 3 + arch/x86/platform/olpc/olpc-xo1.c | 140 +++++++++++++++++++ arch/x86/platform/olpc/olpc.c | 281 ++++++++++++++++++++++++++++++++++++++ arch/x86/platform/olpc/olpc_ofw.c | 112 +++++++++++++++ 9 files changed, 537 insertions(+), 537 deletions(-) delete mode 100644 arch/x86/kernel/olpc-xo1.c delete mode 100644 arch/x86/kernel/olpc.c delete mode 100644 arch/x86/kernel/olpc_ofw.c create mode 100644 arch/x86/platform/olpc/Makefile create mode 100644 arch/x86/platform/olpc/olpc-xo1.c create mode 100644 arch/x86/platform/olpc/olpc.c create mode 100644 arch/x86/platform/olpc/olpc_ofw.c (limited to 'arch/x86') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 08e2e4bf839..9e13763b609 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -101,10 +101,6 @@ obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o -obj-$(CONFIG_OLPC) += olpc.o -obj-$(CONFIG_OLPC_XO1) += olpc-xo1.o -obj-$(CONFIG_OLPC_OPENFIRMWARE) += olpc_ofw.o - microcode-y := microcode_core.o microcode-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o microcode-$(CONFIG_MICROCODE_AMD) += microcode_amd.o diff --git a/arch/x86/kernel/olpc-xo1.c b/arch/x86/kernel/olpc-xo1.c deleted file mode 100644 index f5442c03abc..00000000000 --- a/arch/x86/kernel/olpc-xo1.c +++ /dev/null @@ -1,140 +0,0 @@ -/* - * Support for features of the OLPC XO-1 laptop - * - * Copyright (C) 2010 One Laptop per Child - * Copyright (C) 2006 Red Hat, Inc. - * Copyright (C) 2006 Advanced Micro Devices, Inc. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. 
- */ - -#include -#include -#include -#include -#include - -#include -#include - -#define DRV_NAME "olpc-xo1" - -#define PMS_BAR 4 -#define ACPI_BAR 5 - -/* PMC registers (PMS block) */ -#define PM_SCLK 0x10 -#define PM_IN_SLPCTL 0x20 -#define PM_WKXD 0x34 -#define PM_WKD 0x30 -#define PM_SSC 0x54 - -/* PM registers (ACPI block) */ -#define PM1_CNT 0x08 -#define PM_GPE0_STS 0x18 - -static unsigned long acpi_base; -static unsigned long pms_base; - -static void xo1_power_off(void) -{ - printk(KERN_INFO "OLPC XO-1 power off sequence...\n"); - - /* Enable all of these controls with 0 delay */ - outl(0x40000000, pms_base + PM_SCLK); - outl(0x40000000, pms_base + PM_IN_SLPCTL); - outl(0x40000000, pms_base + PM_WKXD); - outl(0x40000000, pms_base + PM_WKD); - - /* Clear status bits (possibly unnecessary) */ - outl(0x0002ffff, pms_base + PM_SSC); - outl(0xffffffff, acpi_base + PM_GPE0_STS); - - /* Write SLP_EN bit to start the machinery */ - outl(0x00002000, acpi_base + PM1_CNT); -} - -/* Read the base addresses from the PCI BAR info */ -static int __devinit setup_bases(struct pci_dev *pdev) -{ - int r; - - r = pci_enable_device_io(pdev); - if (r) { - dev_err(&pdev->dev, "can't enable device IO\n"); - return r; - } - - r = pci_request_region(pdev, ACPI_BAR, DRV_NAME); - if (r) { - dev_err(&pdev->dev, "can't alloc PCI BAR #%d\n", ACPI_BAR); - return r; - } - - r = pci_request_region(pdev, PMS_BAR, DRV_NAME); - if (r) { - dev_err(&pdev->dev, "can't alloc PCI BAR #%d\n", PMS_BAR); - pci_release_region(pdev, ACPI_BAR); - return r; - } - - acpi_base = pci_resource_start(pdev, ACPI_BAR); - pms_base = pci_resource_start(pdev, PMS_BAR); - - return 0; -} - -static int __devinit olpc_xo1_probe(struct platform_device *pdev) -{ - struct pci_dev *pcidev; - int r; - - pcidev = pci_get_device(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA, - NULL); - if (!pdev) - return -ENODEV; - - r = setup_bases(pcidev); - if (r) - return r; - - pm_power_off = xo1_power_off; - - printk(KERN_INFO "OLPC XO-1 support registered\n"); - return 0; -} - -static int __devexit olpc_xo1_remove(struct platform_device *pdev) -{ - pm_power_off = NULL; - return 0; -} - -static struct platform_driver olpc_xo1_driver = { - .driver = { - .name = DRV_NAME, - .owner = THIS_MODULE, - }, - .probe = olpc_xo1_probe, - .remove = __devexit_p(olpc_xo1_remove), -}; - -static int __init olpc_xo1_init(void) -{ - return platform_driver_register(&olpc_xo1_driver); -} - -static void __exit olpc_xo1_exit(void) -{ - platform_driver_unregister(&olpc_xo1_driver); -} - -MODULE_AUTHOR("Daniel Drake "); -MODULE_LICENSE("GPL"); -MODULE_ALIAS("platform:olpc-xo1"); - -module_init(olpc_xo1_init); -module_exit(olpc_xo1_exit); diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c deleted file mode 100644 index edaf3fe8dc5..00000000000 --- a/arch/x86/kernel/olpc.c +++ /dev/null @@ -1,281 +0,0 @@ -/* - * Support for the OLPC DCON and OLPC EC access - * - * Copyright © 2006 Advanced Micro Devices, Inc. - * Copyright © 2007-2008 Andres Salomon - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -struct olpc_platform_t olpc_platform_info; -EXPORT_SYMBOL_GPL(olpc_platform_info); - -static DEFINE_SPINLOCK(ec_lock); - -/* what the timeout *should* be (in ms) */ -#define EC_BASE_TIMEOUT 20 - -/* the timeout that bugs in the EC might force us to actually use */ -static int ec_timeout = EC_BASE_TIMEOUT; - -static int __init olpc_ec_timeout_set(char *str) -{ - if (get_option(&str, &ec_timeout) != 1) { - ec_timeout = EC_BASE_TIMEOUT; - printk(KERN_ERR "olpc-ec: invalid argument to " - "'olpc_ec_timeout=', ignoring!\n"); - } - printk(KERN_DEBUG "olpc-ec: using %d ms delay for EC commands.\n", - ec_timeout); - return 1; -} -__setup("olpc_ec_timeout=", olpc_ec_timeout_set); - -/* - * These {i,o}bf_status functions return whether the buffers are full or not. - */ - -static inline unsigned int ibf_status(unsigned int port) -{ - return !!(inb(port) & 0x02); -} - -static inline unsigned int obf_status(unsigned int port) -{ - return inb(port) & 0x01; -} - -#define wait_on_ibf(p, d) __wait_on_ibf(__LINE__, (p), (d)) -static int __wait_on_ibf(unsigned int line, unsigned int port, int desired) -{ - unsigned int timeo; - int state = ibf_status(port); - - for (timeo = ec_timeout; state != desired && timeo; timeo--) { - mdelay(1); - state = ibf_status(port); - } - - if ((state == desired) && (ec_timeout > EC_BASE_TIMEOUT) && - timeo < (ec_timeout - EC_BASE_TIMEOUT)) { - printk(KERN_WARNING "olpc-ec: %d: waited %u ms for IBF!\n", - line, ec_timeout - timeo); - } - - return !(state == desired); -} - -#define wait_on_obf(p, d) __wait_on_obf(__LINE__, (p), (d)) -static int __wait_on_obf(unsigned int line, unsigned int port, int desired) -{ - unsigned int timeo; - int state = obf_status(port); - - for (timeo = ec_timeout; state != desired && timeo; timeo--) { - mdelay(1); - state = obf_status(port); - } - - if ((state == desired) && (ec_timeout > EC_BASE_TIMEOUT) && - timeo < (ec_timeout - EC_BASE_TIMEOUT)) { - printk(KERN_WARNING "olpc-ec: %d: waited %u ms for OBF!\n", - line, ec_timeout - timeo); - } - - return !(state == desired); -} - -/* - * This allows the kernel to run Embedded Controller commands. The EC is - * documented at , and the - * available EC commands are here: - * . Unfortunately, while - * OpenFirmware's source is available, the EC's is not. - */ -int olpc_ec_cmd(unsigned char cmd, unsigned char *inbuf, size_t inlen, - unsigned char *outbuf, size_t outlen) -{ - unsigned long flags; - int ret = -EIO; - int i; - int restarts = 0; - - spin_lock_irqsave(&ec_lock, flags); - - /* Clear OBF */ - for (i = 0; i < 10 && (obf_status(0x6c) == 1); i++) - inb(0x68); - if (i == 10) { - printk(KERN_ERR "olpc-ec: timeout while attempting to " - "clear OBF flag!\n"); - goto err; - } - - if (wait_on_ibf(0x6c, 0)) { - printk(KERN_ERR "olpc-ec: timeout waiting for EC to " - "quiesce!\n"); - goto err; - } - -restart: - /* - * Note that if we time out during any IBF checks, that's a failure; - * we have to return. There's no way for the kernel to clear that. - * - * If we time out during an OBF check, we can restart the command; - * reissuing it will clear the OBF flag, and we should be alright. - * The OBF flag will sometimes misbehave due to what we believe - * is a hardware quirk.. 
- */ - pr_devel("olpc-ec: running cmd 0x%x\n", cmd); - outb(cmd, 0x6c); - - if (wait_on_ibf(0x6c, 0)) { - printk(KERN_ERR "olpc-ec: timeout waiting for EC to read " - "command!\n"); - goto err; - } - - if (inbuf && inlen) { - /* write data to EC */ - for (i = 0; i < inlen; i++) { - if (wait_on_ibf(0x6c, 0)) { - printk(KERN_ERR "olpc-ec: timeout waiting for" - " EC accept data!\n"); - goto err; - } - pr_devel("olpc-ec: sending cmd arg 0x%x\n", inbuf[i]); - outb(inbuf[i], 0x68); - } - } - if (outbuf && outlen) { - /* read data from EC */ - for (i = 0; i < outlen; i++) { - if (wait_on_obf(0x6c, 1)) { - printk(KERN_ERR "olpc-ec: timeout waiting for" - " EC to provide data!\n"); - if (restarts++ < 10) - goto restart; - goto err; - } - outbuf[i] = inb(0x68); - pr_devel("olpc-ec: received 0x%x\n", outbuf[i]); - } - } - - ret = 0; -err: - spin_unlock_irqrestore(&ec_lock, flags); - return ret; -} -EXPORT_SYMBOL_GPL(olpc_ec_cmd); - -static bool __init check_ofw_architecture(void) -{ - size_t propsize; - char olpc_arch[5]; - const void *args[] = { NULL, "architecture", olpc_arch, (void *)5 }; - void *res[] = { &propsize }; - - if (olpc_ofw("getprop", args, res)) { - printk(KERN_ERR "ofw: getprop call failed!\n"); - return false; - } - return propsize == 5 && strncmp("OLPC", olpc_arch, 5) == 0; -} - -static u32 __init get_board_revision(void) -{ - size_t propsize; - __be32 rev; - const void *args[] = { NULL, "board-revision-int", &rev, (void *)4 }; - void *res[] = { &propsize }; - - if (olpc_ofw("getprop", args, res) || propsize != 4) { - printk(KERN_ERR "ofw: getprop call failed!\n"); - return cpu_to_be32(0); - } - return be32_to_cpu(rev); -} - -static bool __init platform_detect(void) -{ - if (!check_ofw_architecture()) - return false; - olpc_platform_info.flags |= OLPC_F_PRESENT; - olpc_platform_info.boardrev = get_board_revision(); - return true; -} - -static int __init add_xo1_platform_devices(void) -{ - struct platform_device *pdev; - - pdev = platform_device_register_simple("xo1-rfkill", -1, NULL, 0); - if (IS_ERR(pdev)) - return PTR_ERR(pdev); - - pdev = platform_device_register_simple("olpc-xo1", -1, NULL, 0); - if (IS_ERR(pdev)) - return PTR_ERR(pdev); - - return 0; -} - -static int __init olpc_init(void) -{ - int r = 0; - - if (!olpc_ofw_present() || !platform_detect()) - return 0; - - spin_lock_init(&ec_lock); - - /* assume B1 and above models always have a DCON */ - if (olpc_board_at_least(olpc_board(0xb1))) - olpc_platform_info.flags |= OLPC_F_DCON; - - /* get the EC revision */ - olpc_ec_cmd(EC_FIRMWARE_REV, NULL, 0, - (unsigned char *) &olpc_platform_info.ecver, 1); - -#ifdef CONFIG_PCI_OLPC - /* If the VSA exists let it emulate PCI, if not emulate in kernel. - * XO-1 only. */ - if (olpc_platform_info.boardrev < olpc_board_pre(0xd0) && - !cs5535_has_vsa2()) - x86_init.pci.arch_init = pci_olpc_init; -#endif - - printk(KERN_INFO "OLPC board revision %s%X (EC=%x)\n", - ((olpc_platform_info.boardrev & 0xf) < 8) ? 
"pre" : "", - olpc_platform_info.boardrev >> 4, - olpc_platform_info.ecver); - - if (olpc_platform_info.boardrev < olpc_board_pre(0xd0)) { /* XO-1 */ - r = add_xo1_platform_devices(); - if (r) - return r; - } - - return 0; -} - -postcore_initcall(olpc_init); diff --git a/arch/x86/kernel/olpc_ofw.c b/arch/x86/kernel/olpc_ofw.c deleted file mode 100644 index 78732046437..00000000000 --- a/arch/x86/kernel/olpc_ofw.c +++ /dev/null @@ -1,112 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include - -/* address of OFW callback interface; will be NULL if OFW isn't found */ -static int (*olpc_ofw_cif)(int *); - -/* page dir entry containing OFW's pgdir table; filled in by head_32.S */ -u32 olpc_ofw_pgd __initdata; - -static DEFINE_SPINLOCK(ofw_lock); - -#define MAXARGS 10 - -void __init setup_olpc_ofw_pgd(void) -{ - pgd_t *base, *ofw_pde; - - if (!olpc_ofw_cif) - return; - - /* fetch OFW's PDE */ - base = early_ioremap(olpc_ofw_pgd, sizeof(olpc_ofw_pgd) * PTRS_PER_PGD); - if (!base) { - printk(KERN_ERR "failed to remap OFW's pgd - disabling OFW!\n"); - olpc_ofw_cif = NULL; - return; - } - ofw_pde = &base[OLPC_OFW_PDE_NR]; - - /* install OFW's PDE permanently into the kernel's pgtable */ - set_pgd(&swapper_pg_dir[OLPC_OFW_PDE_NR], *ofw_pde); - /* implicit optimization barrier here due to uninline function return */ - - early_iounmap(base, sizeof(olpc_ofw_pgd) * PTRS_PER_PGD); -} - -int __olpc_ofw(const char *name, int nr_args, const void **args, int nr_res, - void **res) -{ - int ofw_args[MAXARGS + 3]; - unsigned long flags; - int ret, i, *p; - - BUG_ON(nr_args + nr_res > MAXARGS); - - if (!olpc_ofw_cif) - return -EIO; - - ofw_args[0] = (int)name; - ofw_args[1] = nr_args; - ofw_args[2] = nr_res; - - p = &ofw_args[3]; - for (i = 0; i < nr_args; i++, p++) - *p = (int)args[i]; - - /* call into ofw */ - spin_lock_irqsave(&ofw_lock, flags); - ret = olpc_ofw_cif(ofw_args); - spin_unlock_irqrestore(&ofw_lock, flags); - - if (!ret) { - for (i = 0; i < nr_res; i++, p++) - *((int *)res[i]) = *p; - } - - return ret; -} -EXPORT_SYMBOL_GPL(__olpc_ofw); - -bool olpc_ofw_present(void) -{ - return olpc_ofw_cif != NULL; -} -EXPORT_SYMBOL_GPL(olpc_ofw_present); - -/* OFW cif _should_ be above this address */ -#define OFW_MIN 0xff000000 - -/* OFW starts on a 1MB boundary */ -#define OFW_BOUND (1<<20) - -void __init olpc_ofw_detect(void) -{ - struct olpc_ofw_header *hdr = &boot_params.olpc_ofw_header; - unsigned long start; - - /* ensure OFW booted us by checking for "OFW " string */ - if (hdr->ofw_magic != OLPC_OFW_SIG) - return; - - olpc_ofw_cif = (int (*)(int *))hdr->cif_handler; - - if ((unsigned long)olpc_ofw_cif < OFW_MIN) { - printk(KERN_ERR "OFW detected, but cif has invalid address 0x%lx - disabling.\n", - (unsigned long)olpc_ofw_cif); - olpc_ofw_cif = NULL; - return; - } - - /* determine where OFW starts in memory */ - start = round_down((unsigned long)olpc_ofw_cif, OFW_BOUND); - printk(KERN_INFO "OFW detected in memory, cif @ 0x%lx (reserving top %ldMB)\n", - (unsigned long)olpc_ofw_cif, (-start) >> 20); - reserve_top_address(-start); -} diff --git a/arch/x86/platform/Makefile b/arch/x86/platform/Makefile index 8519b01f1ac..7bf70b812fa 100644 --- a/arch/x86/platform/Makefile +++ b/arch/x86/platform/Makefile @@ -1,6 +1,7 @@ # Platform specific code goes here obj-y += efi/ obj-y += mrst/ +obj-y += olpc/ obj-y += scx200/ obj-y += sfi/ obj-y += visws/ diff --git a/arch/x86/platform/olpc/Makefile b/arch/x86/platform/olpc/Makefile new file mode 100644 index 00000000000..c31b8fcb5a8 
--- /dev/null +++ b/arch/x86/platform/olpc/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_OLPC) += olpc.o +obj-$(CONFIG_OLPC_XO1) += olpc-xo1.o +obj-$(CONFIG_OLPC_OPENFIRMWARE) += olpc_ofw.o diff --git a/arch/x86/platform/olpc/olpc-xo1.c b/arch/x86/platform/olpc/olpc-xo1.c new file mode 100644 index 00000000000..f5442c03abc --- /dev/null +++ b/arch/x86/platform/olpc/olpc-xo1.c @@ -0,0 +1,140 @@ +/* + * Support for features of the OLPC XO-1 laptop + * + * Copyright (C) 2010 One Laptop per Child + * Copyright (C) 2006 Red Hat, Inc. + * Copyright (C) 2006 Advanced Micro Devices, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include +#include +#include +#include +#include + +#include +#include + +#define DRV_NAME "olpc-xo1" + +#define PMS_BAR 4 +#define ACPI_BAR 5 + +/* PMC registers (PMS block) */ +#define PM_SCLK 0x10 +#define PM_IN_SLPCTL 0x20 +#define PM_WKXD 0x34 +#define PM_WKD 0x30 +#define PM_SSC 0x54 + +/* PM registers (ACPI block) */ +#define PM1_CNT 0x08 +#define PM_GPE0_STS 0x18 + +static unsigned long acpi_base; +static unsigned long pms_base; + +static void xo1_power_off(void) +{ + printk(KERN_INFO "OLPC XO-1 power off sequence...\n"); + + /* Enable all of these controls with 0 delay */ + outl(0x40000000, pms_base + PM_SCLK); + outl(0x40000000, pms_base + PM_IN_SLPCTL); + outl(0x40000000, pms_base + PM_WKXD); + outl(0x40000000, pms_base + PM_WKD); + + /* Clear status bits (possibly unnecessary) */ + outl(0x0002ffff, pms_base + PM_SSC); + outl(0xffffffff, acpi_base + PM_GPE0_STS); + + /* Write SLP_EN bit to start the machinery */ + outl(0x00002000, acpi_base + PM1_CNT); +} + +/* Read the base addresses from the PCI BAR info */ +static int __devinit setup_bases(struct pci_dev *pdev) +{ + int r; + + r = pci_enable_device_io(pdev); + if (r) { + dev_err(&pdev->dev, "can't enable device IO\n"); + return r; + } + + r = pci_request_region(pdev, ACPI_BAR, DRV_NAME); + if (r) { + dev_err(&pdev->dev, "can't alloc PCI BAR #%d\n", ACPI_BAR); + return r; + } + + r = pci_request_region(pdev, PMS_BAR, DRV_NAME); + if (r) { + dev_err(&pdev->dev, "can't alloc PCI BAR #%d\n", PMS_BAR); + pci_release_region(pdev, ACPI_BAR); + return r; + } + + acpi_base = pci_resource_start(pdev, ACPI_BAR); + pms_base = pci_resource_start(pdev, PMS_BAR); + + return 0; +} + +static int __devinit olpc_xo1_probe(struct platform_device *pdev) +{ + struct pci_dev *pcidev; + int r; + + pcidev = pci_get_device(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA, + NULL); + if (!pdev) + return -ENODEV; + + r = setup_bases(pcidev); + if (r) + return r; + + pm_power_off = xo1_power_off; + + printk(KERN_INFO "OLPC XO-1 support registered\n"); + return 0; +} + +static int __devexit olpc_xo1_remove(struct platform_device *pdev) +{ + pm_power_off = NULL; + return 0; +} + +static struct platform_driver olpc_xo1_driver = { + .driver = { + .name = DRV_NAME, + .owner = THIS_MODULE, + }, + .probe = olpc_xo1_probe, + .remove = __devexit_p(olpc_xo1_remove), +}; + +static int __init olpc_xo1_init(void) +{ + return platform_driver_register(&olpc_xo1_driver); +} + +static void __exit olpc_xo1_exit(void) +{ + platform_driver_unregister(&olpc_xo1_driver); +} + +MODULE_AUTHOR("Daniel Drake "); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:olpc-xo1"); + +module_init(olpc_xo1_init); +module_exit(olpc_xo1_exit); diff --git 
a/arch/x86/platform/olpc/olpc.c b/arch/x86/platform/olpc/olpc.c new file mode 100644 index 00000000000..edaf3fe8dc5 --- /dev/null +++ b/arch/x86/platform/olpc/olpc.c @@ -0,0 +1,281 @@ +/* + * Support for the OLPC DCON and OLPC EC access + * + * Copyright © 2006 Advanced Micro Devices, Inc. + * Copyright © 2007-2008 Andres Salomon + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +struct olpc_platform_t olpc_platform_info; +EXPORT_SYMBOL_GPL(olpc_platform_info); + +static DEFINE_SPINLOCK(ec_lock); + +/* what the timeout *should* be (in ms) */ +#define EC_BASE_TIMEOUT 20 + +/* the timeout that bugs in the EC might force us to actually use */ +static int ec_timeout = EC_BASE_TIMEOUT; + +static int __init olpc_ec_timeout_set(char *str) +{ + if (get_option(&str, &ec_timeout) != 1) { + ec_timeout = EC_BASE_TIMEOUT; + printk(KERN_ERR "olpc-ec: invalid argument to " + "'olpc_ec_timeout=', ignoring!\n"); + } + printk(KERN_DEBUG "olpc-ec: using %d ms delay for EC commands.\n", + ec_timeout); + return 1; +} +__setup("olpc_ec_timeout=", olpc_ec_timeout_set); + +/* + * These {i,o}bf_status functions return whether the buffers are full or not. + */ + +static inline unsigned int ibf_status(unsigned int port) +{ + return !!(inb(port) & 0x02); +} + +static inline unsigned int obf_status(unsigned int port) +{ + return inb(port) & 0x01; +} + +#define wait_on_ibf(p, d) __wait_on_ibf(__LINE__, (p), (d)) +static int __wait_on_ibf(unsigned int line, unsigned int port, int desired) +{ + unsigned int timeo; + int state = ibf_status(port); + + for (timeo = ec_timeout; state != desired && timeo; timeo--) { + mdelay(1); + state = ibf_status(port); + } + + if ((state == desired) && (ec_timeout > EC_BASE_TIMEOUT) && + timeo < (ec_timeout - EC_BASE_TIMEOUT)) { + printk(KERN_WARNING "olpc-ec: %d: waited %u ms for IBF!\n", + line, ec_timeout - timeo); + } + + return !(state == desired); +} + +#define wait_on_obf(p, d) __wait_on_obf(__LINE__, (p), (d)) +static int __wait_on_obf(unsigned int line, unsigned int port, int desired) +{ + unsigned int timeo; + int state = obf_status(port); + + for (timeo = ec_timeout; state != desired && timeo; timeo--) { + mdelay(1); + state = obf_status(port); + } + + if ((state == desired) && (ec_timeout > EC_BASE_TIMEOUT) && + timeo < (ec_timeout - EC_BASE_TIMEOUT)) { + printk(KERN_WARNING "olpc-ec: %d: waited %u ms for OBF!\n", + line, ec_timeout - timeo); + } + + return !(state == desired); +} + +/* + * This allows the kernel to run Embedded Controller commands. The EC is + * documented at , and the + * available EC commands are here: + * . Unfortunately, while + * OpenFirmware's source is available, the EC's is not. 
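olpc_ec_cmd() is the interface the rest of the kernel uses to issue EC commands; olpc_init() further down calls it to read the EC firmware revision into olpc_platform_info.ecver. A hedged sketch of such a call, with a stub standing in for the real kernel function and a hypothetical command number:

#include <stddef.h>
#include <stdio.h>

/* Stub standing in for the kernel's olpc_ec_cmd(); real code links against it. */
static int olpc_ec_cmd(unsigned char cmd, unsigned char *inbuf, size_t inlen,
		       unsigned char *outbuf, size_t outlen)
{
	(void)cmd; (void)inbuf; (void)inlen;
	if (outbuf && outlen)
		outbuf[0] = 0x42;	/* pretend EC firmware revision */
	return 0;
}

#define EC_FIRMWARE_REV	0x08	/* hypothetical command number */

int main(void)
{
	unsigned char ecver = 0;

	/* No input bytes, one output byte: the firmware revision. */
	if (olpc_ec_cmd(EC_FIRMWARE_REV, NULL, 0, &ecver, 1) == 0)
		printf("EC firmware revision: %x\n", ecver);
	return 0;
}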
+ */ +int olpc_ec_cmd(unsigned char cmd, unsigned char *inbuf, size_t inlen, + unsigned char *outbuf, size_t outlen) +{ + unsigned long flags; + int ret = -EIO; + int i; + int restarts = 0; + + spin_lock_irqsave(&ec_lock, flags); + + /* Clear OBF */ + for (i = 0; i < 10 && (obf_status(0x6c) == 1); i++) + inb(0x68); + if (i == 10) { + printk(KERN_ERR "olpc-ec: timeout while attempting to " + "clear OBF flag!\n"); + goto err; + } + + if (wait_on_ibf(0x6c, 0)) { + printk(KERN_ERR "olpc-ec: timeout waiting for EC to " + "quiesce!\n"); + goto err; + } + +restart: + /* + * Note that if we time out during any IBF checks, that's a failure; + * we have to return. There's no way for the kernel to clear that. + * + * If we time out during an OBF check, we can restart the command; + * reissuing it will clear the OBF flag, and we should be alright. + * The OBF flag will sometimes misbehave due to what we believe + * is a hardware quirk.. + */ + pr_devel("olpc-ec: running cmd 0x%x\n", cmd); + outb(cmd, 0x6c); + + if (wait_on_ibf(0x6c, 0)) { + printk(KERN_ERR "olpc-ec: timeout waiting for EC to read " + "command!\n"); + goto err; + } + + if (inbuf && inlen) { + /* write data to EC */ + for (i = 0; i < inlen; i++) { + if (wait_on_ibf(0x6c, 0)) { + printk(KERN_ERR "olpc-ec: timeout waiting for" + " EC accept data!\n"); + goto err; + } + pr_devel("olpc-ec: sending cmd arg 0x%x\n", inbuf[i]); + outb(inbuf[i], 0x68); + } + } + if (outbuf && outlen) { + /* read data from EC */ + for (i = 0; i < outlen; i++) { + if (wait_on_obf(0x6c, 1)) { + printk(KERN_ERR "olpc-ec: timeout waiting for" + " EC to provide data!\n"); + if (restarts++ < 10) + goto restart; + goto err; + } + outbuf[i] = inb(0x68); + pr_devel("olpc-ec: received 0x%x\n", outbuf[i]); + } + } + + ret = 0; +err: + spin_unlock_irqrestore(&ec_lock, flags); + return ret; +} +EXPORT_SYMBOL_GPL(olpc_ec_cmd); + +static bool __init check_ofw_architecture(void) +{ + size_t propsize; + char olpc_arch[5]; + const void *args[] = { NULL, "architecture", olpc_arch, (void *)5 }; + void *res[] = { &propsize }; + + if (olpc_ofw("getprop", args, res)) { + printk(KERN_ERR "ofw: getprop call failed!\n"); + return false; + } + return propsize == 5 && strncmp("OLPC", olpc_arch, 5) == 0; +} + +static u32 __init get_board_revision(void) +{ + size_t propsize; + __be32 rev; + const void *args[] = { NULL, "board-revision-int", &rev, (void *)4 }; + void *res[] = { &propsize }; + + if (olpc_ofw("getprop", args, res) || propsize != 4) { + printk(KERN_ERR "ofw: getprop call failed!\n"); + return cpu_to_be32(0); + } + return be32_to_cpu(rev); +} + +static bool __init platform_detect(void) +{ + if (!check_ofw_architecture()) + return false; + olpc_platform_info.flags |= OLPC_F_PRESENT; + olpc_platform_info.boardrev = get_board_revision(); + return true; +} + +static int __init add_xo1_platform_devices(void) +{ + struct platform_device *pdev; + + pdev = platform_device_register_simple("xo1-rfkill", -1, NULL, 0); + if (IS_ERR(pdev)) + return PTR_ERR(pdev); + + pdev = platform_device_register_simple("olpc-xo1", -1, NULL, 0); + if (IS_ERR(pdev)) + return PTR_ERR(pdev); + + return 0; +} + +static int __init olpc_init(void) +{ + int r = 0; + + if (!olpc_ofw_present() || !platform_detect()) + return 0; + + spin_lock_init(&ec_lock); + + /* assume B1 and above models always have a DCON */ + if (olpc_board_at_least(olpc_board(0xb1))) + olpc_platform_info.flags |= OLPC_F_DCON; + + /* get the EC revision */ + olpc_ec_cmd(EC_FIRMWARE_REV, NULL, 0, + (unsigned char *) &olpc_platform_info.ecver, 
1); + +#ifdef CONFIG_PCI_OLPC + /* If the VSA exists let it emulate PCI, if not emulate in kernel. + * XO-1 only. */ + if (olpc_platform_info.boardrev < olpc_board_pre(0xd0) && + !cs5535_has_vsa2()) + x86_init.pci.arch_init = pci_olpc_init; +#endif + + printk(KERN_INFO "OLPC board revision %s%X (EC=%x)\n", + ((olpc_platform_info.boardrev & 0xf) < 8) ? "pre" : "", + olpc_platform_info.boardrev >> 4, + olpc_platform_info.ecver); + + if (olpc_platform_info.boardrev < olpc_board_pre(0xd0)) { /* XO-1 */ + r = add_xo1_platform_devices(); + if (r) + return r; + } + + return 0; +} + +postcore_initcall(olpc_init); diff --git a/arch/x86/platform/olpc/olpc_ofw.c b/arch/x86/platform/olpc/olpc_ofw.c new file mode 100644 index 00000000000..78732046437 --- /dev/null +++ b/arch/x86/platform/olpc/olpc_ofw.c @@ -0,0 +1,112 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +/* address of OFW callback interface; will be NULL if OFW isn't found */ +static int (*olpc_ofw_cif)(int *); + +/* page dir entry containing OFW's pgdir table; filled in by head_32.S */ +u32 olpc_ofw_pgd __initdata; + +static DEFINE_SPINLOCK(ofw_lock); + +#define MAXARGS 10 + +void __init setup_olpc_ofw_pgd(void) +{ + pgd_t *base, *ofw_pde; + + if (!olpc_ofw_cif) + return; + + /* fetch OFW's PDE */ + base = early_ioremap(olpc_ofw_pgd, sizeof(olpc_ofw_pgd) * PTRS_PER_PGD); + if (!base) { + printk(KERN_ERR "failed to remap OFW's pgd - disabling OFW!\n"); + olpc_ofw_cif = NULL; + return; + } + ofw_pde = &base[OLPC_OFW_PDE_NR]; + + /* install OFW's PDE permanently into the kernel's pgtable */ + set_pgd(&swapper_pg_dir[OLPC_OFW_PDE_NR], *ofw_pde); + /* implicit optimization barrier here due to uninline function return */ + + early_iounmap(base, sizeof(olpc_ofw_pgd) * PTRS_PER_PGD); +} + +int __olpc_ofw(const char *name, int nr_args, const void **args, int nr_res, + void **res) +{ + int ofw_args[MAXARGS + 3]; + unsigned long flags; + int ret, i, *p; + + BUG_ON(nr_args + nr_res > MAXARGS); + + if (!olpc_ofw_cif) + return -EIO; + + ofw_args[0] = (int)name; + ofw_args[1] = nr_args; + ofw_args[2] = nr_res; + + p = &ofw_args[3]; + for (i = 0; i < nr_args; i++, p++) + *p = (int)args[i]; + + /* call into ofw */ + spin_lock_irqsave(&ofw_lock, flags); + ret = olpc_ofw_cif(ofw_args); + spin_unlock_irqrestore(&ofw_lock, flags); + + if (!ret) { + for (i = 0; i < nr_res; i++, p++) + *((int *)res[i]) = *p; + } + + return ret; +} +EXPORT_SYMBOL_GPL(__olpc_ofw); + +bool olpc_ofw_present(void) +{ + return olpc_ofw_cif != NULL; +} +EXPORT_SYMBOL_GPL(olpc_ofw_present); + +/* OFW cif _should_ be above this address */ +#define OFW_MIN 0xff000000 + +/* OFW starts on a 1MB boundary */ +#define OFW_BOUND (1<<20) + +void __init olpc_ofw_detect(void) +{ + struct olpc_ofw_header *hdr = &boot_params.olpc_ofw_header; + unsigned long start; + + /* ensure OFW booted us by checking for "OFW " string */ + if (hdr->ofw_magic != OLPC_OFW_SIG) + return; + + olpc_ofw_cif = (int (*)(int *))hdr->cif_handler; + + if ((unsigned long)olpc_ofw_cif < OFW_MIN) { + printk(KERN_ERR "OFW detected, but cif has invalid address 0x%lx - disabling.\n", + (unsigned long)olpc_ofw_cif); + olpc_ofw_cif = NULL; + return; + } + + /* determine where OFW starts in memory */ + start = round_down((unsigned long)olpc_ofw_cif, OFW_BOUND); + printk(KERN_INFO "OFW detected in memory, cif @ 0x%lx (reserving top %ldMB)\n", + (unsigned long)olpc_ofw_cif, (-start) >> 20); + reserve_top_address(-start); +} -- cgit v1.2.3-18-g5258 From 22d4cd4c4dce6d7b7d9a7e396aa4f87fe7a649b1 Mon 
Sep 17 00:00:00 2001 From: Brian Gerst Date: Wed, 27 Oct 2010 01:43:02 -0400 Subject: x86-32: Allocate irq stacks seperate from percpu area The percpu allocator cannot handle alignments larger than one page. Allocate the irq stacks seperately, and only keep the pointers as percpu data. Signed-off-by: Brian Gerst Acked-by: Linus Torvalds Cc: tj@kernel.org LKML-Reference: <1288158182-1753-1-git-send-email-brgerst@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/irq.h | 2 -- arch/x86/kernel/irq_32.c | 12 ++---------- arch/x86/kernel/smpboot.c | 1 - 3 files changed, 2 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index 0bf5b008365..13b0ebaa512 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -21,10 +21,8 @@ static inline int irq_canonicalize(int irq) #ifdef CONFIG_X86_32 extern void irq_ctx_init(int cpu); -extern void irq_ctx_exit(int cpu); #else # define irq_ctx_init(cpu) do { } while (0) -# define irq_ctx_exit(cpu) do { } while (0) #endif #define __ARCH_HAS_DO_SOFTIRQ diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 50fbbe60e50..64668dbf00a 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -60,9 +60,6 @@ union irq_ctx { static DEFINE_PER_CPU(union irq_ctx *, hardirq_ctx); static DEFINE_PER_CPU(union irq_ctx *, softirq_ctx); -static DEFINE_PER_CPU_MULTIPAGE_ALIGNED(union irq_ctx, hardirq_stack, THREAD_SIZE); -static DEFINE_PER_CPU_MULTIPAGE_ALIGNED(union irq_ctx, softirq_stack, THREAD_SIZE); - static void call_on_stack(void *func, void *stack) { asm volatile("xchgl %%ebx,%%esp \n" @@ -128,7 +125,7 @@ void __cpuinit irq_ctx_init(int cpu) if (per_cpu(hardirq_ctx, cpu)) return; - irqctx = &per_cpu(hardirq_stack, cpu); + irqctx = (union irq_ctx *)__get_free_pages(THREAD_FLAGS, THREAD_ORDER); irqctx->tinfo.task = NULL; irqctx->tinfo.exec_domain = NULL; irqctx->tinfo.cpu = cpu; @@ -137,7 +134,7 @@ void __cpuinit irq_ctx_init(int cpu) per_cpu(hardirq_ctx, cpu) = irqctx; - irqctx = &per_cpu(softirq_stack, cpu); + irqctx = (union irq_ctx *)__get_free_pages(THREAD_FLAGS, THREAD_ORDER); irqctx->tinfo.task = NULL; irqctx->tinfo.exec_domain = NULL; irqctx->tinfo.cpu = cpu; @@ -150,11 +147,6 @@ void __cpuinit irq_ctx_init(int cpu) cpu, per_cpu(hardirq_ctx, cpu), per_cpu(softirq_ctx, cpu)); } -void irq_ctx_exit(int cpu) -{ - per_cpu(hardirq_ctx, cpu) = NULL; -} - asmlinkage void do_softirq(void) { unsigned long flags; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 6af118511b4..90baf567bbf 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1373,7 +1373,6 @@ void play_dead_common(void) { idle_task_exit(); reset_lazy_tlbstate(); - irq_ctx_exit(raw_smp_processor_id()); c1e_remove_cpu(raw_smp_processor_id()); mb(); -- cgit v1.2.3-18-g5258 From 20273941f2129aa5a432796d98a276ed73d60782 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 27 Oct 2010 15:32:58 -0700 Subject: mm: fix race in kunmap_atomic() Christoph reported a nice splat which illustrated a race in the new stack based kmap_atomic implementation. The problem is that we pop our stack slot before we're completely done resetting its state -- in particular clearing the PTE (sometimes that's CONFIG_DEBUG_HIGHMEM). If an interrupt happens before we actually clear the PTE used for the last slot, that interrupt can reuse the slot in a dirty state, which triggers a BUG in kmap_atomic(). 
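As an illustration only (not part of the patch; names follow the hunks below, with locking and CONFIG_DEBUG_HIGHMEM checks left out), the racy ordering looks roughly like this:

    /* old __kunmap_atomic() ordering */
    type = kmap_atomic_idx_pop();       /* slot is released here ...        */
    idx  = type + KM_TYPE_NR * smp_processor_id();
                                        /* ... an IRQ can now kmap_atomic() */
                                        /* and be handed the same slot      */
    kpte_clear_flush(kmap_pte - idx, vaddr);  /* tears down the live mapping */
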
Fix this by introducing kmap_atomic_idx() which reports the current slot index without actually releasing it and use that to find the PTE and delay the _pop() until after we're completely done. Signed-off-by: Peter Zijlstra Reported-by: Christoph Hellwig Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/highmem_32.c | 3 ++- arch/x86/mm/iomap_32.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index d723e369003..b4996266210 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -74,7 +74,7 @@ void __kunmap_atomic(void *kvaddr) vaddr <= __fix_to_virt(FIX_KMAP_BEGIN)) { int idx, type; - type = kmap_atomic_idx_pop(); + type = kmap_atomic_idx(); idx = type + KM_TYPE_NR * smp_processor_id(); #ifdef CONFIG_DEBUG_HIGHMEM @@ -87,6 +87,7 @@ void __kunmap_atomic(void *kvaddr) * attributes or becomes a protected page in a hypervisor. */ kpte_clear_flush(kmap_pte-idx, vaddr); + kmap_atomic_idx_pop(); } #ifdef CONFIG_DEBUG_HIGHMEM else { diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c index 75a3d7f24a2..7b179b499fa 100644 --- a/arch/x86/mm/iomap_32.c +++ b/arch/x86/mm/iomap_32.c @@ -98,7 +98,7 @@ iounmap_atomic(void __iomem *kvaddr) vaddr <= __fix_to_virt(FIX_KMAP_BEGIN)) { int idx, type; - type = kmap_atomic_idx_pop(); + type = kmap_atomic_idx(); idx = type + KM_TYPE_NR * smp_processor_id(); #ifdef CONFIG_DEBUG_HIGHMEM @@ -111,6 +111,7 @@ iounmap_atomic(void __iomem *kvaddr) * attributes or becomes a protected page in a hypervisor. */ kpte_clear_flush(kmap_pte-idx, vaddr); + kmap_atomic_idx_pop(); } pagefault_enable(); -- cgit v1.2.3-18-g5258 From 9b05a69e0534ec70bc94921936ffa05b330507cb Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 27 Oct 2010 15:33:47 -0700 Subject: ptrace: change signature of arch_ptrace() Fix up the arguments to arch_ptrace() to take account of the fact that @addr and @data are now unsigned long rather than long as of a preceding patch in this series. Signed-off-by: Namhyung Kim Cc: Acked-by: Roland McGrath Acked-by: David Howells Acked-by: Geert Uytterhoeven Acked-by: David S. Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/ptrace.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 70c4872cd8a..1a7ca045920 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -801,7 +801,8 @@ void ptrace_disable(struct task_struct *child) static const struct user_regset_view user_x86_32_view; /* Initialized below. 
*/ #endif -long arch_ptrace(struct task_struct *child, long request, long addr, long data) +long arch_ptrace(struct task_struct *child, long request, + unsigned long addr, unsigned long data) { int ret; unsigned long __user *datap = (unsigned long __user *)data; @@ -888,14 +889,14 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION case PTRACE_GET_THREAD_AREA: - if (addr < 0) + if ((int) addr < 0) return -EIO; ret = do_get_thread_area(child, addr, (struct user_desc __user *) data); break; case PTRACE_SET_THREAD_AREA: - if (addr < 0) + if ((int) addr < 0) return -EIO; ret = do_set_thread_area(child, addr, (struct user_desc __user *) data, 0); -- cgit v1.2.3-18-g5258 From eb5a3699311ba8ed22b7b38ceb3bb1411e438e2a Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 27 Oct 2010 15:33:48 -0700 Subject: ptrace: cleanup arch_ptrace() on x86 Remove checking @addr less than 0 because @addr is now unsigned and use new udescp variable in order to remove unnecessary castings. [akpm@linux-foundation.org: fix unused variable 'udescp'] Signed-off-by: Namhyung Kim Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/ptrace.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 1a7ca045920..45892dc4b72 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -813,8 +813,7 @@ long arch_ptrace(struct task_struct *child, long request, unsigned long tmp; ret = -EIO; - if ((addr & (sizeof(data) - 1)) || addr < 0 || - addr >= sizeof(struct user)) + if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user)) break; tmp = 0; /* Default return condition */ @@ -831,8 +830,7 @@ long arch_ptrace(struct task_struct *child, long request, case PTRACE_POKEUSR: /* write the word at location addr in the USER area */ ret = -EIO; - if ((addr & (sizeof(data) - 1)) || addr < 0 || - addr >= sizeof(struct user)) + if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user)) break; if (addr < sizeof(struct user_regs_struct)) @@ -892,14 +890,14 @@ long arch_ptrace(struct task_struct *child, long request, if ((int) addr < 0) return -EIO; ret = do_get_thread_area(child, addr, - (struct user_desc __user *) data); + (struct user_desc __user *)data); break; case PTRACE_SET_THREAD_AREA: if ((int) addr < 0) return -EIO; ret = do_set_thread_area(child, addr, - (struct user_desc __user *) data, 0); + (struct user_desc __user *)data, 0); break; #endif -- cgit v1.2.3-18-g5258 From 61d8e11e519ee7912ab59610fba1aaf08e3c1d84 Mon Sep 17 00:00:00 2001 From: Zimny Lech Date: Wed, 27 Oct 2010 15:34:53 -0700 Subject: Remove duplicate includes from many files Signed-off-by: Zimny Lech Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/init_64.c | 1 - arch/x86/xen/enlighten.c | 1 - 2 files changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 84346200e78..71a59296af8 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -51,7 +51,6 @@ #include #include #include -#include static int __init parse_direct_gbpages_off(char *arg) { diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 44ab12dc2a1..0cd12db0b14 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -59,7 +59,6 @@ #include #include #include -#include #include #include -- 
cgit v1.2.3-18-g5258 From 419db274bed4269f475a8e78cbe9c917192cfe8b Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 28 Oct 2010 09:50:17 -0700 Subject: x86, memblock: Fix early_node_mem with big reserved region. Xen can reserve huge amounts of memory for pre-ballooning, but that still shows as RAM in the e820 memory map. early_node_mem could not find range because of start/end adjusting, and will go through the fallback path. However, the fallback patch is still using memblock_x86_find_range_node(), and it is partially top-down because it go through active_range entries from low to high. Let's use memblock_find_in_range instead memblock_x86_find_range_node. So get real top down in fallback path. We may still need to make memblock_x86_find_range_node to do overall top_down work. Reported-by: Jeremy Fitzhardinge Tested-by: Jeremy Fitzhardinge Tested-by: Konrad Rzeszutek Wilk Signed-off-by: Yinghai Lu LKML-Reference: <4CC9A9C9.8020700@kernel.org> Signed-off-by: H. Peter Anvin --- arch/x86/mm/numa_64.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 60f498511dd..7ffc9b727ef 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -178,11 +178,8 @@ static void * __init early_node_mem(int nodeid, unsigned long start, /* extend the search scope */ end = max_pfn_mapped << PAGE_SHIFT; - if (end > (MAX_DMA32_PFN< Date: Thu, 28 Oct 2010 17:41:32 -0500 Subject: x86, uv: More Westmere support on SGI UV Enable Westmere support for all APIC modes on SGI UV. Signed-off-by: Russ Anderson LKML-Reference: <20101028224132.GB15804@sgi.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/apic/x2apic_uv_x.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 0a2918eaab3..ed4118de249 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -71,7 +71,7 @@ static int early_get_nodeid(void) return node_id.s.node_id; } -static int __init early_get_apic_pnode_shift(void) +static void __init early_get_apic_pnode_shift(void) { unsigned long *mmr; @@ -83,8 +83,6 @@ static int __init early_get_apic_pnode_shift(void) * Old bios, use default value */ uvh_apicid.s.pnode_shift = UV_APIC_PNODE_SHIFT; - - return uvh_apicid.s.pnode_shift; } static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) @@ -93,6 +91,7 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) if (!strcmp(oem_id, "SGI")) { nodeid = early_get_nodeid(); + early_get_apic_pnode_shift(); x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range; x86_platform.nmi_init = uv_nmi_init; if (!strcmp(oem_table_id, "UVL")) @@ -101,7 +100,7 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) uv_system_type = UV_X2APIC; else if (!strcmp(oem_table_id, "UVH")) { __get_cpu_var(x2apic_extra_bits) = - nodeid << (early_get_apic_pnode_shift() - 1); + nodeid << (uvh_apicid.s.pnode_shift - 1); uv_system_type = UV_NON_UNIQUE_APIC; return 1; } -- cgit v1.2.3-18-g5258 From 5c1eb08936693cd78c71164c8bea0b086ae72c67 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 28 Oct 2010 16:40:54 +0200 Subject: x86-32: Restore irq stacks NUMA-aware allocations Commit 22d4cd4c4d ("Allocate irq stacks seperate from percpu area") removed NUMA affinity of IRQ stacks as side-effect of the fix. 
Using alloc_pages_node() instead of __get_free_pages() is safe, even if the target node has no available LOWMEM pages : alloc_pages_node() fallbacks to another node. Signed-off-by: Eric Dumazet Acked-by: Brian Gerst Cc: tj@kernel.org Cc: torvalds@linux-foundation.org Cc: Peter Zijlstra LKML-Reference: <1288276854.2649.607.camel@edumazet-laptop> Signed-off-by: Ingo Molnar --- arch/x86/kernel/irq_32.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 64668dbf00a..96656f20775 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -17,6 +17,7 @@ #include #include #include +#include #include @@ -125,7 +126,9 @@ void __cpuinit irq_ctx_init(int cpu) if (per_cpu(hardirq_ctx, cpu)) return; - irqctx = (union irq_ctx *)__get_free_pages(THREAD_FLAGS, THREAD_ORDER); + irqctx = page_address(alloc_pages_node(cpu_to_node(cpu), + THREAD_FLAGS, + THREAD_ORDER)); irqctx->tinfo.task = NULL; irqctx->tinfo.exec_domain = NULL; irqctx->tinfo.cpu = cpu; @@ -134,7 +137,9 @@ void __cpuinit irq_ctx_init(int cpu) per_cpu(hardirq_ctx, cpu) = irqctx; - irqctx = (union irq_ctx *)__get_free_pages(THREAD_FLAGS, THREAD_ORDER); + irqctx = page_address(alloc_pages_node(cpu_to_node(cpu), + THREAD_FLAGS, + THREAD_ORDER)); irqctx->tinfo.task = NULL; irqctx->tinfo.exec_domain = NULL; irqctx->tinfo.cpu = cpu; -- cgit v1.2.3-18-g5258 From 2d1d7126bbde53989f1d7de174816c123bb7ecb0 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 27 Oct 2010 21:09:15 -0700 Subject: x86, ftrace: Use safe noops, drop trap test Always use a safe 5-byte noop sequence. Drop the trap test, since it is known to return false negatives on some virtualization platforms on 32 bits. The resulting code is both simpler and safer. Cc: Daniel Drake Cc: Jason Baron Cc: Mathieu Desnoyers Signed-off-by: H. Peter Anvin Signed-off-by: Steven Rostedt --- arch/x86/kernel/alternative.c | 69 ++++++++++--------------------------------- 1 file changed, 15 insertions(+), 54 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index a36bb90aef5..0b30214282a 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -644,65 +644,26 @@ void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len) #if defined(CONFIG_DYNAMIC_FTRACE) || defined(HAVE_JUMP_LABEL) -unsigned char ideal_nop5[IDEAL_NOP_SIZE_5]; +#ifdef CONFIG_X86_64 +unsigned char ideal_nop5[5] = { 0x66, 0x66, 0x66, 0x66, 0x90 }; +#else +unsigned char ideal_nop5[5] = { 0x3e, 0x8d, 0x74, 0x26, 0x00 }; +#endif void __init arch_init_ideal_nop5(void) { - extern const unsigned char ftrace_test_p6nop[]; - extern const unsigned char ftrace_test_nop5[]; - extern const unsigned char ftrace_test_jmp[]; - int faulted = 0; - /* - * There is no good nop for all x86 archs. - * We will default to using the P6_NOP5, but first we - * will test to make sure that the nop will actually - * work on this CPU. If it faults, we will then - * go to a lesser efficient 5 byte nop. If that fails - * we then just use a jmp as our nop. This isn't the most - * efficient nop, but we can not use a multi part nop - * since we would then risk being preempted in the middle - * of that nop, and if we enabled tracing then, it might - * cause a system crash. + * There is no good nop for all x86 archs. This selection + * algorithm should be unified with the one in find_nop_table(), + * but this should be good enough for now. 
* - * TODO: check the cpuid to determine the best nop. + * For cases other than the ones below, use the safe (as in + * always functional) defaults above. */ - asm volatile ( - "ftrace_test_jmp:" - "jmp ftrace_test_p6nop\n" - "nop\n" - "nop\n" - "nop\n" /* 2 byte jmp + 3 bytes */ - "ftrace_test_p6nop:" - P6_NOP5 - "jmp 1f\n" - "ftrace_test_nop5:" - ".byte 0x66,0x66,0x66,0x66,0x90\n" - "1:" - ".section .fixup, \"ax\"\n" - "2: movl $1, %0\n" - " jmp ftrace_test_nop5\n" - "3: movl $2, %0\n" - " jmp 1b\n" - ".previous\n" - _ASM_EXTABLE(ftrace_test_p6nop, 2b) - _ASM_EXTABLE(ftrace_test_nop5, 3b) - : "=r"(faulted) : "0" (faulted)); - - switch (faulted) { - case 0: - pr_info("converting mcount calls to 0f 1f 44 00 00\n"); - memcpy(ideal_nop5, ftrace_test_p6nop, IDEAL_NOP_SIZE_5); - break; - case 1: - pr_info("converting mcount calls to 66 66 66 66 90\n"); - memcpy(ideal_nop5, ftrace_test_nop5, IDEAL_NOP_SIZE_5); - break; - case 2: - pr_info("converting mcount calls to jmp . + 5\n"); - memcpy(ideal_nop5, ftrace_test_jmp, IDEAL_NOP_SIZE_5); - break; - } - +#ifdef CONFIG_X86_64 + /* Don't use these on 32 bits due to broken virtualizers */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + memcpy(ideal_nop5, p6_nops[5], 5); +#endif } #endif -- cgit v1.2.3-18-g5258 From d7ba979d45272385ce0fdf141d922e61ff48e07b Mon Sep 17 00:00:00 2001 From: Dongdong Deng Date: Wed, 18 Aug 2010 06:02:00 -0500 Subject: debug_core,x86,blackfin: Clean up hw debug disable API The kgdb_disable_hw_debug() was an architecture specific function for disabling all hardware breakpoints on a per cpu basis when entering the debug core. This patch will remove the weak function kdbg_disable_hw_debug() and change it into a call back which lives with the rest of hw breakpoint call backs in struct kgdb_arch. Signed-off-by: Dongdong Deng Signed-off-by: Jason Wessel --- arch/x86/kernel/kgdb.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index d81cfebb848..ec592caac4b 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -387,7 +387,7 @@ kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) * disable hardware debugging while it is processing gdb packets or * handling exception. */ -void kgdb_disable_hw_debug(struct pt_regs *regs) +static void kgdb_disable_hw_debug(struct pt_regs *regs) { int i; int cpu = raw_smp_processor_id(); @@ -724,6 +724,7 @@ struct kgdb_arch arch_kgdb_ops = { .flags = KGDB_HW_BREAKPOINT, .set_hw_breakpoint = kgdb_set_hw_break, .remove_hw_breakpoint = kgdb_remove_hw_break, + .disable_hw_break = kgdb_disable_hw_debug, .remove_all_hw_break = kgdb_remove_all_hw_break, .correct_hw_break = kgdb_correct_hw_break, }; -- cgit v1.2.3-18-g5258 From 45f81b1c96d9793e47ce925d257ea693ce0b193e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 29 Oct 2010 12:33:43 -0400 Subject: jump label: Add work around to i386 gcc asm goto bug On i386 (not x86_64) early implementations of gcc would have a bug with asm goto causing it to produce code like the following: (This was noticed by Peter Zijlstra) 56 pushl 0 67 nopl jmp 0x6f popl jmp 0x8c 6f mov test je 0x8c 8c mov call *(%esp) The jump added in the asm goto skipped over the popl that matched the pushl 0, which lead up to a quick crash of the system when the jump was enabled. The nopl is defined in the asm goto () statement and when tracepoints are enabled, the nop changes to a jump to the label that was specified by the asm goto. 
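As a stand-alone illustration of that pattern (a sketch only, assuming a gcc with asm goto support; the nop bytes and label names are placeholders rather than the kernel's actual jump label implementation):

    /* build with: gcc -O2 asmgoto.c */
    #include <stdio.h>

    static int maybe_trace(void)
    {
            /* 5-byte nop by default; a runtime patcher could rewrite it
             * into a jmp to the do_trace label, as tracepoints do. */
            asm goto("1:\n\t"
                     ".byte 0x0f, 0x1f, 0x44, 0x00, 0x00\n\t"
                     : : : : do_trace);
            return 0;
    do_trace:
            return 1;
    }

    int main(void)
    {
            printf("traced: %d\n", maybe_trace());
            return 0;
    }
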
asm goto is suppose to tell gcc that the code in the asm might jump to an external label. Here gcc obviously fails to make that work. The bug report for gcc is here: http://gcc.gnu.org/bugzilla/show_bug.cgi?id=46226 The bug only appears on x86 when not compiled with -maccumulate-outgoing-args. This option is always set on x86_64 and it is also the work around for a function graph tracer i386 bug. (See commit: 746357d6a526d6da9d89a2ec645b28406e959c2e) This explains why the bug only showed up on i386 when function graph tracer was not enabled. This patch now adds a CONFIG_JUMP_LABEL option that is default off instead of using jump labels by default. When jump labels are enabled, the -maccumulate-outgoing-args will be used (causing a slightly larger kernel image on i386). This option will exist until we have a way to detect if the gcc compiler in use is safe to use on all configurations without the work around. Note, there exists such a test, but for now we will keep the enabling of jump label as a manual option. Archs that know the compiler is safe with asm goto, may choose to select JUMP_LABEL and enable it by default. Reported-by: Ingo Molnar Cause-discovered-by: Peter Zijlstra Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Jason Baron Cc: H. Peter Anvin Cc: David Daney Cc: Mathieu Desnoyers Cc: Masami Hiramatsu Cc: David Miller Cc: Richard Henderson LKML-Reference: <1288028746.3673.11.camel@laptop> Signed-off-by: Steven Rostedt --- arch/x86/Makefile_32.cpu | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu index 1255d953c65..f2ee1abb1df 100644 --- a/arch/x86/Makefile_32.cpu +++ b/arch/x86/Makefile_32.cpu @@ -51,7 +51,18 @@ cflags-$(CONFIG_X86_GENERIC) += $(call tune,generic,$(call tune,i686)) # prologue (push %ebp, mov %esp, %ebp) which breaks the function graph # tracer assumptions. For i686, generic, core2 this is set by the # compiler anyway -cflags-$(CONFIG_FUNCTION_GRAPH_TRACER) += $(call cc-option,-maccumulate-outgoing-args) +ifeq ($(CONFIG_FUNCTION_GRAPH_TRACER), y) +ADD_ACCUMULATE_OUTGOING_ARGS := y +endif + +# Work around to a bug with asm goto with first implementations of it +# in gcc causing gcc to mess up the push and pop of the stack in some +# uses of asm goto. +ifeq ($(CONFIG_JUMP_LABEL), y) +ADD_ACCUMULATE_OUTGOING_ARGS := y +endif + +cflags-$(ADD_ACCUMULATE_OUTGOING_ARGS) += $(call cc-option,-maccumulate-outgoing-args) # Bug fix for binutils: this option is required in order to keep # binutils from generating NOPL instructions against our will. -- cgit v1.2.3-18-g5258 From a2d771c036eb8c040683089ca04c36dfb93a0e60 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Fri, 29 Oct 2010 16:56:19 +0100 Subject: xen: correct size of level2_kernel_pgt sizeof(pmd_t *) is 4 bytes on 32-bit PAE leading to an allocation of only 2048 bytes. The correct size is sizeof(pmd_t) giving us a full page allocation. 
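The pitfall is easy to reproduce in isolation (a sketch with a hypothetical stand-in type: under 32-bit PAE a pmd entry is 8 bytes while a pointer is only 4):

    #include <stdio.h>

    typedef struct { unsigned long long v; } pmd_entry; /* 8 bytes, like pmd_t under PAE */
    #define PTRS_PER_PMD 512

    int main(void)
    {
            /* 32-bit build: 4 * 512 = 2048, only half a page */
            printf("sized by pointer type: %zu\n", sizeof(pmd_entry *) * PTRS_PER_PMD);
            /* 8 * 512 = 4096, the full page the page table needs */
            printf("sized by object type:  %zu\n", sizeof(pmd_entry) * PTRS_PER_PMD);
            return 0;
    }
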
Signed-off-by: Ian Campbell Cc: Jeremy Fitzhardinge Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index c237b810b03..21ed8d7f75a 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -2126,7 +2126,7 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, { pmd_t *kernel_pmd; - level2_kernel_pgt = extend_brk(sizeof(pmd_t *) * PTRS_PER_PMD, PAGE_SIZE); + level2_kernel_pgt = extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE); max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) + xen_start_info->nr_pt_frames * PAGE_SIZE + -- cgit v1.2.3-18-g5258 From 404ba5d7bb958d3d788bdaa0debc0bdf60f13ffe Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Thu, 28 Oct 2010 11:20:27 -0400 Subject: x86, alternative: Call stop_machine_text_poke() on all cpus Currently, text_poke_smp() passes a NULL as the third argument to __stop_machine(), which will only run stop_machine_text_poke() on 1 cpu. Change NULL -> cpu_online_mask, as stop_machine_text_poke() is intended to be run on all cpus. I actually didn't notice any problems with stop_machine_text_poke() only being called on 1 cpu, but found this via code inspection. Signed-off-by: Jason Baron LKML-Reference: <20101028152026.GB2875@redhat.com> Acked-by: Mathieu Desnoyers Acked-by: Masami Hiramatsu Signed-off-by: H. Peter Anvin --- arch/x86/kernel/alternative.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index a36bb90aef5..5ceeca38282 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -638,7 +638,7 @@ void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len) atomic_set(&stop_machine_first, 1); wrote_text = 0; /* Use __stop_machine() because the caller already got online_cpus. */ - __stop_machine(stop_machine_text_poke, (void *)&tpp, NULL); + __stop_machine(stop_machine_text_poke, (void *)&tpp, cpu_online_mask); return addr; } -- cgit v1.2.3-18-g5258 From 7b79462a20826a7269322113c68ca78d5f67c0bd Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 30 Oct 2010 01:19:29 -0700 Subject: x86: Check irq_remapped instead of remapping_enabled in destroy_irq() Russ Anderson reported: | There is a regression that is causing a NULL pointer dereference | in free_irte when shutting down xpc. git bisect narrowed it down | to git commit d585d06(intr_remap: Simplify the code further), which | changed free_irte(). Reverse applying the patch fixes the problem. We need to use irq_remapped() for each irq instead of checking only intr_remapping_enabled as there might be non remapped irqs even when remapping is enabled. [ tglx: use cfg instead of retrieving it again. 
Massaged changelog ] Reported-bisected-and-tested-by: Russ Anderson Signed-off-by: Yinghai Lu Cc: Suresh Siddha LKML-Reference: <4CCBD511.40607@kernel.org> Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/io_apic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 0929191d83c..7cc0a721f62 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3109,7 +3109,7 @@ void destroy_irq(unsigned int irq) irq_set_status_flags(irq, IRQ_NOREQUEST|IRQ_NOPROBE); - if (intr_remapping_enabled) + if (irq_remapped(cfg)) free_irte(irq); raw_spin_lock_irqsave(&vector_lock, flags); __clear_irq_vector(irq, cfg); -- cgit v1.2.3-18-g5258 From cf38d0ba7efdc476815768b2b999b27cfae69747 Mon Sep 17 00:00:00 2001 From: Rakib Mullick Date: Mon, 1 Nov 2010 12:53:50 +0600 Subject: x86, mm: Fix section mismatch in tlb.c Mark tlb_cpuhp_notify as __cpuinit. It's basically a callback function, which is called from __cpuinit init_smp_flash(). So - it's safe. We were warned by the following warning: WARNING: arch/x86/mm/built-in.o(.text+0x356d): Section mismatch in reference from the function tlb_cpuhp_notify() to the function .cpuinit.text:calculate_tlb_offset() The function tlb_cpuhp_notify() references the function __cpuinit calculate_tlb_offset(). This is often because tlb_cpuhp_notify lacks a __cpuinit annotation or the annotation of calculate_tlb_offset is wrong. Signed-off-by: Rakib Mullick Cc: Borislav Petkov Cc: Shaohua Li LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/mm/tlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 49358481c73..12cdbb17ad1 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -251,7 +251,7 @@ static void __cpuinit calculate_tlb_offset(void) } } -static int tlb_cpuhp_notify(struct notifier_block *n, +static int __cpuinit tlb_cpuhp_notify(struct notifier_block *n, unsigned long action, void *hcpu) { switch (action & 0xf) { -- cgit v1.2.3-18-g5258 From edde99ce05290e50ce0b3495d209e54e6349ab47 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 25 Oct 2010 03:21:24 +0200 Subject: KVM: Write protect memory after slot swap I have observed the following bug trigger: 1. userspace calls GET_DIRTY_LOG 2. kvm_mmu_slot_remove_write_access is called and makes a page ro 3. page fault happens and makes the page writeable fault is logged in the bitmap appropriately 4. kvm_vm_ioctl_get_dirty_log swaps slot pointers a lot of time passes 5. guest writes into the page 6. userspace calls GET_DIRTY_LOG At point (5), bitmap is clean and page is writeable, thus, guest modification of memory is not logged and GET_DIRTY_LOG returns an empty bitmap. The rule is that all pages are either dirty in the current bitmap, or write-protected, which is violated here. It seems that just moving kvm_mmu_slot_remove_write_access down to after the slot pointer swap should fix this bug. KVM-Stable-Tag. Signed-off-by: Michael S. 
Tsirkin Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2288ad829b3..b0818f67206 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3169,10 +3169,6 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_memslots *slots, *old_slots; unsigned long *dirty_bitmap; - spin_lock(&kvm->mmu_lock); - kvm_mmu_slot_remove_write_access(kvm, log->slot); - spin_unlock(&kvm->mmu_lock); - r = -ENOMEM; dirty_bitmap = vmalloc(n); if (!dirty_bitmap) @@ -3194,6 +3190,10 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap; kfree(old_slots); + spin_lock(&kvm->mmu_lock); + kvm_mmu_slot_remove_write_access(kvm, log->slot); + spin_unlock(&kvm->mmu_lock); + r = -EFAULT; if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) { vfree(dirty_bitmap); -- cgit v1.2.3-18-g5258 From eb45fda45f915c7ca3e81e005e853cb770da2642 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Mon, 25 Oct 2010 11:58:22 -0200 Subject: KVM: MMU: fix rmap_remove on non present sptes drop_spte should not attempt to rmap_remove a non present shadow pte. This fixes a BUG_ON seen on kvm-autotest. Signed-off-by: Marcelo Tosatti Reported-by: Lucas Meneghel Rodrigues Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 908ea5464a5..fb8b376bf28 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -720,7 +720,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) } } -static void set_spte_track_bits(u64 *sptep, u64 new_spte) +static int set_spte_track_bits(u64 *sptep, u64 new_spte) { pfn_t pfn; u64 old_spte = *sptep; @@ -731,19 +731,20 @@ static void set_spte_track_bits(u64 *sptep, u64 new_spte) old_spte = __xchg_spte(sptep, new_spte); if (!is_rmap_spte(old_spte)) - return; + return 0; pfn = spte_to_pfn(old_spte); if (!shadow_accessed_mask || old_spte & shadow_accessed_mask) kvm_set_pfn_accessed(pfn); if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask)) kvm_set_pfn_dirty(pfn); + return 1; } static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte) { - set_spte_track_bits(sptep, new_spte); - rmap_remove(kvm, sptep); + if (set_spte_track_bits(sptep, new_spte)) + rmap_remove(kvm, sptep); } static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte) -- cgit v1.2.3-18-g5258 From 97e69aa62f8b5d338d6cff49be09e37cc1262838 Mon Sep 17 00:00:00 2001 From: Vasiliy Kulikov Date: Sat, 30 Oct 2010 22:54:47 +0400 Subject: KVM: x86: fix information leak to userland Structures kvm_vcpu_events, kvm_debugregs, kvm_pit_state2 and kvm_clock_data are copied to userland with some padding and reserved fields unitialized. It leads to leaking of contents of kernel stack memory. We have to initialize them to zero. In patch v1 Jan Kiszka suggested to fill reserved fields with zeros instead of memset'ting the whole struct. It makes sense as these fields are explicitly marked as padding. No more fields need zeroing. KVM-Stable-Tag. 
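Reduced to a stand-alone sketch (hypothetical struct and field names; the real structures are the four listed above):

    #include <string.h>

    struct sample_events {
            unsigned int  nr;
            unsigned char injected;
            unsigned char pad;          /* explicit padding */
            unsigned int  reserved[4];  /* reserved for future use */
    };

    static void fill_events(struct sample_events *ev)
    {
            ev->nr = 1;
            ev->injected = 0;
            /* fields not filled above would otherwise carry stack garbage
             * into the copy_to_user() that follows in the real code */
            ev->pad = 0;
            memset(ev->reserved, 0, sizeof(ev->reserved));
    }

    int main(void)
    {
            struct sample_events ev;
            fill_events(&ev);
            return 0;
    }
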
Signed-off-by: Vasiliy Kulikov Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b0818f67206..463c65b8f93 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2560,6 +2560,7 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, !kvm_exception_is_soft(vcpu->arch.exception.nr); events->exception.nr = vcpu->arch.exception.nr; events->exception.has_error_code = vcpu->arch.exception.has_error_code; + events->exception.pad = 0; events->exception.error_code = vcpu->arch.exception.error_code; events->interrupt.injected = @@ -2573,12 +2574,14 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, events->nmi.injected = vcpu->arch.nmi_injected; events->nmi.pending = vcpu->arch.nmi_pending; events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu); + events->nmi.pad = 0; events->sipi_vector = vcpu->arch.sipi_vector; events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING | KVM_VCPUEVENT_VALID_SIPI_VECTOR | KVM_VCPUEVENT_VALID_SHADOW); + memset(&events->reserved, 0, sizeof(events->reserved)); } static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, @@ -2623,6 +2626,7 @@ static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu, dbgregs->dr6 = vcpu->arch.dr6; dbgregs->dr7 = vcpu->arch.dr7; dbgregs->flags = 0; + memset(&dbgregs->reserved, 0, sizeof(dbgregs->reserved)); } static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, @@ -3106,6 +3110,7 @@ static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) sizeof(ps->channels)); ps->flags = kvm->arch.vpit->pit_state.flags; mutex_unlock(&kvm->arch.vpit->pit_state.lock); + memset(&ps->reserved, 0, sizeof(ps->reserved)); return r; } @@ -3486,6 +3491,7 @@ long kvm_arch_vm_ioctl(struct file *filp, user_ns.clock = kvm->arch.kvmclock_offset + now_ns; local_irq_enable(); user_ns.flags = 0; + memset(&user_ns.pad, 0, sizeof(user_ns.pad)); r = -EFAULT; if (copy_to_user(argp, &user_ns, sizeof(user_ns))) -- cgit v1.2.3-18-g5258 From 453d9c57e27b4401bc3e98906bcac31ae8be0165 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 1 Nov 2010 14:01:13 +0100 Subject: KVM: x86: Issue smp_call_function_many with preemption disabled smp_call_function_many is specified to be called only with preemption disabled. Fulfill this requirement. Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 463c65b8f93..cdac9e592aa 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3978,8 +3978,10 @@ int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu) return X86EMUL_CONTINUE; if (kvm_x86_ops->has_wbinvd_exit()) { + preempt_disable(); smp_call_function_many(vcpu->arch.wbinvd_dirty_mask, wbinvd_ipi, NULL, 1); + preempt_enable(); cpumask_clear(vcpu->arch.wbinvd_dirty_mask); } wbinvd(); -- cgit v1.2.3-18-g5258 From 07cf2a64c2ad3408a0e12aa4cd6040b30c09381d Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Sat, 6 Nov 2010 10:06:49 +0100 Subject: xen: fix memory leak in Xen PCI MSI/MSI-X allocator. Stanse found that xen_setup_msi_irqs leaks memory when xen_allocate_pirq fails. Free the memory in that fail path. Signed-off-by: Jiri Slaby Signed-off-by: Konrad Rzeszutek Wilk Cc: xen-devel@lists.xensource.com Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. 
Peter Anvin" Cc: x86@kernel.org --- arch/x86/pci/xen.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index 117f5b8daf7..d7b5109f7a9 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -147,8 +147,10 @@ static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) irq = xen_allocate_pirq(v[i], 0, /* not sharable */ (type == PCI_CAP_ID_MSIX) ? "pcifront-msi-x" : "pcifront-msi"); - if (irq < 0) - return -1; + if (irq < 0) { + ret = -1; + goto free; + } ret = set_irq_msi(irq, msidesc); if (ret) @@ -164,7 +166,7 @@ error: if (ret == -ENODEV) dev_err(&dev->dev, "Xen PCI frontend has not registered" \ " MSI/MSI-X support!\n"); - +free: kfree(v); return ret; } -- cgit v1.2.3-18-g5258 From 0059b2436a86fedb2747f654f8e10a67e97d8614 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 8 Nov 2010 22:20:29 +0100 Subject: x86: Address gcc4.6 "set but not used" warnings in apic.h native_apic_msr_read() and x2apic_enabled() use rdmsr(msr, low, high), but only use the low part. gcc4.6 complains about this: .../apic.h:144:11: warning: variable 'high' set but not used [-Wunused-but-set-variable] rdmsr() is just a wrapper around rdmsrl() which splits the 64bit value into low and high, so using rdmsrl() directly solves this. [tglx: Changed the variables to u64 as suggested by Cyrill. It's less confusing and has no code impact as this is 64bit only anyway. Massaged changelog as well. ] Signed-off-by: Andi Kleen Cc: x86@kernel.org Cc: Cyrill Gorcunov LKML-Reference: <1289251229-19589-1-git-send-email-andi@firstfloor.org> Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/apic.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 286de34b0ed..f6ce0bda3b9 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -141,13 +141,13 @@ static inline void native_apic_msr_write(u32 reg, u32 v) static inline u32 native_apic_msr_read(u32 reg) { - u32 low, high; + u64 msr; if (reg == APIC_DFR) return -1; - rdmsr(APIC_BASE_MSR + (reg >> 4), low, high); - return low; + rdmsrl(APIC_BASE_MSR + (reg >> 4), msr); + return (u32)msr; } static inline void native_x2apic_wait_icr_idle(void) @@ -181,12 +181,12 @@ extern void enable_x2apic(void); extern void x2apic_icr_write(u32 low, u32 id); static inline int x2apic_enabled(void) { - int msr, msr2; + u64 msr; if (!cpu_has_x2apic) return 0; - rdmsr(MSR_IA32_APICBASE, msr, msr2); + rdmsrl(MSR_IA32_APICBASE, msr); if (msr & X2APIC_ENABLE) return 1; return 0; -- cgit v1.2.3-18-g5258 From 8e5e9521c13ff8cf6727999999c8d88cc64b5ff7 Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Tue, 9 Nov 2010 00:08:11 +0100 Subject: x86: Remove unnecessary casts of void ptr returning alloc function return values The [vk][cmz]alloc(_node) family of functions return void pointers which it's completely unnecessary/pointless to cast to other pointer types since that happens implicitly. This patch removes such casts from arch/x86. 
Signed-off-by: Jesper Juhl Cc: trivial@kernel.org Cc: amd64-microcode@amd64.org Cc: Andreas Herrmann LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_amd.c | 2 +- arch/x86/platform/uv/tlb_uv.c | 13 ++++++------- 2 files changed, 7 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index e1af7c055c7..ce0cb4721c9 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -212,7 +212,7 @@ static int install_equiv_cpu_table(const u8 *buf) return 0; } - equiv_cpu_table = (struct equiv_cpu_entry *) vmalloc(size); + equiv_cpu_table = vmalloc(size); if (!equiv_cpu_table) { pr_err("failed to allocate equivalent CPU table\n"); return 0; diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index 20ea20a39e2..a318194002b 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -1343,8 +1343,8 @@ uv_activation_descriptor_init(int node, int pnode) * each bau_desc is 64 bytes; there are 8 (UV_ITEMS_PER_DESCRIPTOR) * per cpu; and up to 32 (UV_ADP_SIZE) cpu's per uvhub */ - bau_desc = (struct bau_desc *)kmalloc_node(sizeof(struct bau_desc)* - UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node); + bau_desc = kmalloc_node(sizeof(struct bau_desc) * UV_ADP_SIZE + * UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node); BUG_ON(!bau_desc); pa = uv_gpa(bau_desc); /* need the real nasid*/ @@ -1402,9 +1402,9 @@ uv_payload_queue_init(int node, int pnode) struct bau_payload_queue_entry *pqp_malloc; struct bau_control *bcp; - pqp = (struct bau_payload_queue_entry *) kmalloc_node( - (DEST_Q_SIZE + 1) * sizeof(struct bau_payload_queue_entry), - GFP_KERNEL, node); + pqp = kmalloc_node((DEST_Q_SIZE + 1) + * sizeof(struct bau_payload_queue_entry), + GFP_KERNEL, node); BUG_ON(!pqp); pqp_malloc = pqp; @@ -1520,8 +1520,7 @@ static void __init uv_init_per_cpu(int nuvhubs) timeout_us = calculate_destination_timeout(); - uvhub_descs = (struct uvhub_desc *) - kmalloc(nuvhubs * sizeof(struct uvhub_desc), GFP_KERNEL); + uvhub_descs = kmalloc(nuvhubs * sizeof(struct uvhub_desc), GFP_KERNEL); memset(uvhub_descs, 0, nuvhubs * sizeof(struct uvhub_desc)); uvhub_mask = kzalloc((nuvhubs+7)/8, GFP_KERNEL); for_each_present_cpu(cpu) { -- cgit v1.2.3-18-g5258 From 62b0cfc240b1d4601333912ef8760e0ca9ec2cec Mon Sep 17 00:00:00 2001 From: Jack Steiner Date: Sat, 6 Nov 2010 15:41:04 -0500 Subject: x86, UV: Update node controller MMRs A new version of the SGI UV hub node controller is being developed. A few of the MMRs (control registers) that exist on the current hub no longer exist on the new hub. Fortunately, there are alternate MMRs that are are functionally equivalent and that exist on both hubs. This patch changes the UV code to use MMRs that exist in BOTH versions of the hub node controller. 
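All of the register definitions touched below share one idiom: a raw 64-bit value overlaid with a bitfield view. A generic sketch of that idiom (not one of the real UV registers; the field names are illustrative):

    union sample_mmr_u {
            unsigned long v;                   /* raw 64-bit register value */
            struct {
                    unsigned long m_skt :  6;  /* RW */
                    unsigned long n_skt :  4;  /* RW */
                    unsigned long rsvd  : 54;  /* unused */
            } s;
    };

    /* typical use, as in uv_system_init() below:
     *   cfg.v = uv_read_local_mmr(SOME_MMR);
     *   m_val = cfg.s.m_skt;
     */
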
Signed-off-by: Jack Steiner LKML-Reference: <20101106204056.GA27584@sgi.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uv/uv_mmrs.h | 189 +++++++++++++++++++------------------ arch/x86/kernel/apic/x2apic_uv_x.c | 12 +-- 2 files changed, 102 insertions(+), 99 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uv/uv_mmrs.h b/arch/x86/include/asm/uv/uv_mmrs.h index b2f2d2e05ce..6d90adf4428 100644 --- a/arch/x86/include/asm/uv/uv_mmrs.h +++ b/arch/x86/include/asm/uv/uv_mmrs.h @@ -805,6 +805,78 @@ union uvh_node_present_table_u { } s; }; +/* ========================================================================= */ +/* UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR */ +/* ========================================================================= */ +#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR 0x16000c8UL + +#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_BASE_SHFT 24 +#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_BASE_MASK 0x00000000ff000000UL +#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_M_ALIAS_SHFT 48 +#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_M_ALIAS_MASK 0x001f000000000000UL +#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_ENABLE_SHFT 63 +#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_ENABLE_MASK 0x8000000000000000UL + +union uvh_rh_gam_alias210_overlay_config_0_mmr_u { + unsigned long v; + struct uvh_rh_gam_alias210_overlay_config_0_mmr_s { + unsigned long rsvd_0_23: 24; /* */ + unsigned long base : 8; /* RW */ + unsigned long rsvd_32_47: 16; /* */ + unsigned long m_alias : 5; /* RW */ + unsigned long rsvd_53_62: 10; /* */ + unsigned long enable : 1; /* RW */ + } s; +}; + +/* ========================================================================= */ +/* UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR */ +/* ========================================================================= */ +#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR 0x16000d8UL + +#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_BASE_SHFT 24 +#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_BASE_MASK 0x00000000ff000000UL +#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_M_ALIAS_SHFT 48 +#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_M_ALIAS_MASK 0x001f000000000000UL +#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_ENABLE_SHFT 63 +#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_ENABLE_MASK 0x8000000000000000UL + +union uvh_rh_gam_alias210_overlay_config_1_mmr_u { + unsigned long v; + struct uvh_rh_gam_alias210_overlay_config_1_mmr_s { + unsigned long rsvd_0_23: 24; /* */ + unsigned long base : 8; /* RW */ + unsigned long rsvd_32_47: 16; /* */ + unsigned long m_alias : 5; /* RW */ + unsigned long rsvd_53_62: 10; /* */ + unsigned long enable : 1; /* RW */ + } s; +}; + +/* ========================================================================= */ +/* UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR */ +/* ========================================================================= */ +#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR 0x16000e8UL + +#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_BASE_SHFT 24 +#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_BASE_MASK 0x00000000ff000000UL +#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_M_ALIAS_SHFT 48 +#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_M_ALIAS_MASK 0x001f000000000000UL +#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_ENABLE_SHFT 63 +#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_ENABLE_MASK 0x8000000000000000UL + +union uvh_rh_gam_alias210_overlay_config_2_mmr_u { + unsigned long v; + struct 
uvh_rh_gam_alias210_overlay_config_2_mmr_s { + unsigned long rsvd_0_23: 24; /* */ + unsigned long base : 8; /* RW */ + unsigned long rsvd_32_47: 16; /* */ + unsigned long m_alias : 5; /* RW */ + unsigned long rsvd_53_62: 10; /* */ + unsigned long enable : 1; /* RW */ + } s; +}; + /* ========================================================================= */ /* UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR */ /* ========================================================================= */ @@ -856,6 +928,29 @@ union uvh_rh_gam_alias210_redirect_config_2_mmr_u { } s; }; +/* ========================================================================= */ +/* UVH_RH_GAM_CONFIG_MMR */ +/* ========================================================================= */ +#define UVH_RH_GAM_CONFIG_MMR 0x1600000UL + +#define UVH_RH_GAM_CONFIG_MMR_M_SKT_SHFT 0 +#define UVH_RH_GAM_CONFIG_MMR_M_SKT_MASK 0x000000000000003fUL +#define UVH_RH_GAM_CONFIG_MMR_N_SKT_SHFT 6 +#define UVH_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL +#define UVH_RH_GAM_CONFIG_MMR_MMIOL_CFG_SHFT 12 +#define UVH_RH_GAM_CONFIG_MMR_MMIOL_CFG_MASK 0x0000000000001000UL + +union uvh_rh_gam_config_mmr_u { + unsigned long v; + struct uvh_rh_gam_config_mmr_s { + unsigned long m_skt : 6; /* RW */ + unsigned long n_skt : 4; /* RW */ + unsigned long rsvd_10_11: 2; /* */ + unsigned long mmiol_cfg : 1; /* RW */ + unsigned long rsvd_13_63: 51; /* */ + } s; +}; + /* ========================================================================= */ /* UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR */ /* ========================================================================= */ @@ -987,97 +1082,5 @@ union uvh_rtc1_int_config_u { } s; }; -/* ========================================================================= */ -/* UVH_SI_ADDR_MAP_CONFIG */ -/* ========================================================================= */ -#define UVH_SI_ADDR_MAP_CONFIG 0xc80000UL - -#define UVH_SI_ADDR_MAP_CONFIG_M_SKT_SHFT 0 -#define UVH_SI_ADDR_MAP_CONFIG_M_SKT_MASK 0x000000000000003fUL -#define UVH_SI_ADDR_MAP_CONFIG_N_SKT_SHFT 8 -#define UVH_SI_ADDR_MAP_CONFIG_N_SKT_MASK 0x0000000000000f00UL - -union uvh_si_addr_map_config_u { - unsigned long v; - struct uvh_si_addr_map_config_s { - unsigned long m_skt : 6; /* RW */ - unsigned long rsvd_6_7: 2; /* */ - unsigned long n_skt : 4; /* RW */ - unsigned long rsvd_12_63: 52; /* */ - } s; -}; - -/* ========================================================================= */ -/* UVH_SI_ALIAS0_OVERLAY_CONFIG */ -/* ========================================================================= */ -#define UVH_SI_ALIAS0_OVERLAY_CONFIG 0xc80008UL - -#define UVH_SI_ALIAS0_OVERLAY_CONFIG_BASE_SHFT 24 -#define UVH_SI_ALIAS0_OVERLAY_CONFIG_BASE_MASK 0x00000000ff000000UL -#define UVH_SI_ALIAS0_OVERLAY_CONFIG_M_ALIAS_SHFT 48 -#define UVH_SI_ALIAS0_OVERLAY_CONFIG_M_ALIAS_MASK 0x001f000000000000UL -#define UVH_SI_ALIAS0_OVERLAY_CONFIG_ENABLE_SHFT 63 -#define UVH_SI_ALIAS0_OVERLAY_CONFIG_ENABLE_MASK 0x8000000000000000UL - -union uvh_si_alias0_overlay_config_u { - unsigned long v; - struct uvh_si_alias0_overlay_config_s { - unsigned long rsvd_0_23: 24; /* */ - unsigned long base : 8; /* RW */ - unsigned long rsvd_32_47: 16; /* */ - unsigned long m_alias : 5; /* RW */ - unsigned long rsvd_53_62: 10; /* */ - unsigned long enable : 1; /* RW */ - } s; -}; - -/* ========================================================================= */ -/* UVH_SI_ALIAS1_OVERLAY_CONFIG */ -/* ========================================================================= */ -#define 
UVH_SI_ALIAS1_OVERLAY_CONFIG 0xc80010UL - -#define UVH_SI_ALIAS1_OVERLAY_CONFIG_BASE_SHFT 24 -#define UVH_SI_ALIAS1_OVERLAY_CONFIG_BASE_MASK 0x00000000ff000000UL -#define UVH_SI_ALIAS1_OVERLAY_CONFIG_M_ALIAS_SHFT 48 -#define UVH_SI_ALIAS1_OVERLAY_CONFIG_M_ALIAS_MASK 0x001f000000000000UL -#define UVH_SI_ALIAS1_OVERLAY_CONFIG_ENABLE_SHFT 63 -#define UVH_SI_ALIAS1_OVERLAY_CONFIG_ENABLE_MASK 0x8000000000000000UL - -union uvh_si_alias1_overlay_config_u { - unsigned long v; - struct uvh_si_alias1_overlay_config_s { - unsigned long rsvd_0_23: 24; /* */ - unsigned long base : 8; /* RW */ - unsigned long rsvd_32_47: 16; /* */ - unsigned long m_alias : 5; /* RW */ - unsigned long rsvd_53_62: 10; /* */ - unsigned long enable : 1; /* RW */ - } s; -}; - -/* ========================================================================= */ -/* UVH_SI_ALIAS2_OVERLAY_CONFIG */ -/* ========================================================================= */ -#define UVH_SI_ALIAS2_OVERLAY_CONFIG 0xc80018UL - -#define UVH_SI_ALIAS2_OVERLAY_CONFIG_BASE_SHFT 24 -#define UVH_SI_ALIAS2_OVERLAY_CONFIG_BASE_MASK 0x00000000ff000000UL -#define UVH_SI_ALIAS2_OVERLAY_CONFIG_M_ALIAS_SHFT 48 -#define UVH_SI_ALIAS2_OVERLAY_CONFIG_M_ALIAS_MASK 0x001f000000000000UL -#define UVH_SI_ALIAS2_OVERLAY_CONFIG_ENABLE_SHFT 63 -#define UVH_SI_ALIAS2_OVERLAY_CONFIG_ENABLE_MASK 0x8000000000000000UL - -union uvh_si_alias2_overlay_config_u { - unsigned long v; - struct uvh_si_alias2_overlay_config_s { - unsigned long rsvd_0_23: 24; /* */ - unsigned long base : 8; /* RW */ - unsigned long rsvd_32_47: 16; /* */ - unsigned long m_alias : 5; /* RW */ - unsigned long rsvd_53_62: 10; /* */ - unsigned long enable : 1; /* RW */ - } s; -}; - -#endif /* _ASM_X86_UV_UV_MMRS_H */ +#endif /* __ASM_UV_MMRS_X86_H__ */ diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index ed4118de249..194539aea17 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -379,14 +379,14 @@ struct redir_addr { #define DEST_SHIFT UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT static __initdata struct redir_addr redir_addrs[] = { - {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR, UVH_SI_ALIAS0_OVERLAY_CONFIG}, - {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR, UVH_SI_ALIAS1_OVERLAY_CONFIG}, - {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR, UVH_SI_ALIAS2_OVERLAY_CONFIG}, + {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR, UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR}, + {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR, UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR}, + {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR, UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR}, }; static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size) { - union uvh_si_alias0_overlay_config_u alias; + union uvh_rh_gam_alias210_overlay_config_2_mmr_u alias; union uvh_rh_gam_alias210_redirect_config_2_mmr_u redirect; int i; @@ -660,7 +660,7 @@ void uv_nmi_init(void) void __init uv_system_init(void) { - union uvh_si_addr_map_config_u m_n_config; + union uvh_rh_gam_config_mmr_u m_n_config; union uvh_node_id_u node_id; unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size; int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val; @@ -670,7 +670,7 @@ void __init uv_system_init(void) map_low_mmrs(); - m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG); + m_n_config.v = uv_read_local_mmr(UVH_RH_GAM_CONFIG_MMR ); m_val = m_n_config.s.m_skt; n_val = m_n_config.s.n_skt; mmr_base = -- cgit v1.2.3-18-g5258 From 
2f62bf7d238f6dfa39faf24c746d0b8dd60f85c5 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 4 Nov 2010 15:23:58 +0000 Subject: x86: Adjust section annotations in AMD Fam10 MMCONF enabling code check_enable_amd_mmconf_dmi() gets called only for the BSP, hence everything hanging off of it can be __init*. Signed-off-by: Jan Beulich Acked-by: Yinghai Lu LKML-Reference: <4CD2DE1E0200007800020990@vpn.id2.novell.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/mmconf-fam10h_64.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c index 71825806cd4..6da143c2a6b 100644 --- a/arch/x86/kernel/mmconf-fam10h_64.c +++ b/arch/x86/kernel/mmconf-fam10h_64.c @@ -217,13 +217,13 @@ void __cpuinit fam10h_check_enable_mmcfg(void) wrmsrl(address, val); } -static int __devinit set_check_enable_amd_mmconf(const struct dmi_system_id *d) +static int __init set_check_enable_amd_mmconf(const struct dmi_system_id *d) { pci_probe |= PCI_CHECK_ENABLE_AMD_MMCONF; return 0; } -static const struct dmi_system_id __cpuinitconst mmconf_dmi_table[] = { +static const struct dmi_system_id __initconst mmconf_dmi_table[] = { { .callback = set_check_enable_amd_mmconf, .ident = "Sun Microsystems Machine", @@ -234,7 +234,8 @@ static const struct dmi_system_id __cpuinitconst mmconf_dmi_table[] = { {} }; -void __cpuinit check_enable_amd_mmconf_dmi(void) +/* Called from a __cpuinit function, but only on the BSP. */ +void __ref check_enable_amd_mmconf_dmi(void) { dmi_check_system(mmconf_dmi_table); } -- cgit v1.2.3-18-g5258 From 2a8dcbd6cd2270f912ca141547d9296ce08abe4a Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Sun, 7 Nov 2010 22:57:18 +0100 Subject: x86, apic: Remove double #include Remove the second inclusion. Signed-off-by: Jesper Juhl LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 850657d1b0e..3f838d53739 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -52,7 +52,6 @@ #include #include #include -#include unsigned int num_processors; -- cgit v1.2.3-18-g5258 From 1f523bf36734375dd6e986c9f47f010d00a8caca Mon Sep 17 00:00:00 2001 From: Kusanagi Kouichi Date: Fri, 5 Nov 2010 20:04:42 +0900 Subject: x86, pvclock: Remove leftover scale_delta() function Commit 92580d64e16402762e2acc3022f065397c780425 ("x86: pvclock: Move scale_delta into common header") forgot to remove scale_delta. Signed-off-by: Kusanagi Kouichi Cc: Zachary Amsden Cc: Marcelo Tosatti Cc: Glauber Costa LKML-Reference: <20101105110444.BAF6D6FC03B@msa105.auone-net.jp> Signed-off-by: Ingo Molnar --- arch/x86/kernel/pvclock.c | 38 -------------------------------------- 1 file changed, 38 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index bab3b9e6f66..008b91eefa1 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -41,44 +41,6 @@ void pvclock_set_flags(u8 flags) valid_flags = flags; } -/* - * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, - * yielding a 64-bit result. 
- */ -static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift) -{ - u64 product; -#ifdef __i386__ - u32 tmp1, tmp2; -#endif - - if (shift < 0) - delta >>= -shift; - else - delta <<= shift; - -#ifdef __i386__ - __asm__ ( - "mul %5 ; " - "mov %4,%%eax ; " - "mov %%edx,%4 ; " - "mul %5 ; " - "xor %5,%5 ; " - "add %4,%%eax ; " - "adc %5,%%edx ; " - : "=A" (product), "=r" (tmp1), "=r" (tmp2) - : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) ); -#elif defined(__x86_64__) - __asm__ ( - "mul %%rdx ; shrd $32,%%rdx,%%rax" - : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) ); -#else -#error implement me! -#endif - - return product; -} - static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow) { u64 delta = native_read_tsc() - shadow->tsc_timestamp; -- cgit v1.2.3-18-g5258 From 034c6efa4616e5ff6253549e973e7fef12899324 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 1 Nov 2010 18:52:05 +0100 Subject: perf, amd: Use kmalloc_node(,__GFP_ZERO) for northbridge structure allocation Jesper suggested we use the zeroing capability of the allocators instead of calling memset ourselves. Add node affinity while we're at it. Reported-by: Jesper Juhl Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_amd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index 46d58448c3a..e421b8cd694 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -280,11 +280,11 @@ static struct amd_nb *amd_alloc_nb(int cpu, int nb_id) struct amd_nb *nb; int i; - nb = kmalloc(sizeof(struct amd_nb), GFP_KERNEL); + nb = kmalloc_node(sizeof(struct amd_nb), GFP_KERNEL | __GFP_ZERO, + cpu_to_node(cpu)); if (!nb) return NULL; - memset(nb, 0, sizeof(*nb)); nb->nb_id = nb_id; /* -- cgit v1.2.3-18-g5258 From 9ec23a7f6d2537faf14368e066e307c06812c4ca Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Thu, 28 Oct 2010 11:32:29 -0700 Subject: xen: do not release any memory under 1M in domain 0 We already deliberately set up a 1-1 P2M for the region up to 1M in order to allow code which assumes this region is already mapped to work without having to convert everything to ioremap. Domain 0 should not return any apparently unused memory regions (reserved or otherwise) in this region to Xen since the e820 may not accurately reflect what the BIOS has stashed there. Signed-off-by: Ian Campbell Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/setup.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index b1dbdaa23ec..769c4b01fa3 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -118,16 +118,18 @@ static unsigned long __init xen_return_unused_memory(unsigned long max_pfn, const struct e820map *e820) { phys_addr_t max_addr = PFN_PHYS(max_pfn); - phys_addr_t last_end = 0; + phys_addr_t last_end = ISA_END_ADDRESS; unsigned long released = 0; int i; + /* Free any unused memory above the low 1Mbyte.
*/ for (i = 0; i < e820->nr_map && last_end < max_addr; i++) { phys_addr_t end = e820->map[i].addr; end = min(max_addr, end); - released += xen_release_chunk(last_end, end); - last_end = e820->map[i].addr + e820->map[i].size; + if (last_end < end) + released += xen_release_chunk(last_end, end); + last_end = max(last_end, e820->map[i].addr + e820->map[i].size); } if (last_end < max_addr) @@ -164,6 +166,7 @@ char * __init xen_memory_setup(void) XENMEM_memory_map; rc = HYPERVISOR_memory_op(op, &memmap); if (rc == -ENOSYS) { + BUG_ON(xen_initial_domain()); memmap.nr_entries = 1; map[0].addr = 0ULL; map[0].size = mem_end; @@ -201,12 +204,13 @@ char * __init xen_memory_setup(void) } /* - * Even though this is normal, usable memory under Xen, reserve - * ISA memory anyway because too many things think they can poke + * In domU, the ISA region is normal, usable memory, but we + * reserve ISA memory anyway because too many things poke * about in there. * - * In a dom0 kernel, this region is identity mapped with the - * hardware ISA area, so it really is out of bounds. + * In Dom0, the host E820 information can leave gaps in the + * ISA range, which would cause us to release those pages. To + * avoid this, we unconditionally reserve them here. */ e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS, E820_RESERVED); -- cgit v1.2.3-18-g5258 From 4723d0f2f96e6c910f951d595067eb31e0dd2d01 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Wed, 22 Sep 2010 11:09:19 -0600 Subject: x86/PCI: coalesce overlapping host bridge windows Some BIOSes provide PCI host bridge windows that overlap, e.g., pci_root PNP0A03:00: host bridge window [mem 0xb0000000-0xffffffff] pci_root PNP0A03:00: host bridge window [mem 0xafffffff-0xdfffffff] pci_root PNP0A03:00: host bridge window [mem 0xf0000000-0xffffffff] If we simply insert these as children of iomem_resource, the second window fails because it conflicts with the first, and the third is inserted as a child of the first, i.e., b0000000-ffffffff PCI Bus 0000:00 f0000000-ffffffff PCI Bus 0000:00 When we claim PCI device resources, this can cause collisions like this if we put them in the first window: pci 0000:00:01.0: address space collision: [mem 0xff300000-0xff4fffff] conflicts with PCI Bus 0000:00 [mem 0xf0000000-0xffffffff] Host bridge windows are top-level resources by definition, so it doesn't make sense to make the third window a child of the first. This patch coalesces any host bridge windows that overlap. For the example above, the result is this single window: pci_root PNP0A03:00: host bridge window [mem 0xafffffff-0xffffffff] This fixes a 2.6.34 regression. 
Reference: https://bugzilla.kernel.org/show_bug.cgi?id=17011 Reported-and-tested-by: Anisse Astier Reported-and-tested-by: Pramod Dematagoda Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- arch/x86/pci/acpi.c | 103 ++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 83 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 15466c096ba..0972315c386 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -138,7 +138,6 @@ setup_resource(struct acpi_resource *acpi_res, void *data) struct acpi_resource_address64 addr; acpi_status status; unsigned long flags; - struct resource *root, *conflict; u64 start, end; status = resource_to_addr(acpi_res, &addr); @@ -146,12 +145,10 @@ setup_resource(struct acpi_resource *acpi_res, void *data) return AE_OK; if (addr.resource_type == ACPI_MEMORY_RANGE) { - root = &iomem_resource; flags = IORESOURCE_MEM; if (addr.info.mem.caching == ACPI_PREFETCHABLE_MEMORY) flags |= IORESOURCE_PREFETCH; } else if (addr.resource_type == ACPI_IO_RANGE) { - root = &ioport_resource; flags = IORESOURCE_IO; } else return AE_OK; @@ -172,25 +169,90 @@ setup_resource(struct acpi_resource *acpi_res, void *data) return AE_OK; } - conflict = insert_resource_conflict(root, res); - if (conflict) { - dev_err(&info->bridge->dev, - "address space collision: host bridge window %pR " - "conflicts with %s %pR\n", - res, conflict->name, conflict); - } else { - pci_bus_add_resource(info->bus, res, 0); - info->res_num++; - if (addr.translation_offset) - dev_info(&info->bridge->dev, "host bridge window %pR " - "(PCI address [%#llx-%#llx])\n", - res, res->start - addr.translation_offset, - res->end - addr.translation_offset); + info->res_num++; + if (addr.translation_offset) + dev_info(&info->bridge->dev, "host bridge window %pR " + "(PCI address [%#llx-%#llx])\n", + res, res->start - addr.translation_offset, + res->end - addr.translation_offset); + else + dev_info(&info->bridge->dev, "host bridge window %pR\n", res); + + return AE_OK; +} + +static bool resource_contains(struct resource *res, resource_size_t point) +{ + if (res->start <= point && point <= res->end) + return true; + return false; +} + +static void coalesce_windows(struct pci_root_info *info, int type) +{ + int i, j; + struct resource *res1, *res2; + + for (i = 0; i < info->res_num; i++) { + res1 = &info->res[i]; + if (!(res1->flags & type)) + continue; + + for (j = i + 1; j < info->res_num; j++) { + res2 = &info->res[j]; + if (!(res2->flags & type)) + continue; + + /* + * I don't like throwing away windows because then + * our resources no longer match the ACPI _CRS, but + * the kernel resource tree doesn't allow overlaps. 
+ */ + if (resource_contains(res1, res2->start) || + resource_contains(res1, res2->end) || + resource_contains(res2, res1->start) || + resource_contains(res2, res1->end)) { + res1->start = min(res1->start, res2->start); + res1->end = max(res1->end, res2->end); + dev_info(&info->bridge->dev, + "host bridge window expanded to %pR; %pR ignored\n", + res1, res2); + res2->flags = 0; + } + } + } +} + +static void add_resources(struct pci_root_info *info) +{ + int i; + struct resource *res, *root, *conflict; + + if (!pci_use_crs) + return; + + coalesce_windows(info, IORESOURCE_MEM); + coalesce_windows(info, IORESOURCE_IO); + + for (i = 0; i < info->res_num; i++) { + res = &info->res[i]; + + if (res->flags & IORESOURCE_MEM) + root = &iomem_resource; + else if (res->flags & IORESOURCE_IO) + root = &ioport_resource; else - dev_info(&info->bridge->dev, - "host bridge window %pR\n", res); + continue; + + conflict = insert_resource_conflict(root, res); + if (conflict) + dev_err(&info->bridge->dev, + "address space collision: host bridge window %pR " + "conflicts with %s %pR\n", + res, conflict->name, conflict); + else + pci_bus_add_resource(info->bus, res, 0); } - return AE_OK; } static void @@ -224,6 +286,7 @@ get_current_resources(struct acpi_device *device, int busnum, acpi_walk_resources(device->handle, METHOD_NAME__CRS, setup_resource, &info); + add_resources(&info); return; name_alloc_fail: -- cgit v1.2.3-18-g5258
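The pairwise window-coalescing done by the patch above is easiest to see in isolation. What follows is a minimal, standalone C sketch of the same idea, not kernel code: struct window, window_contains() and coalesce() are names invented here as simplified stand-ins for struct resource, resource_contains() and coalesce_windows(), and the per-type filtering (IORESOURCE_MEM vs. IORESOURCE_IO) as well as the later insertion into iomem_resource/ioport_resource are omitted.

/*
 * Standalone illustration of the coalescing approach (not kernel code).
 * Windows are inclusive [start, end] ranges, like struct resource.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct window {
        uint64_t start;
        uint64_t end;           /* inclusive */
        bool valid;
};

/* Same containment test the patch uses. */
static bool window_contains(const struct window *w, uint64_t point)
{
        return w->start <= point && point <= w->end;
}

/* Pairwise merge: grow the earlier window, invalidate the later one. */
static void coalesce(struct window *win, int n)
{
        for (int i = 0; i < n; i++) {
                if (!win[i].valid)
                        continue;
                for (int j = i + 1; j < n; j++) {
                        if (!win[j].valid)
                                continue;
                        if (window_contains(&win[i], win[j].start) ||
                            window_contains(&win[i], win[j].end) ||
                            window_contains(&win[j], win[i].start) ||
                            window_contains(&win[j], win[i].end)) {
                                if (win[j].start < win[i].start)
                                        win[i].start = win[j].start;
                                if (win[j].end > win[i].end)
                                        win[i].end = win[j].end;
                                win[j].valid = false;   /* mirrors res2->flags = 0 */
                        }
                }
        }
}

int main(void)
{
        /* The three overlapping windows from the commit message. */
        struct window win[] = {
                { 0xb0000000ULL, 0xffffffffULL, true },
                { 0xafffffffULL, 0xdfffffffULL, true },
                { 0xf0000000ULL, 0xffffffffULL, true },
        };

        coalesce(win, 3);

        for (int i = 0; i < 3; i++)
                if (win[i].valid)
                        printf("host bridge window [mem %#llx-%#llx]\n",
                               (unsigned long long)win[i].start,
                               (unsigned long long)win[i].end);
        return 0;
}

Run with the three example windows above, this prints the single coalesced window [mem 0xafffffff-0xffffffff], matching the result described in the commit message; only the surviving window would then be inserted as a top-level resource.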