From 2e5aa6824d9e0248d734573dad8858a2cc279cfe Mon Sep 17 00:00:00 2001
From: Jesper Juhl <jj@chaosbits.net>
Date: Mon, 24 Jan 2011 22:41:11 +0100
Subject: x86-64: Don't use pointer to out-of-scope variable in dump_trace()

In arch/x86/kernel/dumpstack_64.c::dump_trace() we have this code:

...
  		if (!stack) {
  			unsigned long dummy;
  			stack = &dummy;
  			if (task && task != current)
  				stack = (unsigned long *)task->thread.sp;
  		}

  		bp = stack_frame(task, regs);
  		/*
  		 * Print function call entries in all stacks, starting at the
  		 * current stack address. If the stacks consist of nested
  		 * exceptions
  		 */
  		tinfo = task_thread_info(task);

  		for (;;) {
  			char *id;
  			unsigned long *estack_end;
  			estack_end = in_exception_stack(cpu, (unsigned long)stack,
  							&used, &id);
...

You'll notice that we assign to 'stack' the address of the variable
'dummy' which is only in-scope inside the 'if (!stack)'. So when we later
access stack (at the end of the above, and assuming we did not take the
'if (task && task != current)' branch) we'll be using the address of a
variable that is no longer in scope. I believe this patch is the proper
fix, but I freely admit that I'm not 100% certain.

Signed-off-by: Jesper Juhl <jj@chaosbits.net>
LKML-Reference: <alpine.LNX.2.00.1101242232590.10252@swampdragon.chaosbits.net>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/kernel/dumpstack_64.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 64101335de1..a6b6fcf7f0a 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -149,13 +149,13 @@ void dump_trace(struct task_struct *task,
 	unsigned used = 0;
 	struct thread_info *tinfo;
 	int graph = 0;
+	unsigned long dummy;
 	unsigned long bp;
 
 	if (!task)
 		task = current;
 
 	if (!stack) {
-		unsigned long dummy;
 		stack = &dummy;
 		if (task && task != current)
 			stack = (unsigned long *)task->thread.sp;
-- 
cgit v1.2.3-18-g5258


From cacf061c5e42a040200463afccd9178ace680322 Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Tue, 25 Jan 2011 15:07:09 -0800
Subject: thp: fix PARAVIRT x86 32bit noPAE

This fixes TRANSPARENT_HUGEPAGE=y with PARAVIRT=y and HIGHMEM64=n.

The #ifdef that this patch removes was erratically introduced to fix a
build error for noPAE (where pmd.pmd doesn't exist).  So then the kernel
built but it failed at runtime because set_pmd_at was a noop.  This will
correct it by enabling set_pmd_at for noPAE mode too.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Reported-by: werner <w.landgraf@ru.ru>
Reported-by: Minchan Kim <minchan.kim@gmail.com>
Tested-by: Minchan Kim <minchan.kim@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/paravirt.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 2071a8b2b32..ebbc4d8ab17 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -558,13 +558,12 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
 static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
 			      pmd_t *pmdp, pmd_t pmd)
 {
-#if PAGETABLE_LEVELS >= 3
 	if (sizeof(pmdval_t) > sizeof(long))
 		/* 5 arg words */
 		pv_mmu_ops.set_pmd_at(mm, addr, pmdp, pmd);
 	else
-		PVOP_VCALL4(pv_mmu_ops.set_pmd_at, mm, addr, pmdp, pmd.pmd);
-#endif
+		PVOP_VCALL4(pv_mmu_ops.set_pmd_at, mm, addr, pmdp,
+			    native_pmd_val(pmd));
 }
 #endif
 
-- 
cgit v1.2.3-18-g5258


From 9a57c3e487d25f69715705dfeef6eb9e4d666ad7 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Mon, 24 Jan 2011 17:13:53 -0800
Subject: x86: Remove left over system_64.h

Left-over from the x86 merge ...

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <4D3E23D1.7010405@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/system_64.h | 22 ----------------------
 1 file changed, 22 deletions(-)
 delete mode 100644 arch/x86/include/asm/system_64.h

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/system_64.h b/arch/x86/include/asm/system_64.h
deleted file mode 100644
index 1159e091ad0..00000000000
--- a/arch/x86/include/asm/system_64.h
+++ /dev/null
@@ -1,22 +0,0 @@
-#ifndef _ASM_X86_SYSTEM_64_H
-#define _ASM_X86_SYSTEM_64_H
-
-#include <asm/segment.h>
-#include <asm/cmpxchg.h>
-
-
-static inline unsigned long read_cr8(void)
-{
-	unsigned long cr8;
-	asm volatile("movq %%cr8,%0" : "=r" (cr8));
-	return cr8;
-}
-
-static inline void write_cr8(unsigned long val)
-{
-	asm volatile("movq %0,%%cr8" :: "r" (val) : "memory");
-}
-
-#include <linux/irqflags.h>
-
-#endif /* _ASM_X86_SYSTEM_64_H */
-- 
cgit v1.2.3-18-g5258


From 889a7a6a5d5e64063effd40056bdc7b8fb336bd1 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 25 Jan 2011 17:31:54 +0100
Subject: percpu, x86: Fix percpu_xchg_op()

These recent percpu commits:

  2485b6464cf8: x86,percpu: Move out of place 64 bit ops into X86_64 section
  8270137a0d50: cpuops: Use cmpxchg for xchg to avoid lock semantics

Caused this 'perf top' crash:

 Kernel panic - not syncing: Fatal exception in interrupt
 Pid: 0, comm: swapper Tainted: G     D
 2.6.38-rc2-00181-gef71723 #413 Call Trace: <IRQ> [<ffffffff810465b5>]
    ? panic
    ? kmsg_dump
    ? kmsg_dump
    ? oops_end
    ? no_context
    ? __bad_area_nosemaphore
    ? perf_output_begin
    ? bad_area_nosemaphore
    ? do_page_fault
    ? __task_pid_nr_ns
    ? perf_event_tid
    ? __perf_event_header__init_id
    ? validate_chain
    ? perf_output_sample
    ? trace_hardirqs_off
    ? page_fault
    ? irq_work_run
    ? update_process_times
    ? tick_sched_timer
    ? tick_sched_timer
    ? __run_hrtimer
    ? hrtimer_interrupt
    ? account_system_vtime
    ? smp_apic_timer_interrupt
    ? apic_timer_interrupt
 ...

Looking at assembly code, I found:

list = this_cpu_xchg(irq_work_list, NULL);

gives this wrong code : (gcc-4.1.2 cross compiler)

ffffffff810bc45e:
	mov    %gs:0xead0,%rax
	cmpxchg %rax,%gs:0xead0
	jne    ffffffff810bc45e <irq_work_run+0x3e>
	test   %rax,%rax
	je     ffffffff810bc4aa <irq_work_run+0x8a>

Tell gcc we dirty eax/rax register in percpu_xchg_op()

Compiler must use another register to store pxo_new__

We also dont need to reload percpu value after a jump,
since a 'failed' cmpxchg already updated eax/rax

Wrong generated code was :
	xor     %rax,%rax   /* load 0 into %rax */
1:	mov     %gs:0xead0,%rax
	cmpxchg %rax,%gs:0xead0
	jne     1b
	test    %rax,%rax

After patch :

	xor     %rdx,%rdx   /* load 0 into %rdx */
	mov     %gs:0xead0,%rax
1:	cmpxchg %rdx,%gs:0xead0
	jne     1b:
	test    %rax,%rax

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Tejun Heo <tj@kernel.org>
LKML-Reference: <1295973114.3588.312.camel@edumazet-laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/percpu.h | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 3788f4649db..7e172955ee5 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -273,34 +273,34 @@ do {									\
 	typeof(var) pxo_new__ = (nval);					\
 	switch (sizeof(var)) {						\
 	case 1:								\
-		asm("\n1:mov "__percpu_arg(1)",%%al"			\
-		    "\n\tcmpxchgb %2, "__percpu_arg(1)			\
+		asm("\n\tmov "__percpu_arg(1)",%%al"			\
+		    "\n1:\tcmpxchgb %2, "__percpu_arg(1)		\
 		    "\n\tjnz 1b"					\
-			    : "=a" (pxo_ret__), "+m" (var)		\
+			    : "=&a" (pxo_ret__), "+m" (var)		\
 			    : "q" (pxo_new__)				\
 			    : "memory");				\
 		break;							\
 	case 2:								\
-		asm("\n1:mov "__percpu_arg(1)",%%ax"			\
-		    "\n\tcmpxchgw %2, "__percpu_arg(1)			\
+		asm("\n\tmov "__percpu_arg(1)",%%ax"			\
+		    "\n1:\tcmpxchgw %2, "__percpu_arg(1)		\
 		    "\n\tjnz 1b"					\
-			    : "=a" (pxo_ret__), "+m" (var)		\
+			    : "=&a" (pxo_ret__), "+m" (var)		\
 			    : "r" (pxo_new__)				\
 			    : "memory");				\
 		break;							\
 	case 4:								\
-		asm("\n1:mov "__percpu_arg(1)",%%eax"			\
-		    "\n\tcmpxchgl %2, "__percpu_arg(1)			\
+		asm("\n\tmov "__percpu_arg(1)",%%eax"			\
+		    "\n1:\tcmpxchgl %2, "__percpu_arg(1)		\
 		    "\n\tjnz 1b"					\
-			    : "=a" (pxo_ret__), "+m" (var)		\
+			    : "=&a" (pxo_ret__), "+m" (var)		\
 			    : "r" (pxo_new__)				\
 			    : "memory");				\
 		break;							\
 	case 8:								\
-		asm("\n1:mov "__percpu_arg(1)",%%rax"			\
-		    "\n\tcmpxchgq %2, "__percpu_arg(1)			\
+		asm("\n\tmov "__percpu_arg(1)",%%rax"			\
+		    "\n1:\tcmpxchgq %2, "__percpu_arg(1)		\
 		    "\n\tjnz 1b"					\
-			    : "=a" (pxo_ret__), "+m" (var)		\
+			    : "=&a" (pxo_ret__), "+m" (var)		\
 			    : "r" (pxo_new__)				\
 			    : "memory");				\
 		break;							\
-- 
cgit v1.2.3-18-g5258


From cf04d120d9413de581437cf9a29f138ec1178f65 Mon Sep 17 00:00:00 2001
From: Stefan Bader <stefan.bader@canonical.com>
Date: Thu, 27 Jan 2011 10:03:14 -0500
Subject: xen/p2m: Mark INVALID_P2M_ENTRY the mfn_list past max_pfn.

In case the mfn_list does not have enough entries to fill
a p2m page we do not want the entries from max_pfn up to
the boundary to be filled with unknown values. Hence
set them to INVALID_P2M_ENTRY.

Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/xen/p2m.c | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index ddc81a06edb..fd12d7ce7ff 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -241,21 +241,15 @@ void __init xen_build_dynamic_phys_to_machine(void)
 		 * As long as the mfn_list has enough entries to completely
 		 * fill a p2m page, pointing into the array is ok. But if
 		 * not the entries beyond the last pfn will be undefined.
-		 * And guessing that the 'what-ever-there-is' does not take it
-		 * too kindly when changing it to invalid markers, a new page
-		 * is allocated, initialized and filled with the valid part.
 		 */
 		if (unlikely(pfn + P2M_PER_PAGE > max_pfn)) {
 			unsigned long p2midx;
-			unsigned long *p2m = extend_brk(PAGE_SIZE, PAGE_SIZE);
-			p2m_init(p2m);
-
-			for (p2midx = 0; pfn + p2midx < max_pfn; p2midx++) {
-				p2m[p2midx] = mfn_list[pfn + p2midx];
-			}
-			p2m_top[topidx][mididx] = p2m;
-		} else
-			p2m_top[topidx][mididx] = &mfn_list[pfn];
+
+			p2midx = max_pfn % P2M_PER_PAGE;
+			for ( ; p2midx < P2M_PER_PAGE; p2midx++)
+				mfn_list[pfn + p2midx] = INVALID_P2M_ENTRY;
+		}
+		p2m_top[topidx][mididx] = &mfn_list[pfn];
 	}
 
 	m2p_override_init();
-- 
cgit v1.2.3-18-g5258


From 7cb31b752c71e0bd405c1139e1907c3335877dff Mon Sep 17 00:00:00 2001
From: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
Date: Thu, 27 Jan 2011 10:13:25 -0500
Subject: xen/e820: Guard against E820_RAM not having page-aligned size or
 start.

Under Dell Inspiron 1525, and Intel SandyBridge SDP's the
BIOS e820 RAM is not page-aligned:

[   0.000000]  Xen: 0000000000100000 - 00000000df66d800 (usable)

We were not handling that and ended up setting up a pagetable
that included up to df66e000 with the disastrous effect that when

        memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));

tried to clear the page it would crash at the 2K mark.

Initially reported by Michael Young @
http://lists.xensource.com/archives/html/xen-devel/2011-01/msg00108.html

The fix is to page-align the size and also take into consideration
the start of the E820 (in case that is not page-aligned either). This
fixes the bootup failure on those affected machines.

This patch is a rework of the Micheal A Young initial patch and
considers the case if the start is not page-aligned.

Reported-by: Michael A Young <m.a.young@durham.ac.uk>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Signed-off-by: Michael A Young <m.a.young@durham.ac.uk>
---
 arch/x86/xen/setup.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index b5a7f928234..75bdf2ab3d7 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -179,8 +179,13 @@ char * __init xen_memory_setup(void)
 	e820.nr_map = 0;
 	xen_extra_mem_start = mem_end;
 	for (i = 0; i < memmap.nr_entries; i++) {
-		unsigned long long end = map[i].addr + map[i].size;
+		unsigned long long end;
 
+		/* Guard against non-page aligned E820 entries. */
+		if (map[i].type == E820_RAM)
+			map[i].size -= (map[i].size + map[i].addr) % PAGE_SIZE;
+
+		end = map[i].addr + map[i].size;
 		if (map[i].type == E820_RAM && end > mem_end) {
 			/* RAM off the end - may be partially included */
 			u64 delta = min(map[i].size, end - mem_end);
-- 
cgit v1.2.3-18-g5258


From 23febeddbe67e5160929f7c48f7bfe83c2eecb99 Mon Sep 17 00:00:00 2001
From: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
Date: Wed, 26 Jan 2011 17:07:27 +0000
Subject: xen/setup: Route halt operations to safe_halt pvop.

With this patch, the cpuidle driver does not load and
does not issue the mwait operations. Instead the hypervisor
is doing them (b/c we call the safe_halt pvops call).

This fixes quite a lot of bootup issues wherein the user had
to force interrupts for the continuation of the bootup.

Details are discussed in:

http://lists.xensource.com/archives/html/xen-devel/2011-01/msg00535.html

[v2: Wrote the commit description]

Reported-by: Daniel De Graaf <dgdegra@tycho.nsa.gov>
Tested-by: Daniel De Graaf <dgdegra@tycho.nsa.gov>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/xen/setup.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 75bdf2ab3d7..a8a66a50d44 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -355,6 +355,7 @@ void __init xen_arch_setup(void)
 	boot_cpu_data.hlt_works_ok = 1;
 #endif
 	pm_idle = default_idle;
+	boot_option_idle_override = IDLE_HALT;
 
 	fiddle_vdso();
 }
-- 
cgit v1.2.3-18-g5258


From d038b12c6d773a4b9f69ca5243773bf6314f7ee9 Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@google.com>
Date: Tue, 25 Jan 2011 17:32:01 +0200
Subject: perf: Fix Pentium4 raw event validation

This patch fixes some issues with raw event validation on
Pentium 4 (Netburst) based processors.

As I was testing libpfm4 Netburst support, I ran into two
problems in the p4_validate_raw_event() function:

   - the shared field must be checked ONLY when HT is on
   - the binding to ESCR register was missing

The second item was causing raw events to not be encoded
correctly compared to generic PMU events.

With this patch, I can now pass Netburst events to libpfm4
examples and get meaningful results:

  $ task -e global_power_events:running:u  noploop 1
  noploop for 1 seconds
  3,206,304,898 global_power_events:running

Signed-off-by: Stephane Eranian <eranian@google.com>
Acked-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: peterz@infradead.org
Cc: paulus@samba.org
Cc: davem@davemloft.net
Cc: fweisbec@gmail.com
Cc: perfmon2-devel@lists.sf.net
Cc: eranian@gmail.com
Cc: robert.richter@amd.com
Cc: acme@redhat.com
Cc: gorcunov@gmail.com
Cc: ming.m.lin@intel.com
LKML-Reference: <4d3efb2f.1252d80a.1a80.ffffc83f@mx.google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_p4.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index e56b9bfbabd..f7a0993c1e7 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -682,7 +682,7 @@ static int p4_validate_raw_event(struct perf_event *event)
 	 * if an event is shared accross the logical threads
 	 * the user needs special permissions to be able to use it
 	 */
-	if (p4_event_bind_map[v].shared) {
+	if (p4_ht_active() && p4_event_bind_map[v].shared) {
 		if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
 			return -EACCES;
 	}
@@ -727,7 +727,8 @@ static int p4_hw_config(struct perf_event *event)
 		event->hw.config = p4_set_ht_bit(event->hw.config);
 
 	if (event->attr.type == PERF_TYPE_RAW) {
-
+		struct p4_event_bind *bind;
+		unsigned int esel;
 		/*
 		 * Clear bits we reserve to be managed by kernel itself
 		 * and never allowed from a user space
@@ -743,6 +744,13 @@ static int p4_hw_config(struct perf_event *event)
 		 * bits since we keep additional info here (for cache events and etc)
 		 */
 		event->hw.config |= event->attr.config;
+		bind = p4_config_get_bind(event->attr.config);
+		if (!bind) {
+			rc = -EINVAL;
+			goto out;
+		}
+		esel = P4_OPCODE_ESEL(bind->opcode);
+		event->hw.config |= p4_config_pack_cccr(P4_CCCR_ESEL(esel));
 	}
 
 	rc = x86_setup_perfctr(event);
-- 
cgit v1.2.3-18-g5258


From f12d3d04e8f6223276abb068c5d72852174b8c31 Mon Sep 17 00:00:00 2001
From: Matthieu CASTET <castet.matthieu@free.fr>
Date: Thu, 20 Jan 2011 21:11:45 +0100
Subject: x86, nx: Don't force pages RW when setting NX bits

Xen want page table pages read only.

But the initial page table (from head_*.S) live in .data or .bss.

That was broken by 64edc8ed5ffae999d8d413ba006850e9e34166cb.  There is
absolutely no reason to force these pages RW after they have already
been marked RO.

Signed-off-by: Matthieu CASTET <castet.matthieu@free.fr>
Tested-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/mm/pageattr.c | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 8b830ca14ac..d343b3c81f3 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -256,7 +256,6 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
 				   unsigned long pfn)
 {
 	pgprot_t forbidden = __pgprot(0);
-	pgprot_t required = __pgprot(0);
 
 	/*
 	 * The BIOS area between 640k and 1Mb needs to be executable for
@@ -282,12 +281,6 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
 	if (within(pfn, __pa((unsigned long)__start_rodata) >> PAGE_SHIFT,
 		   __pa((unsigned long)__end_rodata) >> PAGE_SHIFT))
 		pgprot_val(forbidden) |= _PAGE_RW;
-	/*
-	 * .data and .bss should always be writable.
-	 */
-	if (within(address, (unsigned long)_sdata, (unsigned long)_edata) ||
-	    within(address, (unsigned long)__bss_start, (unsigned long)__bss_stop))
-		pgprot_val(required) |= _PAGE_RW;
 
 #if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
 	/*
@@ -327,7 +320,6 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
 #endif
 
 	prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
-	prot = __pgprot(pgprot_val(prot) | pgprot_val(required));
 
 	return prot;
 }
-- 
cgit v1.2.3-18-g5258


From f7448548a9f32db38f243ccd4271617758ddfe2c Mon Sep 17 00:00:00 2001
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Wed, 2 Feb 2011 17:02:55 -0800
Subject: x86, mtrr: Avoid MTRR reprogramming on BP during boot on UP platforms

Markus Kohn ran into a hard hang regression on an acer aspire
1310, when acpi is enabled. git bisect showed the following
commit as the bad one that introduced the boot regression.

	commit d0af9eed5aa91b6b7b5049cae69e5ea956fd85c3
	Author: Suresh Siddha <suresh.b.siddha@intel.com>
	Date:   Wed Aug 19 18:05:36 2009 -0700

	    x86, pat/mtrr: Rendezvous all the cpus for MTRR/PAT init

Because of the UP configuration of that platform,
native_smp_prepare_cpus() bailed out (in smp_sanity_check())
before doing the set_mtrr_aps_delayed_init()

Further down the boot path, native_smp_cpus_done() will call the
delayed MTRR initialization for the AP's (mtrr_aps_init()) with
mtrr_aps_delayed_init not set. This resulted in the boot
processor reprogramming its MTRR's to the values seen during the
start of the OS boot. While this is not needed ideally, this
shouldn't have caused any side-effects. This is because the
reprogramming of MTRR's (set_mtrr_state() that gets called via
set_mtrr()) will check if the live register contents are
different from what is being asked to write and will do the actual
write only if they are different.

BP's mtrr state is read during the start of the OS boot and
typically nothing would have changed when we ask to reprogram it
on BP again because of the above scenario on an UP platform. So
on a normal UP platform no reprogramming of BP MTRR MSR's
happens and all is well.

However, on this platform, bios seems to be modifying the fixed
mtrr range registers between the start of OS boot and when we
double check the live registers for reprogramming BP MTRR
registers. And as the live registers are modified, we end up
reprogramming the MTRR's to the state seen during the start of
the OS boot.

During ACPI initialization, something in the bios (probably smi
handler?) don't like this fact and results in a hard lockup.

We didn't see this boot hang issue on this platform before the
commit d0af9eed5aa91b6b7b5049cae69e5ea956fd85c3, because only
the AP's (if any) will program its MTRR's to the value that BP
had at the start of the OS boot.

Fix this issue by checking mtrr_aps_delayed_init before
continuing further in the mtrr_aps_init(). Now, only AP's (if
any) will program its MTRR's to the BP values during boot.

Addresses https://bugzilla.novell.com/show_bug.cgi?id=623393

  [ By the way, this behavior of the bios modifying MTRR's after the start
    of the OS boot is not common and the kernel is not prepared to
    handle this situation well. Irrespective of this issue, during
    suspend/resume, linux kernel will try to reprogram the BP's MTRR values
    to the values seen during the start of the OS boot. So suspend/resume might
    be already broken on this platform for all linux kernel versions. ]

Reported-and-bisected-by: Markus Kohn <jabber@gmx.org>
Tested-by: Markus Kohn <jabber@gmx.org>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Thomas Renninger <trenn@novell.com>
Cc: Rafael Wysocki <rjw@novell.com>
Cc: Venkatesh Pallipadi <venki@google.com>
Cc: stable@kernel.org # [v2.6.32+]
LKML-Reference: <1296694975.4418.402.camel@sbsiddha-MOBL3.sc.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/mtrr/main.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 01c0f3ee6cc..bebabec5b44 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -793,13 +793,21 @@ void set_mtrr_aps_delayed_init(void)
 }
 
 /*
- * MTRR initialization for all AP's
+ * Delayed MTRR initialization for all AP's
  */
 void mtrr_aps_init(void)
 {
 	if (!use_intel())
 		return;
 
+	/*
+	 * Check if someone has requested the delay of AP MTRR initialization,
+	 * by doing set_mtrr_aps_delayed_init(), prior to this point. If not,
+	 * then we are done.
+	 */
+	if (!mtrr_aps_delayed_init)
+		return;
+
 	set_mtrr(~0U, 0, 0, 0);
 	mtrr_aps_delayed_init = false;
 }
-- 
cgit v1.2.3-18-g5258


From 831d52bc153971b70e64eccfbed2b232394f22f8 Mon Sep 17 00:00:00 2001
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Thu, 3 Feb 2011 12:20:04 -0800
Subject: x86, mm: avoid possible bogus tlb entries by clearing prev mm_cpumask
 after switching mm

Clearing the cpu in prev's mm_cpumask early will avoid the flush tlb
IPI's while the cr3 is still pointing to the prev mm.  And this window
can lead to the possibility of bogus TLB fills resulting in strange
failures.  One such problematic scenario is mentioned below.

 T1. CPU-1 is context switching from mm1 to mm2 context and got a NMI
     etc between the point of clearing the cpu from the mm_cpumask(mm1)
     and before reloading the cr3 with the new mm2.

 T2. CPU-2 is tearing down a specific vma for mm1 and will proceed with
     flushing the TLB for mm1.  It doesn't send the flush TLB to CPU-1
     as it doesn't see that cpu listed in the mm_cpumask(mm1).

 T3. After the TLB flush is complete, CPU-2 goes ahead and frees the
     page-table pages associated with the removed vma mapping.

 T4. CPU-2 now allocates those freed page-table pages for something
     else.

 T5. As the CR3 and TLB caches for mm1 is still active on CPU-1, CPU-1
     can potentially speculate and walk through the page-table caches
     and can insert new TLB entries.  As the page-table pages are
     already freed and being used on CPU-2, this page walk can
     potentially insert a bogus global TLB entry depending on the
     (random) contents of the page that is being used on CPU-2.

 T6. This bogus TLB entry being global will be active across future CR3
     changes and can result in weird memory corruption etc.

To avoid this issue, for the prev mm that is handing over the cpu to
another mm, clear the cpu from the mm_cpumask(prev) after the cr3 is
changed.

Marking it for -stable, though we haven't seen any reported failure that
can be attributed to this.

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: stable@kernel.org	[v2.6.32+]
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/mmu_context.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 4a2d4e0c18d..8b5393ec108 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -36,8 +36,6 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 	unsigned cpu = smp_processor_id();
 
 	if (likely(prev != next)) {
-		/* stop flush ipis for the previous mm */
-		cpumask_clear_cpu(cpu, mm_cpumask(prev));
 #ifdef CONFIG_SMP
 		percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
 		percpu_write(cpu_tlbstate.active_mm, next);
@@ -47,6 +45,9 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 		/* Re-load page tables */
 		load_cr3(next->pgd);
 
+		/* stop flush ipis for the previous mm */
+		cpumask_clear_cpu(cpu, mm_cpumask(prev));
+
 		/*
 		 * load the LDT, if the LDT is different:
 		 */
-- 
cgit v1.2.3-18-g5258


From 11d4c3f9b671720e80353dd7e433ff2bf65e9500 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Fri, 4 Feb 2011 16:14:11 -0800
Subject: x86-32: Make sure the stack is set up before we use it

Since checkin ebba638ae723d8a8fc2f7abce5ec18b688b791d7 we call
verify_cpu even in 32-bit mode.  Unfortunately, calling a function
means using the stack, and the stack pointer was not initialized in
the 32-bit setup code!  This code initializes the stack pointer, and
simplifies the interface slightly since it is easier to rely on just a
pointer value rather than a descriptor; we need to have different
values for the segment register anyway.

This retains start_stack as a virtual address, even though a physical
address would be more convenient for 32 bits; the 64-bit code wants
the other way around...

Reported-by: Matthieu Castet <castet.matthieu@free.fr>
LKML-Reference: <4D41E86D.8060205@free.fr>
Tested-by: Kees Cook <kees.cook@canonical.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/include/asm/smp.h   |  5 +----
 arch/x86/kernel/acpi/sleep.c |  2 +-
 arch/x86/kernel/head_32.S    | 30 +++++++++++++-----------------
 arch/x86/kernel/smpboot.c    |  4 ++--
 4 files changed, 17 insertions(+), 24 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 4c2f63c7fc1..1f469513677 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -40,10 +40,7 @@ DECLARE_EARLY_PER_CPU(u16, x86_cpu_to_apicid);
 DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid);
 
 /* Static state in head.S used to set up a CPU */
-extern struct {
-	void *sp;
-	unsigned short ss;
-} stack_start;
+extern unsigned long stack_start; /* Initial stack pointer address */
 
 struct smp_ops {
 	void (*smp_prepare_boot_cpu)(void);
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 69fd72aa559..4d9ebbab223 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -100,7 +100,7 @@ int acpi_save_state_mem(void)
 #else /* CONFIG_64BIT */
 	header->trampoline_segment = setup_trampoline() >> 4;
 #ifdef CONFIG_SMP
-	stack_start.sp = temp_stack + sizeof(temp_stack);
+	stack_start = (unsigned long)temp_stack + sizeof(temp_stack);
 	early_gdt_descr.address =
 			(unsigned long)get_cpu_gdt_table(smp_processor_id());
 	initial_gs = per_cpu_offset(smp_processor_id());
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index fc293dc8dc3..767d6c43de3 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -85,6 +85,8 @@ RESERVE_BRK(pagetables, INIT_MAP_SIZE)
  */
 __HEAD
 ENTRY(startup_32)
+	movl pa(stack_start),%ecx
+	
 	/* test KEEP_SEGMENTS flag to see if the bootloader is asking
 		us to not reload segments */
 	testb $(1<<6), BP_loadflags(%esi)
@@ -99,7 +101,9 @@ ENTRY(startup_32)
 	movl %eax,%es
 	movl %eax,%fs
 	movl %eax,%gs
+	movl %eax,%ss
 2:
+	leal -__PAGE_OFFSET(%ecx),%esp
 
 /*
  * Clear BSS first so that there are no surprises...
@@ -145,8 +149,6 @@ ENTRY(startup_32)
  * _brk_end is set up to point to the first "safe" location.
  * Mappings are created both at virtual address 0 (identity mapping)
  * and PAGE_OFFSET for up to _end.
- *
- * Note that the stack is not yet set up!
  */
 #ifdef CONFIG_X86_PAE
 
@@ -282,6 +284,9 @@ ENTRY(startup_32_smp)
 	movl %eax,%es
 	movl %eax,%fs
 	movl %eax,%gs
+	movl pa(stack_start),%ecx
+	movl %eax,%ss
+	leal -__PAGE_OFFSET(%ecx),%esp
 #endif /* CONFIG_SMP */
 default_entry:
 
@@ -347,8 +352,8 @@ default_entry:
 	movl %eax,%cr0		/* ..and set paging (PG) bit */
 	ljmp $__BOOT_CS,$1f	/* Clear prefetch and normalize %eip */
 1:
-	/* Set up the stack pointer */
-	lss stack_start,%esp
+	/* Shift the stack pointer to a virtual address */
+	addl $__PAGE_OFFSET, %esp
 
 /*
  * Initialize eflags.  Some BIOS's leave bits like NT set.  This would
@@ -360,9 +365,7 @@ default_entry:
 
 #ifdef CONFIG_SMP
 	cmpb $0, ready
-	jz  1f				/* Initial CPU cleans BSS */
-	jmp checkCPUtype
-1:
+	jnz checkCPUtype
 #endif /* CONFIG_SMP */
 
 /*
@@ -470,14 +473,7 @@ is386:	movl $2,%ecx		# set MP
 
 	cld			# gcc2 wants the direction flag cleared at all times
 	pushl $0		# fake return address for unwinder
-#ifdef CONFIG_SMP
-	movb ready, %cl
 	movb $1, ready
-	cmpb $0,%cl		# the first CPU calls start_kernel
-	je   1f
-	movl (stack_start), %esp
-1:
-#endif /* CONFIG_SMP */
 	jmp *(initial_code)
 
 /*
@@ -670,15 +666,15 @@ ENTRY(initial_page_table)
 #endif
 
 .data
+.balign 4
 ENTRY(stack_start)
 	.long init_thread_union+THREAD_SIZE
-	.long __BOOT_DS
-
-ready:	.byte 0
 
 early_recursion_flag:
 	.long 0
 
+ready:	.byte 0
+
 int_msg:
 	.asciz "Unknown interrupt or fault at: %p %p %p\n"
 
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 0cbe8c0b35e..03273b6c272 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -638,7 +638,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
 	 * target processor state.
 	 */
 	startup_ipi_hook(phys_apicid, (unsigned long) start_secondary,
-			 (unsigned long)stack_start.sp);
+			 stack_start);
 
 	/*
 	 * Run STARTUP IPI loop.
@@ -785,7 +785,7 @@ do_rest:
 #endif
 	early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
 	initial_code = (unsigned long)start_secondary;
-	stack_start.sp = (void *) c_idle.idle->thread.sp;
+	stack_start  = c_idle.idle->thread.sp;
 
 	/* start_ip had better be page-aligned! */
 	start_ip = setup_trampoline();
-- 
cgit v1.2.3-18-g5258


From d344e38b2c151ca5e5e39f562017127e93912528 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Sun, 6 Feb 2011 21:16:09 -0800
Subject: x86, nx: Mark the ACPI resume trampoline code as +x

We reserve lowmem for the things that need it, like the ACPI
wakeup code, way early to guarantee availability.  This happens
before we set up the proper pagetables, so set_memory_x() has no
effect.

Until we have a better solution, use an initcall to mark the
wakeup code executable.

Originally-by: Matthieu Castet <castet.matthieu@free.fr>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: Matthias Hopf <mhopf@suse.de>
Cc: rjw@sisk.pl
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
LKML-Reference: <4D4F8019.2090104@zytor.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/acpi/sleep.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 4d9ebbab223..68d1537b8c8 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -12,10 +12,8 @@
 #include <linux/cpumask.h>
 #include <asm/segment.h>
 #include <asm/desc.h>
-
-#ifdef CONFIG_X86_32
 #include <asm/pgtable.h>
-#endif
+#include <asm/cacheflush.h>
 
 #include "realmode/wakeup.h"
 #include "sleep.h"
@@ -149,6 +147,15 @@ void __init acpi_reserve_wakeup_memory(void)
 	memblock_x86_reserve_range(mem, mem + WAKEUP_SIZE, "ACPI WAKEUP");
 }
 
+int __init acpi_configure_wakeup_memory(void)
+{
+	if (acpi_realmode)
+		set_memory_x(acpi_realmode, WAKEUP_SIZE >> PAGE_SHIFT);
+
+	return 0;
+}
+arch_initcall(acpi_configure_wakeup_memory);
+
 
 static int __init acpi_sleep_setup(char *str)
 {
-- 
cgit v1.2.3-18-g5258