Diffstat (limited to 'arch/x86_64')
-rw-r--r--  arch/x86_64/Kconfig                    |  68
-rw-r--r--  arch/x86_64/Makefile                   |   6
-rw-r--r--  arch/x86_64/boot/compressed/head.S     |  11
-rw-r--r--  arch/x86_64/boot/compressed/misc.c     |  12
-rw-r--r--  arch/x86_64/boot/install.sh            |   2
-rw-r--r--  arch/x86_64/boot/setup.S               |   8
-rw-r--r--  arch/x86_64/boot/tools/build.c         |   5
-rw-r--r--  arch/x86_64/crypto/Makefile            |   9
-rw-r--r--  arch/x86_64/crypto/aes-x86_64-asm.S    | 186
-rw-r--r--  arch/x86_64/crypto/aes.c               | 324
-rw-r--r--  arch/x86_64/ia32/ia32entry.S           |   2
-rw-r--r--  arch/x86_64/kernel/Makefile            |   1
-rw-r--r--  arch/x86_64/kernel/acpi/wakeup.S       |   2
-rw-r--r--  arch/x86_64/kernel/apic.c              |  46
-rw-r--r--  arch/x86_64/kernel/crash.c             |  35
-rw-r--r--  arch/x86_64/kernel/e820.c              |   6
-rw-r--r--  arch/x86_64/kernel/genapic_flat.c      | 110
-rw-r--r--  arch/x86_64/kernel/head.S              |  18
-rw-r--r--  arch/x86_64/kernel/i387.c              |   2
-rw-r--r--  arch/x86_64/kernel/i8259.c             |  12
-rw-r--r--  arch/x86_64/kernel/io_apic.c           |  37
-rw-r--r--  arch/x86_64/kernel/irq.c               |  29
-rw-r--r--  arch/x86_64/kernel/kprobes.c           | 236
-rw-r--r--  arch/x86_64/kernel/machine_kexec.c     | 250
-rw-r--r--  arch/x86_64/kernel/mce.c               |  10
-rw-r--r--  arch/x86_64/kernel/mce_intel.c         |   4
-rw-r--r--  arch/x86_64/kernel/nmi.c               |   4
-rw-r--r--  arch/x86_64/kernel/process.c           |  60
-rw-r--r--  arch/x86_64/kernel/reboot.c            |  62
-rw-r--r--  arch/x86_64/kernel/relocate_kernel.S   | 143
-rw-r--r--  arch/x86_64/kernel/setup.c             |  51
-rw-r--r--  arch/x86_64/kernel/setup64.c           |   6
-rw-r--r--  arch/x86_64/kernel/signal.c            |   2
-rw-r--r--  arch/x86_64/kernel/smp.c               |  10
-rw-r--r--  arch/x86_64/kernel/smpboot.c           | 317
-rw-r--r--  arch/x86_64/kernel/suspend.c           |  19
-rw-r--r--  arch/x86_64/kernel/traps.c             |   8
-rw-r--r--  arch/x86_64/kernel/vmlinux.lds.S       | 132
-rw-r--r--  arch/x86_64/mm/numa.c                  |   2
-rw-r--r--  arch/x86_64/pci/mmconfig.c             |  68
40 files changed, 1869 insertions, 446 deletions
diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig
index db259757dc8..4b8326177c5 100644
--- a/arch/x86_64/Kconfig
+++ b/arch/x86_64/Kconfig
@@ -207,33 +207,6 @@ config SMP
If you don't know what to do here, say N.
-config PREEMPT
- bool "Preemptible Kernel"
- ---help---
- This option reduces the latency of the kernel when reacting to
- real-time or interactive events by allowing a low priority process to
- be preempted even if it is in kernel mode executing a system call.
- This allows applications to run more reliably even when the system is
- under load. On contrary it may also break your drivers and add
- priority inheritance problems to your system. Don't select it if
- you rely on a stable system or have slightly obscure hardware.
- It's also not very well tested on x86-64 currently.
- You have been warned.
-
- Say Y here if you are feeling brave and building a kernel for a
- desktop, embedded or real-time system. Say N if you are unsure.
-
-config PREEMPT_BKL
- bool "Preempt The Big Kernel Lock"
- depends on PREEMPT
- default y
- help
- This option reduces the latency of the kernel by making the
- big kernel lock preemptible.
-
- Say Y here if you are building a kernel for a desktop system.
- Say N if you are unsure.
-
config SCHED_SMT
bool "SMT (Hyperthreading) scheduler support"
depends on SMP
@@ -244,6 +217,8 @@ config SCHED_SMT
cost of slightly increased overhead in some places. If unsure say
N here.
+source "kernel/Kconfig.preempt"
+
config K8_NUMA
bool "K8 NUMA support"
select NUMA
@@ -313,6 +288,15 @@ config NR_CPUS
This is purely to save memory - each supported CPU requires
memory in the static kernel configuration.
+config HOTPLUG_CPU
+ bool "Support for hot-pluggable CPUs (EXPERIMENTAL)"
+ depends on SMP && HOTPLUG && EXPERIMENTAL
+ help
+ Say Y here to experiment with turning CPUs off and on. CPUs
+ can be controlled through /sys/devices/system/cpu/cpu#.
+ Say N if you want to disable CPU hotplug.
+
+
config HPET_TIMER
bool
default y
@@ -385,6 +369,34 @@ config X86_MCE_INTEL
Additional support for intel specific MCE features such as
the thermal monitor.
+config PHYSICAL_START
+ hex "Physical address where the kernel is loaded" if EMBEDDED
+ default "0x100000"
+ help
+ This gives the physical address where the kernel is loaded.
+ Primarily used in the case of kexec on panic, where the
+ fail-safe kernel needs to run at a different address than
+ the panicked kernel.
+
+ Don't change this unless you know what you are doing.
+
+config KEXEC
+ bool "kexec system call (EXPERIMENTAL)"
+ depends on EXPERIMENTAL
+ help
+ kexec is a system call that implements the ability to shut down your
+ current kernel and start another kernel. It is like a reboot
+ but it is independent of the system firmware. And like a reboot
+ you can start any kernel with it, not just Linux.
+
+ The name comes from the similarity to the exec system call.
+
+ It is an ongoing process to be certain the hardware in a machine
+ is properly shut down, so do not be surprised if this code does not
+ initially work for you. It may help to enable device hotplugging
+ support. As of this writing the exact hardware interface is
+ strongly in flux, so no good recommendation can be made.
+
config SECCOMP
bool "Enable seccomp to safely compute untrusted bytecode"
depends on PROC_FS
@@ -503,6 +515,8 @@ config UID16
endmenu
+source "net/Kconfig"
+
source drivers/Kconfig
source "drivers/firmware/Kconfig"
diff --git a/arch/x86_64/Makefile b/arch/x86_64/Makefile
index 6f90c246c41..42891569767 100644
--- a/arch/x86_64/Makefile
+++ b/arch/x86_64/Makefile
@@ -35,7 +35,7 @@ export IA32_CC IA32_LD IA32_AS IA32_OBJCOPY IA32_CPP
LDFLAGS := -m elf_x86_64
OBJCOPYFLAGS := -O binary -R .note -R .comment -S
-LDFLAGS_vmlinux := -e stext
+LDFLAGS_vmlinux :=
CHECKFLAGS += -D__x86_64__ -m64
@@ -65,7 +65,9 @@ CFLAGS += $(call cc-option,-mno-sse -mno-mmx -mno-sse2 -mno-3dnow,)
head-y := arch/x86_64/kernel/head.o arch/x86_64/kernel/head64.o arch/x86_64/kernel/init_task.o
libs-y += arch/x86_64/lib/
-core-y += arch/x86_64/kernel/ arch/x86_64/mm/
+core-y += arch/x86_64/kernel/ \
+ arch/x86_64/mm/ \
+ arch/x86_64/crypto/
core-$(CONFIG_IA32_EMULATION) += arch/x86_64/ia32/
drivers-$(CONFIG_PCI) += arch/x86_64/pci/
drivers-$(CONFIG_OPROFILE) += arch/x86_64/oprofile/
diff --git a/arch/x86_64/boot/compressed/head.S b/arch/x86_64/boot/compressed/head.S
index 27264dbd575..6f55565e4d4 100644
--- a/arch/x86_64/boot/compressed/head.S
+++ b/arch/x86_64/boot/compressed/head.S
@@ -2,8 +2,6 @@
* linux/boot/head.S
*
* Copyright (C) 1991, 1992, 1993 Linus Torvalds
- *
- * $Id: head.S,v 1.3 2001/04/20 00:59:28 ak Exp $
*/
/*
@@ -21,13 +19,14 @@
*/
/*
- * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
+ * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
*/
.code32
.text
#include <linux/linkage.h>
#include <asm/segment.h>
+#include <asm/page.h>
.code32
.globl startup_32
@@ -77,7 +76,7 @@ startup_32:
jnz 3f
addl $8,%esp
xorl %ebx,%ebx
- ljmp $(__KERNEL_CS), $0x100000
+ ljmp $(__KERNEL_CS), $__PHYSICAL_START
/*
* We come here, if we were loaded high.
@@ -103,7 +102,7 @@ startup_32:
popl %ecx # lcount
popl %edx # high_buffer_start
popl %eax # hcount
- movl $0x100000,%edi
+ movl $__PHYSICAL_START,%edi
cli # make sure we don't get interrupted
ljmp $(__KERNEL_CS), $0x1000 # and jump to the move routine
@@ -128,7 +127,7 @@ move_routine_start:
movsl
movl %ebx,%esi # Restore setup pointer
xorl %ebx,%ebx
- ljmp $(__KERNEL_CS), $0x100000
+ ljmp $(__KERNEL_CS), $__PHYSICAL_START
move_routine_end:
diff --git a/arch/x86_64/boot/compressed/misc.c b/arch/x86_64/boot/compressed/misc.c
index c8b9216f9e6..b38d5b8b5fb 100644
--- a/arch/x86_64/boot/compressed/misc.c
+++ b/arch/x86_64/boot/compressed/misc.c
@@ -11,6 +11,7 @@
#include "miscsetup.h"
#include <asm/io.h>
+#include <asm/page.h>
/*
* gzip declarations
@@ -92,8 +93,11 @@ static unsigned long output_ptr = 0;
static void *malloc(int size);
static void free(void *where);
+void* memset(void* s, int c, unsigned n);
+void* memcpy(void* dest, const void* src, unsigned n);
+
static void putstr(const char *);
-
+
extern int end;
static long free_mem_ptr = (long)&end;
static long free_mem_end_ptr;
@@ -284,7 +288,7 @@ void setup_normal_output_buffer(void)
#else
if ((ALT_MEM_K > EXT_MEM_K ? ALT_MEM_K : EXT_MEM_K) < 1024) error("Less than 2MB of memory");
#endif
- output_data = (char *)0x100000; /* Points to 1M */
+ output_data = (char *)__PHYSICAL_START; /* normally points to 1MB */
free_mem_end_ptr = (long)real_mode;
}
@@ -307,8 +311,8 @@ void setup_output_buffer_if_we_run_high(struct moveparams *mv)
low_buffer_size = low_buffer_end - LOW_BUFFER_START;
high_loaded = 1;
free_mem_end_ptr = (long)high_buffer_start;
- if ( (0x100000 + low_buffer_size) > ((ulg)high_buffer_start)) {
- high_buffer_start = (uch *)(0x100000 + low_buffer_size);
+ if ( (__PHYSICAL_START + low_buffer_size) > ((ulg)high_buffer_start)) {
+ high_buffer_start = (uch *)(__PHYSICAL_START + low_buffer_size);
mv->hcount = 0; /* say: we need not to move high_buffer */
}
else mv->hcount = -1;
diff --git a/arch/x86_64/boot/install.sh b/arch/x86_64/boot/install.sh
index f17b40dfc0f..198af15a775 100644
--- a/arch/x86_64/boot/install.sh
+++ b/arch/x86_64/boot/install.sh
@@ -1,6 +1,6 @@
#!/bin/sh
#
-# arch/i386/boot/install.sh
+# arch/x86_64/boot/install.sh
#
# This file is subject to the terms and conditions of the GNU General Public
# License. See the file "COPYING" in the main directory of this archive
diff --git a/arch/x86_64/boot/setup.S b/arch/x86_64/boot/setup.S
index 75d4d2ad93b..ff58b2832b7 100644
--- a/arch/x86_64/boot/setup.S
+++ b/arch/x86_64/boot/setup.S
@@ -33,7 +33,7 @@
* Transcribed from Intel (as86) -> AT&T (gas) by Chris Noe, May 1999.
* <stiker@northlink.com>
*
- * Fix to work around buggy BIOSes which dont use carry bit correctly
+ * Fix to work around buggy BIOSes which don't use carry bit correctly
* and/or report extended memory in CX/DX for e801h memory size detection
* call. As a result the kernel got wrong figures. The int15/e801h docs
* from Ralf Brown interrupt list seem to indicate AX/BX should be used
@@ -383,7 +383,7 @@ sse_ok:
# a whole bunch of different types, and allows memory holes and
# everything. We scan through this memory map and build a list
# of the first 32 memory areas, which we return at [E820MAP].
-# This is documented at http://www.teleport.com/~acpi/acpihtml/topic245.htm
+# This is documented at http://www.acpi.info/, in the ACPI 2.0 specification.
#define SMAP 0x534d4150
@@ -436,7 +436,7 @@ bail820:
meme801:
stc # fix to work around buggy
- xorw %cx,%cx # BIOSes which dont clear/set
+ xorw %cx,%cx # BIOSes which don't clear/set
xorw %dx,%dx # carry on pass/error of
# e801h memory size call
# or merely pass cx,dx though
@@ -733,7 +733,7 @@ flush_instr:
#
# but we yet haven't reloaded the CS register, so the default size
# of the target offset still is 16 bit.
-# However, using an operant prefix (0x66), the CPU will properly
+# However, using an operand prefix (0x66), the CPU will properly
# take our 48 bit far pointer. (INTeL 80386 Programmer's Reference
# Manual, Mixing 16-bit and 32-bit code, page 16-6)
diff --git a/arch/x86_64/boot/tools/build.c b/arch/x86_64/boot/tools/build.c
index c2fa6631317..18b5bac1c42 100644
--- a/arch/x86_64/boot/tools/build.c
+++ b/arch/x86_64/boot/tools/build.c
@@ -1,6 +1,4 @@
/*
- * $Id: build.c,v 1.3 2001/06/26 15:14:50 pavel Exp $
- *
* Copyright (C) 1991, 1992 Linus Torvalds
* Copyright (C) 1997 Martin Mares
*/
@@ -8,7 +6,8 @@
/*
* This file builds a disk-image from three different files:
*
- * - bootsect: exactly 512 bytes of 8086 machine code, loads the rest
+ * - bootsect: compatibility mbr which prints an error message if
+ * someone tries to boot the kernel directly.
* - setup: 8086 machine code, sets up system parm
* - system: 80386 code for actual system
*
diff --git a/arch/x86_64/crypto/Makefile b/arch/x86_64/crypto/Makefile
new file mode 100644
index 00000000000..426d20f4b72
--- /dev/null
+++ b/arch/x86_64/crypto/Makefile
@@ -0,0 +1,9 @@
+#
+# x86_64/crypto/Makefile
+#
+# Arch-specific CryptoAPI modules.
+#
+
+obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
+
+aes-x86_64-y := aes-x86_64-asm.o aes.o
diff --git a/arch/x86_64/crypto/aes-x86_64-asm.S b/arch/x86_64/crypto/aes-x86_64-asm.S
new file mode 100644
index 00000000000..483cbb23ab8
--- /dev/null
+++ b/arch/x86_64/crypto/aes-x86_64-asm.S
@@ -0,0 +1,186 @@
+/* AES (Rijndael) implementation (FIPS PUB 197) for x86_64
+ *
+ * Copyright (C) 2005 Andreas Steinmetz, <ast@domdv.de>
+ *
+ * License:
+ * This code can be distributed under the terms of the GNU General Public
+ * License (GPL) Version 2 provided that the above header down to and
+ * including this sentence is retained in full.
+ */
+
+.extern aes_ft_tab
+.extern aes_it_tab
+.extern aes_fl_tab
+.extern aes_il_tab
+
+.text
+
+#define R1 %rax
+#define R1E %eax
+#define R1X %ax
+#define R1H %ah
+#define R1L %al
+#define R2 %rbx
+#define R2E %ebx
+#define R2X %bx
+#define R2H %bh
+#define R2L %bl
+#define R3 %rcx
+#define R3E %ecx
+#define R3X %cx
+#define R3H %ch
+#define R3L %cl
+#define R4 %rdx
+#define R4E %edx
+#define R4X %dx
+#define R4H %dh
+#define R4L %dl
+#define R5 %rsi
+#define R5E %esi
+#define R6 %rdi
+#define R6E %edi
+#define R7 %rbp
+#define R7E %ebp
+#define R8 %r8
+#define R9 %r9
+#define R10 %r10
+#define R11 %r11
+
+#define prologue(FUNC,BASE,B128,B192,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11) \
+ .global FUNC; \
+ .type FUNC,@function; \
+ .align 8; \
+FUNC: movq r1,r2; \
+ movq r3,r4; \
+ leaq BASE+52(r8),r9; \
+ movq r10,r11; \
+ movl (r7),r5 ## E; \
+ movl 4(r7),r1 ## E; \
+ movl 8(r7),r6 ## E; \
+ movl 12(r7),r7 ## E; \
+ movl (r8),r10 ## E; \
+ xorl -48(r9),r5 ## E; \
+ xorl -44(r9),r1 ## E; \
+ xorl -40(r9),r6 ## E; \
+ xorl -36(r9),r7 ## E; \
+ cmpl $24,r10 ## E; \
+ jb B128; \
+ leaq 32(r9),r9; \
+ je B192; \
+ leaq 32(r9),r9;
+
+#define epilogue(r1,r2,r3,r4,r5,r6,r7,r8,r9) \
+ movq r1,r2; \
+ movq r3,r4; \
+ movl r5 ## E,(r9); \
+ movl r6 ## E,4(r9); \
+ movl r7 ## E,8(r9); \
+ movl r8 ## E,12(r9); \
+ ret;
+
+#define round(TAB,OFFSET,r1,r2,r3,r4,r5,r6,r7,r8,ra,rb,rc,rd) \
+ movzbl r2 ## H,r5 ## E; \
+ movzbl r2 ## L,r6 ## E; \
+ movl TAB+1024(,r5,4),r5 ## E;\
+ movw r4 ## X,r2 ## X; \
+ movl TAB(,r6,4),r6 ## E; \
+ roll $16,r2 ## E; \
+ shrl $16,r4 ## E; \
+ movzbl r4 ## H,r7 ## E; \
+ movzbl r4 ## L,r4 ## E; \
+ xorl OFFSET(r8),ra ## E; \
+ xorl OFFSET+4(r8),rb ## E; \
+ xorl TAB+3072(,r7,4),r5 ## E;\
+ xorl TAB+2048(,r4,4),r6 ## E;\
+ movzbl r1 ## L,r7 ## E; \
+ movzbl r1 ## H,r4 ## E; \
+ movl TAB+1024(,r4,4),r4 ## E;\
+ movw r3 ## X,r1 ## X; \
+ roll $16,r1 ## E; \
+ shrl $16,r3 ## E; \
+ xorl TAB(,r7,4),r5 ## E; \
+ movzbl r3 ## H,r7 ## E; \
+ movzbl r3 ## L,r3 ## E; \
+ xorl TAB+3072(,r7,4),r4 ## E;\
+ xorl TAB+2048(,r3,4),r5 ## E;\
+ movzbl r1 ## H,r7 ## E; \
+ movzbl r1 ## L,r3 ## E; \
+ shrl $16,r1 ## E; \
+ xorl TAB+3072(,r7,4),r6 ## E;\
+ movl TAB+2048(,r3,4),r3 ## E;\
+ movzbl r1 ## H,r7 ## E; \
+ movzbl r1 ## L,r1 ## E; \
+ xorl TAB+1024(,r7,4),r6 ## E;\
+ xorl TAB(,r1,4),r3 ## E; \
+ movzbl r2 ## H,r1 ## E; \
+ movzbl r2 ## L,r7 ## E; \
+ shrl $16,r2 ## E; \
+ xorl TAB+3072(,r1,4),r3 ## E;\
+ xorl TAB+2048(,r7,4),r4 ## E;\
+ movzbl r2 ## H,r1 ## E; \
+ movzbl r2 ## L,r2 ## E; \
+ xorl OFFSET+8(r8),rc ## E; \
+ xorl OFFSET+12(r8),rd ## E; \
+ xorl TAB+1024(,r1,4),r3 ## E;\
+ xorl TAB(,r2,4),r4 ## E;
+
+#define move_regs(r1,r2,r3,r4) \
+ movl r3 ## E,r1 ## E; \
+ movl r4 ## E,r2 ## E;
+
+#define entry(FUNC,BASE,B128,B192) \
+ prologue(FUNC,BASE,B128,B192,R2,R8,R7,R9,R1,R3,R4,R6,R10,R5,R11)
+
+#define return epilogue(R8,R2,R9,R7,R5,R6,R3,R4,R11)
+
+#define encrypt_round(TAB,OFFSET) \
+ round(TAB,OFFSET,R1,R2,R3,R4,R5,R6,R7,R10,R5,R6,R3,R4) \
+ move_regs(R1,R2,R5,R6)
+
+#define encrypt_final(TAB,OFFSET) \
+ round(TAB,OFFSET,R1,R2,R3,R4,R5,R6,R7,R10,R5,R6,R3,R4)
+
+#define decrypt_round(TAB,OFFSET) \
+ round(TAB,OFFSET,R2,R1,R4,R3,R6,R5,R7,R10,R5,R6,R3,R4) \
+ move_regs(R1,R2,R5,R6)
+
+#define decrypt_final(TAB,OFFSET) \
+ round(TAB,OFFSET,R2,R1,R4,R3,R6,R5,R7,R10,R5,R6,R3,R4)
+
+/* void aes_encrypt(void *ctx, u8 *out, const u8 *in) */
+
+ entry(aes_encrypt,0,enc128,enc192)
+ encrypt_round(aes_ft_tab,-96)
+ encrypt_round(aes_ft_tab,-80)
+enc192: encrypt_round(aes_ft_tab,-64)
+ encrypt_round(aes_ft_tab,-48)
+enc128: encrypt_round(aes_ft_tab,-32)
+ encrypt_round(aes_ft_tab,-16)
+ encrypt_round(aes_ft_tab, 0)
+ encrypt_round(aes_ft_tab, 16)
+ encrypt_round(aes_ft_tab, 32)
+ encrypt_round(aes_ft_tab, 48)
+ encrypt_round(aes_ft_tab, 64)
+ encrypt_round(aes_ft_tab, 80)
+ encrypt_round(aes_ft_tab, 96)
+ encrypt_final(aes_fl_tab,112)
+ return
+
+/* void aes_decrypt(void *ctx, u8 *out, const u8 *in) */
+
+ entry(aes_decrypt,240,dec128,dec192)
+ decrypt_round(aes_it_tab,-96)
+ decrypt_round(aes_it_tab,-80)
+dec192: decrypt_round(aes_it_tab,-64)
+ decrypt_round(aes_it_tab,-48)
+dec128: decrypt_round(aes_it_tab,-32)
+ decrypt_round(aes_it_tab,-16)
+ decrypt_round(aes_it_tab, 0)
+ decrypt_round(aes_it_tab, 16)
+ decrypt_round(aes_it_tab, 32)
+ decrypt_round(aes_it_tab, 48)
+ decrypt_round(aes_it_tab, 64)
+ decrypt_round(aes_it_tab, 80)
+ decrypt_round(aes_it_tab, 96)
+ decrypt_final(aes_il_tab,112)
+ return
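The round() macro above is the conventional four-table AES round, merely scheduled across the x86-64 registers; the TAB, TAB+1024, TAB+2048 and TAB+3072 offsets select the four rotated copies of the 256-entry lookup table that aes.c generates (the final round uses the aes_fl/aes_il tables, which omit the column mixing). A rough C rendering of one output column, where the names and layout are illustrative assumptions rather than the exact kernel interface:

    /* ft[k][x] is assumed to hold the k-byte-rotated copy of the forward
     * table, matching the aes_ft_tab layout built by gen_tabs() in aes.c. */
    static unsigned int round_column(const unsigned int ft[4][256],
                                     unsigned int a, unsigned int b,
                                     unsigned int c, unsigned int d,
                                     unsigned int rk)
    {
        return ft[0][a & 0xff] ^          /* byte 0 of one state word */
               ft[1][(b >> 8) & 0xff] ^   /* byte 1 of the next word */
               ft[2][(c >> 16) & 0xff] ^  /* byte 2 */
               ft[3][d >> 24] ^           /* byte 3 */
               rk;                        /* XOR in the round-key word */
    }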
diff --git a/arch/x86_64/crypto/aes.c b/arch/x86_64/crypto/aes.c
new file mode 100644
index 00000000000..2b5c4010ce3
--- /dev/null
+++ b/arch/x86_64/crypto/aes.c
@@ -0,0 +1,324 @@
+/*
+ * Cryptographic API.
+ *
+ * AES Cipher Algorithm.
+ *
+ * Based on Brian Gladman's code.
+ *
+ * Linux developers:
+ * Alexander Kjeldaas <astor@fast.no>
+ * Herbert Valerio Riedel <hvr@hvrlab.org>
+ * Kyle McMartin <kyle@debian.org>
+ * Adam J. Richter <adam@yggdrasil.com> (conversion to 2.5 API).
+ * Andreas Steinmetz <ast@domdv.de> (adapted to x86_64 assembler)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * ---------------------------------------------------------------------------
+ * Copyright (c) 2002, Dr Brian Gladman <brg@gladman.me.uk>, Worcester, UK.
+ * All rights reserved.
+ *
+ * LICENSE TERMS
+ *
+ * The free distribution and use of this software in both source and binary
+ * form is allowed (with or without changes) provided that:
+ *
+ * 1. distributions of this source code include the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ *
+ * 2. distributions in binary form include the above copyright
+ * notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other associated materials;
+ *
+ * 3. the copyright holder's name is not used to endorse products
+ * built using this software without specific written permission.
+ *
+ * ALTERNATIVELY, provided that this notice is retained in full, this product
+ * may be distributed under the terms of the GNU General Public License (GPL),
+ * in which case the provisions of the GPL apply INSTEAD OF those given above.
+ *
+ * DISCLAIMER
+ *
+ * This software is provided 'as is' with no explicit or implied warranties
+ * in respect of its properties, including, but not limited to, correctness
+ * and/or fitness for purpose.
+ * ---------------------------------------------------------------------------
+ */
+
+/* Some changes from the Gladman version:
+ s/RIJNDAEL(e_key)/E_KEY/g
+ s/RIJNDAEL(d_key)/D_KEY/g
+*/
+
+#include <asm/byteorder.h>
+#include <linux/bitops.h>
+#include <linux/crypto.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/types.h>
+
+#define AES_MIN_KEY_SIZE 16
+#define AES_MAX_KEY_SIZE 32
+
+#define AES_BLOCK_SIZE 16
+
+/*
+ * #define byte(x, nr) ((unsigned char)((x) >> (nr*8)))
+ */
+static inline u8 byte(const u32 x, const unsigned n)
+{
+ return x >> (n << 3);
+}
+
+#define u32_in(x) le32_to_cpu(*(const __le32 *)(x))
+
+struct aes_ctx
+{
+ u32 key_length;
+ u32 E[60];
+ u32 D[60];
+};
+
+#define E_KEY ctx->E
+#define D_KEY ctx->D
+
+static u8 pow_tab[256] __initdata;
+static u8 log_tab[256] __initdata;
+static u8 sbx_tab[256] __initdata;
+static u8 isb_tab[256] __initdata;
+static u32 rco_tab[10];
+u32 aes_ft_tab[4][256];
+u32 aes_it_tab[4][256];
+
+u32 aes_fl_tab[4][256];
+u32 aes_il_tab[4][256];
+
+static inline u8 f_mult(u8 a, u8 b)
+{
+ u8 aa = log_tab[a], cc = aa + log_tab[b];
+
+ return pow_tab[cc + (cc < aa ? 1 : 0)];
+}
+
+#define ff_mult(a, b) (a && b ? f_mult(a, b) : 0)
+
+#define ls_box(x) \
+ (aes_fl_tab[0][byte(x, 0)] ^ \
+ aes_fl_tab[1][byte(x, 1)] ^ \
+ aes_fl_tab[2][byte(x, 2)] ^ \
+ aes_fl_tab[3][byte(x, 3)])
+
+static void __init gen_tabs(void)
+{
+ u32 i, t;
+ u8 p, q;
+
+ /* log and power tables for GF(2**8) finite field with
+ 0x011b as modular polynomial - the simplest primitive
+ root is 0x03, used here to generate the tables */
+
+ for (i = 0, p = 1; i < 256; ++i) {
+ pow_tab[i] = (u8)p;
+ log_tab[p] = (u8)i;
+
+ p ^= (p << 1) ^ (p & 0x80 ? 0x01b : 0);
+ }
+
+ log_tab[1] = 0;
+
+ for (i = 0, p = 1; i < 10; ++i) {
+ rco_tab[i] = p;
+
+ p = (p << 1) ^ (p & 0x80 ? 0x01b : 0);
+ }
+
+ for (i = 0; i < 256; ++i) {
+ p = (i ? pow_tab[255 - log_tab[i]] : 0);
+ q = ((p >> 7) | (p << 1)) ^ ((p >> 6) | (p << 2));
+ p ^= 0x63 ^ q ^ ((q >> 6) | (q << 2));
+ sbx_tab[i] = p;
+ isb_tab[p] = (u8)i;
+ }
+
+ for (i = 0; i < 256; ++i) {
+ p = sbx_tab[i];
+
+ t = p;
+ aes_fl_tab[0][i] = t;
+ aes_fl_tab[1][i] = rol32(t, 8);
+ aes_fl_tab[2][i] = rol32(t, 16);
+ aes_fl_tab[3][i] = rol32(t, 24);
+
+ t = ((u32)ff_mult(2, p)) |
+ ((u32)p << 8) |
+ ((u32)p << 16) | ((u32)ff_mult(3, p) << 24);
+
+ aes_ft_tab[0][i] = t;
+ aes_ft_tab[1][i] = rol32(t, 8);
+ aes_ft_tab[2][i] = rol32(t, 16);
+ aes_ft_tab[3][i] = rol32(t, 24);
+
+ p = isb_tab[i];
+
+ t = p;
+ aes_il_tab[0][i] = t;
+ aes_il_tab[1][i] = rol32(t, 8);
+ aes_il_tab[2][i] = rol32(t, 16);
+ aes_il_tab[3][i] = rol32(t, 24);
+
+ t = ((u32)ff_mult(14, p)) |
+ ((u32)ff_mult(9, p) << 8) |
+ ((u32)ff_mult(13, p) << 16) |
+ ((u32)ff_mult(11, p) << 24);
+
+ aes_it_tab[0][i] = t;
+ aes_it_tab[1][i] = rol32(t, 8);
+ aes_it_tab[2][i] = rol32(t, 16);
+ aes_it_tab[3][i] = rol32(t, 24);
+ }
+}
+
+#define star_x(x) (((x) & 0x7f7f7f7f) << 1) ^ ((((x) & 0x80808080) >> 7) * 0x1b)
+
+#define imix_col(y, x) \
+ u = star_x(x); \
+ v = star_x(u); \
+ w = star_x(v); \
+ t = w ^ (x); \
+ (y) = u ^ v ^ w; \
+ (y) ^= ror32(u ^ t, 8) ^ \
+ ror32(v ^ t, 16) ^ \
+ ror32(t, 24)
+
+/* initialise the key schedule from the user supplied key */
+
+#define loop4(i) \
+{ \
+ t = ror32(t, 8); t = ls_box(t) ^ rco_tab[i]; \
+ t ^= E_KEY[4 * i]; E_KEY[4 * i + 4] = t; \
+ t ^= E_KEY[4 * i + 1]; E_KEY[4 * i + 5] = t; \
+ t ^= E_KEY[4 * i + 2]; E_KEY[4 * i + 6] = t; \
+ t ^= E_KEY[4 * i + 3]; E_KEY[4 * i + 7] = t; \
+}
+
+#define loop6(i) \
+{ \
+ t = ror32(t, 8); t = ls_box(t) ^ rco_tab[i]; \
+ t ^= E_KEY[6 * i]; E_KEY[6 * i + 6] = t; \
+ t ^= E_KEY[6 * i + 1]; E_KEY[6 * i + 7] = t; \
+ t ^= E_KEY[6 * i + 2]; E_KEY[6 * i + 8] = t; \
+ t ^= E_KEY[6 * i + 3]; E_KEY[6 * i + 9] = t; \
+ t ^= E_KEY[6 * i + 4]; E_KEY[6 * i + 10] = t; \
+ t ^= E_KEY[6 * i + 5]; E_KEY[6 * i + 11] = t; \
+}
+
+#define loop8(i) \
+{ \
+ t = ror32(t, 8); t = ls_box(t) ^ rco_tab[i]; \
+ t ^= E_KEY[8 * i]; E_KEY[8 * i + 8] = t; \
+ t ^= E_KEY[8 * i + 1]; E_KEY[8 * i + 9] = t; \
+ t ^= E_KEY[8 * i + 2]; E_KEY[8 * i + 10] = t; \
+ t ^= E_KEY[8 * i + 3]; E_KEY[8 * i + 11] = t; \
+ t = E_KEY[8 * i + 4] ^ ls_box(t); \
+ E_KEY[8 * i + 12] = t; \
+ t ^= E_KEY[8 * i + 5]; E_KEY[8 * i + 13] = t; \
+ t ^= E_KEY[8 * i + 6]; E_KEY[8 * i + 14] = t; \
+ t ^= E_KEY[8 * i + 7]; E_KEY[8 * i + 15] = t; \
+}
+
+static int aes_set_key(void *ctx_arg, const u8 *in_key, unsigned int key_len,
+ u32 *flags)
+{
+ struct aes_ctx *ctx = ctx_arg;
+ u32 i, j, t, u, v, w;
+
+ if (key_len != 16 && key_len != 24 && key_len != 32) {
+ *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
+ return -EINVAL;
+ }
+
+ ctx->key_length = key_len;
+
+ D_KEY[key_len + 24] = E_KEY[0] = u32_in(in_key);
+ D_KEY[key_len + 25] = E_KEY[1] = u32_in(in_key + 4);
+ D_KEY[key_len + 26] = E_KEY[2] = u32_in(in_key + 8);
+ D_KEY[key_len + 27] = E_KEY[3] = u32_in(in_key + 12);
+
+ switch (key_len) {
+ case 16:
+ t = E_KEY[3];
+ for (i = 0; i < 10; ++i)
+ loop4(i);
+ break;
+
+ case 24:
+ E_KEY[4] = u32_in(in_key + 16);
+ t = E_KEY[5] = u32_in(in_key + 20);
+ for (i = 0; i < 8; ++i)
+ loop6 (i);
+ break;
+
+ case 32:
+ E_KEY[4] = u32_in(in_key + 16);
+ E_KEY[5] = u32_in(in_key + 20);
+ E_KEY[6] = u32_in(in_key + 24);
+ t = E_KEY[7] = u32_in(in_key + 28);
+ for (i = 0; i < 7; ++i)
+ loop8(i);
+ break;
+ }
+
+ D_KEY[0] = E_KEY[key_len + 24];
+ D_KEY[1] = E_KEY[key_len + 25];
+ D_KEY[2] = E_KEY[key_len + 26];
+ D_KEY[3] = E_KEY[key_len + 27];
+
+ for (i = 4; i < key_len + 24; ++i) {
+ j = key_len + 24 - (i & ~3) + (i & 3);
+ imix_col(D_KEY[j], E_KEY[i]);
+ }
+
+ return 0;
+}
+
+extern void aes_encrypt(void *ctx_arg, u8 *out, const u8 *in);
+extern void aes_decrypt(void *ctx_arg, u8 *out, const u8 *in);
+
+static struct crypto_alg aes_alg = {
+ .cra_name = "aes",
+ .cra_flags = CRYPTO_ALG_TYPE_CIPHER,
+ .cra_blocksize = AES_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct aes_ctx),
+ .cra_module = THIS_MODULE,
+ .cra_list = LIST_HEAD_INIT(aes_alg.cra_list),
+ .cra_u = {
+ .cipher = {
+ .cia_min_keysize = AES_MIN_KEY_SIZE,
+ .cia_max_keysize = AES_MAX_KEY_SIZE,
+ .cia_setkey = aes_set_key,
+ .cia_encrypt = aes_encrypt,
+ .cia_decrypt = aes_decrypt
+ }
+ }
+};
+
+static int __init aes_init(void)
+{
+ gen_tabs();
+ return crypto_register_alg(&aes_alg);
+}
+
+static void __exit aes_fini(void)
+{
+ crypto_unregister_alg(&aes_alg);
+}
+
+module_init(aes_init);
+module_exit(aes_fini);
+
+MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm");
+MODULE_LICENSE("GPL");
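f_mult() above multiplies in GF(2^8) through the log/antilog tables that gen_tabs() builds; the tables are a precomputed shortcut for the bitwise shift-and-reduce product. A self-contained sketch of that underlying product, illustrative rather than the kernel's code:

    #include <stdint.h>

    /* Multiply two field elements in GF(2^8) with modular polynomial
     * 0x11b, bit by bit. The kernel's pow_tab/log_tab pair computes the
     * same result as pow[log a + log b]. */
    static uint8_t gf_mult(uint8_t a, uint8_t b)
    {
        uint8_t p = 0;
        while (b) {
            if (b & 1)
                p ^= a;
            /* multiply a by x, reducing by 0x1b when the high bit falls out */
            a = (a << 1) ^ ((a & 0x80) ? 0x1b : 0);
            b >>= 1;
        }
        return p;
    }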
diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S
index f3ca0db85b5..cc935427d53 100644
--- a/arch/x86_64/ia32/ia32entry.S
+++ b/arch/x86_64/ia32/ia32entry.S
@@ -589,7 +589,7 @@ ia32_sys_call_table:
.quad compat_sys_mq_timedreceive /* 280 */
.quad compat_sys_mq_notify
.quad compat_sys_mq_getsetattr
- .quad quiet_ni_syscall /* reserved for kexec */
+ .quad compat_sys_kexec_load /* reserved for kexec */
.quad compat_sys_waitid
.quad quiet_ni_syscall /* sys_altroot */
.quad sys_add_key
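For context, the slot being wired up here is the syscall /sbin/kexec uses. A minimal, hypothetical userspace invocation looks roughly like this; the struct layout is an assumption based on this era's linux/kexec.h, and there is no glibc wrapper, so the call goes through syscall(2):

    #include <unistd.h>
    #include <sys/syscall.h>

    /* Assumed layout of struct kexec_segment; illustrative only. */
    struct kexec_segment {
        void *buf;             /* segment contents in this process */
        unsigned long bufsz;
        void *mem;             /* physical destination address */
        unsigned long memsz;
    };

    static long do_kexec_load(unsigned long entry, unsigned long nr_segments,
                              struct kexec_segment *segments,
                              unsigned long flags)
    {
        return syscall(__NR_kexec_load, entry, nr_segments, segments, flags);
    }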
diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile
index 5ca4a4598fd..48f9e2c19cd 100644
--- a/arch/x86_64/kernel/Makefile
+++ b/arch/x86_64/kernel/Makefile
@@ -20,6 +20,7 @@ obj-$(CONFIG_SMP) += smp.o smpboot.o trampoline.o
obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o
obj-$(CONFIG_X86_IO_APIC) += io_apic.o mpparse.o \
genapic.o genapic_cluster.o genapic_flat.o
+obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o crash.o
obj-$(CONFIG_PM) += suspend.o
obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend_asm.o
obj-$(CONFIG_CPU_FREQ) += cpufreq/
diff --git a/arch/x86_64/kernel/acpi/wakeup.S b/arch/x86_64/kernel/acpi/wakeup.S
index a4c630034cd..185faa911db 100644
--- a/arch/x86_64/kernel/acpi/wakeup.S
+++ b/arch/x86_64/kernel/acpi/wakeup.S
@@ -67,7 +67,7 @@ wakeup_code:
shll $4, %eax
addl $(gdta - wakeup_code), %eax
movl %eax, gdt_48a +2 - wakeup_code
- lgdt %ds:gdt_48a - wakeup_code # load gdt with whatever is
+ lgdtl %ds:gdt_48a - wakeup_code # load gdt with whatever is
# appropriate
movl $1, %eax # protected mode (PE) bit
diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c
index f8e6cc4fecd..375d369570c 100644
--- a/arch/x86_64/kernel/apic.c
+++ b/arch/x86_64/kernel/apic.c
@@ -133,7 +133,7 @@ void __init connect_bsp_APIC(void)
}
}
-void disconnect_bsp_APIC(void)
+void disconnect_bsp_APIC(int virt_wire_setup)
{
if (pic_mode) {
/*
@@ -146,6 +146,42 @@ void disconnect_bsp_APIC(void)
outb(0x70, 0x22);
outb(0x00, 0x23);
}
+ else {
+ /* Go back to Virtual Wire compatibility mode */
+ unsigned long value;
+
+ /* For the spurious interrupt use vector F, and enable it */
+ value = apic_read(APIC_SPIV);
+ value &= ~APIC_VECTOR_MASK;
+ value |= APIC_SPIV_APIC_ENABLED;
+ value |= 0xf;
+ apic_write_around(APIC_SPIV, value);
+
+ if (!virt_wire_setup) {
+ /* For LVT0 make it edge triggered, active high, external and enabled */
+ value = apic_read(APIC_LVT0);
+ value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
+ APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
+ APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED );
+ value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
+ value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
+ apic_write_around(APIC_LVT0, value);
+ }
+ else {
+ /* Disable LVT0 */
+ apic_write_around(APIC_LVT0, APIC_LVT_MASKED);
+ }
+
+ /* For LVT1 make it edge triggered, active high, nmi and enabled */
+ value = apic_read(APIC_LVT1);
+ value &= ~(
+ APIC_MODE_MASK | APIC_SEND_PENDING |
+ APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
+ APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
+ value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
+ value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
+ apic_write_around(APIC_LVT1, value);
+ }
}
void disable_local_APIC(void)
@@ -285,7 +321,7 @@ void __init init_bsp_APIC(void)
apic_write_around(APIC_LVT1, value);
}
-void __init setup_local_APIC (void)
+void __cpuinit setup_local_APIC (void)
{
unsigned int value, ver, maxlvt;
@@ -534,7 +570,7 @@ static struct sys_device device_lapic = {
.cls = &lapic_sysclass,
};
-static void __init apic_pm_activate(void)
+static void __cpuinit apic_pm_activate(void)
{
apic_pm_state.active = 1;
}
@@ -774,14 +810,14 @@ void __init setup_boot_APIC_clock (void)
local_irq_enable();
}
-void __init setup_secondary_APIC_clock(void)
+void __cpuinit setup_secondary_APIC_clock(void)
{
local_irq_disable(); /* FIXME: Do we need this? --RR */
setup_APIC_timer(calibration_result);
local_irq_enable();
}
-void __init disable_APIC_timer(void)
+void __cpuinit disable_APIC_timer(void)
{
if (using_apic_timer) {
unsigned long v;
diff --git a/arch/x86_64/kernel/crash.c b/arch/x86_64/kernel/crash.c
new file mode 100644
index 00000000000..d7fa4248501
--- /dev/null
+++ b/arch/x86_64/kernel/crash.c
@@ -0,0 +1,35 @@
+/*
+ * Architecture specific (x86_64) functions for kexec based crash dumps.
+ *
+ * Created by: Hariprasad Nellitheertha (hari@in.ibm.com)
+ *
+ * Copyright (C) IBM Corporation, 2004. All rights reserved.
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/smp.h>
+#include <linux/irq.h>
+#include <linux/reboot.h>
+#include <linux/kexec.h>
+
+#include <asm/processor.h>
+#include <asm/hardirq.h>
+#include <asm/nmi.h>
+#include <asm/hw_irq.h>
+
+note_buf_t crash_notes[NR_CPUS];
+
+void machine_crash_shutdown(struct pt_regs *regs)
+{
+ /* This function is only called after the system
+ * has panicked or is otherwise in a critical state.
+ * The minimum amount of code to allow a kexec'd kernel
+ * to run successfully needs to happen here.
+ *
+ * In practice this means shooting down the other cpus in
+ * an SMP system.
+ */
+}
diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c
index 7c154dfff64..6ded3a50dfe 100644
--- a/arch/x86_64/kernel/e820.c
+++ b/arch/x86_64/kernel/e820.c
@@ -16,6 +16,7 @@
#include <linux/bootmem.h>
#include <linux/ioport.h>
#include <linux/string.h>
+#include <linux/kexec.h>
#include <asm/page.h>
#include <asm/e820.h>
#include <asm/proto.h>
@@ -191,8 +192,6 @@ void __init e820_reserve_resources(void)
int i;
for (i = 0; i < e820.nr_map; i++) {
struct resource *res;
- if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL)
- continue;
res = alloc_bootmem_low(sizeof(struct resource));
switch (e820.map[i].type) {
case E820_RAM: res->name = "System RAM"; break;
@@ -212,6 +211,9 @@ void __init e820_reserve_resources(void)
*/
request_resource(res, &code_resource);
request_resource(res, &data_resource);
+#ifdef CONFIG_KEXEC
+ request_resource(res, &crashk_res);
+#endif
}
}
}
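crashk_res only describes real memory once a region has been reserved at boot; with the setup.c changes elsewhere in this series, that reservation is requested on the kernel command line in size@offset form. The values below are only an example:

    crashkernel=64M@16M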
diff --git a/arch/x86_64/kernel/genapic_flat.c b/arch/x86_64/kernel/genapic_flat.c
index b4cbbad0422..28284696508 100644
--- a/arch/x86_64/kernel/genapic_flat.c
+++ b/arch/x86_64/kernel/genapic_flat.c
@@ -7,6 +7,8 @@
* Hacked for x86-64 by James Cleverdon from i386 architecture code by
* Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
* James Cleverdon.
+ * Ashok Raj <ashok.raj@intel.com>
+ * Removed IPI broadcast shortcut to support CPU hotplug
*/
#include <linux/config.h>
#include <linux/threads.h>
@@ -18,6 +20,46 @@
#include <asm/smp.h>
#include <asm/ipi.h>
+/*
+ * The following permits choosing between the broadcast IPI shortcut and
+ * sending IPIs only to online cpus via the send_IPI_mask variant.
+ * The mask version is my preferred option, since it eliminates a lot of
+ * other extra code that would need to be written to clean up interrupts
+ * sent to a CPU while it is offline.
+ *
+ * Sending broadcasts introduces lots of trouble in CPU hotplug situations.
+ * These IPIs are delivered to cpus irrespective of their offline status
+ * and could pick up stale interrupt data when those CPUs are brought online.
+ *
+ * Not using broadcast is a cleaner approach IMO, but Andi Kleen disagrees with
+ * the idea of not using broadcast IPIs anymore. Hence the run-time check
+ * is introduced, at his request, so we can choose an alternate mechanism.
+ *
+ * Initial (admittedly rough) performance tests that collect cycle counts show
+ * no increase using the mask version vs. the broadcast version; in fact they
+ * seem identical in terms of cycle counts.
+ *
+ * If we did need to use broadcast, we would have to do the following:
+ *
+ * cli;
+ * hold call_lock;
+ * clear any pending IPI, just ack and clear all pending intr
+ * set cpu_online_map;
+ * release call_lock;
+ * sti;
+ *
+ * The complicated dummy irq processing shown above is not required if
+ * we don't send IPIs to the wrong CPUs in the first place.
+ *
+ * - Ashok Raj <ashok.raj@intel.com>
+ */
+#ifdef CONFIG_HOTPLUG_CPU
+#define DEFAULT_SEND_IPI (1)
+#else
+#define DEFAULT_SEND_IPI (0)
+#endif
+
+static int no_broadcast = DEFAULT_SEND_IPI;
static cpumask_t flat_target_cpus(void)
{
@@ -45,22 +87,6 @@ static void flat_init_apic_ldr(void)
apic_write_around(APIC_LDR, val);
}
-static void flat_send_IPI_allbutself(int vector)
-{
- /*
- * if there are no other CPUs in the system then
- * we get an APIC send error if we try to broadcast.
- * thus we have to avoid sending IPIs in this case.
- */
- if (num_online_cpus() > 1)
- __send_IPI_shortcut(APIC_DEST_ALLBUT, vector, APIC_DEST_LOGICAL);
-}
-
-static void flat_send_IPI_all(int vector)
-{
- __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL);
-}
-
static void flat_send_IPI_mask(cpumask_t cpumask, int vector)
{
unsigned long mask = cpus_addr(cpumask)[0];
@@ -93,6 +119,39 @@ static void flat_send_IPI_mask(cpumask_t cpumask, int vector)
local_irq_restore(flags);
}
+static inline void __local_flat_send_IPI_allbutself(int vector)
+{
+ if (no_broadcast) {
+ cpumask_t mask = cpu_online_map;
+ int this_cpu = get_cpu();
+
+ cpu_clear(this_cpu, mask);
+ flat_send_IPI_mask(mask, vector);
+ put_cpu();
+ }
+ else
+ __send_IPI_shortcut(APIC_DEST_ALLBUT, vector, APIC_DEST_LOGICAL);
+}
+
+static inline void __local_flat_send_IPI_all(int vector)
+{
+ if (no_broadcast)
+ flat_send_IPI_mask(cpu_online_map, vector);
+ else
+ __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL);
+}
+
+static void flat_send_IPI_allbutself(int vector)
+{
+ if (num_online_cpus() > 1)
+ __local_flat_send_IPI_allbutself(vector);
+}
+
+static void flat_send_IPI_all(int vector)
+{
+ __local_flat_send_IPI_all(vector);
+}
+
static int flat_apic_id_registered(void)
{
return physid_isset(GET_APIC_ID(apic_read(APIC_ID)), phys_cpu_present_map);
@@ -111,6 +170,16 @@ static unsigned int phys_pkg_id(int index_msb)
return ((ebx >> 24) & 0xFF) >> index_msb;
}
+static __init int no_ipi_broadcast(char *str)
+{
+ get_option(&str, &no_broadcast);
+ printk ("Using %s mode\n", no_broadcast ? "No IPI Broadcast" :
+ "IPI Broadcast");
+ return 1;
+}
+
+__setup("no_ipi_broadcast", no_ipi_broadcast);
+
struct genapic apic_flat = {
.name = "flat",
.int_delivery_mode = dest_LowestPrio,
@@ -125,3 +194,12 @@ struct genapic apic_flat = {
.cpu_mask_to_apicid = flat_cpu_mask_to_apicid,
.phys_pkg_id = phys_pkg_id,
};
+
+static int __init print_ipi_mode(void)
+{
+ printk ("Using IPI %s mode\n", no_broadcast ? "No-Shortcut" :
+ "Shortcut");
+ return 0;
+}
+
+late_initcall(print_ipi_mode);
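Both the __setup() hook and the late_initcall() above make the choice visible at runtime; for example, the mask-based path can be forced on a kernel that defaulted to shortcuts by booting with:

    no_ipi_broadcast=1

and the "Using IPI ... mode" line in the boot log confirms which path was chosen.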
diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S
index 9bd2e7a4b81..8d765aa77a2 100644
--- a/arch/x86_64/kernel/head.S
+++ b/arch/x86_64/kernel/head.S
@@ -248,23 +248,23 @@ ENTRY(_stext)
*/
.org 0x1000
ENTRY(init_level4_pgt)
- .quad 0x0000000000102007 /* -> level3_ident_pgt */
+ .quad 0x0000000000002007 + __PHYSICAL_START /* -> level3_ident_pgt */
.fill 255,8,0
- .quad 0x000000000010a007
+ .quad 0x000000000000a007 + __PHYSICAL_START
.fill 254,8,0
/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
- .quad 0x0000000000103007 /* -> level3_kernel_pgt */
+ .quad 0x0000000000003007 + __PHYSICAL_START /* -> level3_kernel_pgt */
.org 0x2000
ENTRY(level3_ident_pgt)
- .quad 0x0000000000104007
+ .quad 0x0000000000004007 + __PHYSICAL_START
.fill 511,8,0
.org 0x3000
ENTRY(level3_kernel_pgt)
.fill 510,8,0
/* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */
- .quad 0x0000000000105007 /* -> level2_kernel_pgt */
+ .quad 0x0000000000005007 + __PHYSICAL_START /* -> level2_kernel_pgt */
.fill 1,8,0
.org 0x4000
@@ -337,17 +337,17 @@ ENTRY(empty_bad_pmd_table)
.org 0xa000
ENTRY(level3_physmem_pgt)
- .quad 0x0000000000105007 /* -> level2_kernel_pgt (so that __va works even before pagetable_init) */
+ .quad 0x0000000000005007 + __PHYSICAL_START /* -> level2_kernel_pgt (so that __va works even before pagetable_init) */
.org 0xb000
#ifdef CONFIG_ACPI_SLEEP
ENTRY(wakeup_level4_pgt)
- .quad 0x0000000000102007 /* -> level3_ident_pgt */
+ .quad 0x0000000000002007 + __PHYSICAL_START /* -> level3_ident_pgt */
.fill 255,8,0
- .quad 0x000000000010a007
+ .quad 0x000000000000a007 + __PHYSICAL_START
.fill 254,8,0
/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
- .quad 0x0000000000103007 /* -> level3_kernel_pgt */
+ .quad 0x0000000000003007 + __PHYSICAL_START /* -> level3_kernel_pgt */
#endif
.data
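For reference, with the default __PHYSICAL_START of 0x100000 the first entry above assembles to 0x2007 + 0x100000 = 0x102007, the very literal the old code hard-wired: a pointer to the page at physical 0x102000 (level3_ident_pgt) with flag bits 0x007 (present, writable, user) in the low bits. Building with a different CONFIG_PHYSICAL_START simply shifts every early page-table pointer by the same constant.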
diff --git a/arch/x86_64/kernel/i387.c b/arch/x86_64/kernel/i387.c
index ba139cac57c..d9b22b633e3 100644
--- a/arch/x86_64/kernel/i387.c
+++ b/arch/x86_64/kernel/i387.c
@@ -42,7 +42,7 @@ void mxcsr_feature_mask_init(void)
* Called at bootup to set up the initial FPU state that is later cloned
* into all processes.
*/
-void __init fpu_init(void)
+void __cpuinit fpu_init(void)
{
unsigned long oldcr0 = read_cr0();
extern void __bad_fxsave_alignment(void);
diff --git a/arch/x86_64/kernel/i8259.c b/arch/x86_64/kernel/i8259.c
index 19eafa0aa95..a8916909512 100644
--- a/arch/x86_64/kernel/i8259.c
+++ b/arch/x86_64/kernel/i8259.c
@@ -414,10 +414,22 @@ static int i8259A_suspend(struct sys_device *dev, pm_message_t state)
return 0;
}
+static int i8259A_shutdown(struct sys_device *dev)
+{
+ /* Put the i8259A into a quiescent state that
+ * the kernel initialization code can get it
+ * out of.
+ */
+ outb(0xff, 0x21); /* mask all of 8259A-1 */
+ outb(0xff, 0xA1); /* mask all of 8259A-2 */
+ return 0;
+}
+
static struct sysdev_class i8259_sysdev_class = {
set_kset_name("i8259"),
.suspend = i8259A_suspend,
.resume = i8259A_resume,
+ .shutdown = i8259A_shutdown,
};
static struct sys_device device_i8259A = {
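The shutdown hook quiesces the PICs purely by writing OCW1, the interrupt-mask register reachable through the data ports. A standalone sketch of the same idea, extended to save and restore the masks; this is illustrative and not from the patch:

    #include <asm/io.h>

    static unsigned char saved_master, saved_slave;

    /* 0x21 and 0xA1 are the OCW1 (mask) data ports of the master and
     * slave 8259A respectively; a set bit masks that IRQ line. */
    static void pic_save_and_mask_all(void)
    {
        saved_master = inb(0x21);   /* remember current master mask */
        saved_slave  = inb(0xA1);   /* remember current slave mask */
        outb(0xff, 0x21);           /* mask IRQ0-7 */
        outb(0xff, 0xA1);           /* mask IRQ8-15 */
    }

    static void pic_restore_masks(void)
    {
        outb(saved_master, 0x21);
        outb(saved_slave, 0xA1);
    }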
diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index afd87e64d0a..d206d7e49cf 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -330,7 +330,7 @@ static int find_irq_entry(int apic, int pin, int type)
/*
* Find the pin to which IRQ[irq] (ISA) is connected
*/
-static int __init find_isa_irq_pin(int irq, int type)
+static int find_isa_irq_pin(int irq, int type)
{
int i;
@@ -1064,7 +1064,6 @@ void print_all_local_APICs (void)
void __apicdebuginit print_PIC(void)
{
- extern spinlock_t i8259A_lock;
unsigned int v;
unsigned long flags;
@@ -1132,12 +1131,44 @@ static void __init enable_IO_APIC(void)
*/
void disable_IO_APIC(void)
{
+ int pin;
/*
* Clear the IO-APIC before rebooting:
*/
clear_IO_APIC();
- disconnect_bsp_APIC();
+ /*
+ * If the i8259 is routed through an IOAPIC,
+ * put that IOAPIC in virtual wire mode
+ * so legacy interrupts can be delivered.
+ */
+ pin = find_isa_irq_pin(0, mp_ExtINT);
+ if (pin != -1) {
+ struct IO_APIC_route_entry entry;
+ unsigned long flags;
+
+ memset(&entry, 0, sizeof(entry));
+ entry.mask = 0; /* Enabled */
+ entry.trigger = 0; /* Edge */
+ entry.irr = 0;
+ entry.polarity = 0; /* High */
+ entry.delivery_status = 0;
+ entry.dest_mode = 0; /* Physical */
+ entry.delivery_mode = 7; /* ExtInt */
+ entry.vector = 0;
+ entry.dest.physical.physical_dest = 0;
+
+
+ /*
+ * Add it to the IO-APIC irq-routing table:
+ */
+ spin_lock_irqsave(&ioapic_lock, flags);
+ io_apic_write(0, 0x11+2*pin, *(((int *)&entry)+1));
+ io_apic_write(0, 0x10+2*pin, *(((int *)&entry)+0));
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+ }
+
+ disconnect_bsp_APIC(pin != -1);
}
/*
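The pair of io_apic_write() calls above follows the IOAPIC register layout: redirection-table entry N is 64 bits wide, exposed as two 32-bit indirect registers at indices 0x10+2N (the low half, carrying the vector, delivery mode and mask bits) and 0x11+2N (the high half, carrying the destination). Writing the high half first means the entry only becomes live once the low word lands. A sketch of the index arithmetic, with hypothetical helper names:

    /* Indirect-register indices for IOAPIC redirection-table entry n. */
    static inline int redir_low(int n)  { return 0x10 + 2 * n; }  /* vector/mask half */
    static inline int redir_high(int n) { return 0x11 + 2 * n; }  /* destination half */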
diff --git a/arch/x86_64/kernel/irq.c b/arch/x86_64/kernel/irq.c
index 62b112e4deb..cc3fb85f514 100644
--- a/arch/x86_64/kernel/irq.c
+++ b/arch/x86_64/kernel/irq.c
@@ -14,6 +14,7 @@
#include <linux/interrupt.h>
#include <linux/seq_file.h>
#include <linux/module.h>
+#include <linux/delay.h>
#include <asm/uaccess.h>
#include <asm/io_apic.h>
@@ -106,3 +107,31 @@ asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
return 1;
}
+#ifdef CONFIG_HOTPLUG_CPU
+void fixup_irqs(cpumask_t map)
+{
+ unsigned int irq;
+ static int warned;
+
+ for (irq = 0; irq < NR_IRQS; irq++) {
+ cpumask_t mask;
+ if (irq == 2)
+ continue;
+
+ cpus_and(mask, irq_affinity[irq], map);
+ if (any_online_cpu(mask) == NR_CPUS) {
+ printk("Breaking affinity for irq %i\n", irq);
+ mask = map;
+ }
+ if (irq_desc[irq].handler->set_affinity)
+ irq_desc[irq].handler->set_affinity(irq, mask);
+ else if (irq_desc[irq].action && !(warned++))
+ printk("Cannot set affinity for irq %i\n", irq);
+ }
+
+ /* That doesn't seem sufficient. Give it 1ms. */
+ local_irq_enable();
+ mdelay(1);
+ local_irq_disable();
+}
+#endif
diff --git a/arch/x86_64/kernel/kprobes.c b/arch/x86_64/kernel/kprobes.c
index 4e680f87a75..5c6dc705148 100644
--- a/arch/x86_64/kernel/kprobes.c
+++ b/arch/x86_64/kernel/kprobes.c
@@ -38,7 +38,7 @@
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/preempt.h>
-#include <linux/moduleloader.h>
+
#include <asm/cacheflush.h>
#include <asm/pgtable.h>
#include <asm/kdebug.h>
@@ -51,8 +51,6 @@ static struct kprobe *kprobe_prev;
static unsigned long kprobe_status_prev, kprobe_old_rflags_prev, kprobe_saved_rflags_prev;
static struct pt_regs jprobe_saved_regs;
static long *jprobe_saved_rsp;
-static kprobe_opcode_t *get_insn_slot(void);
-static void free_insn_slot(kprobe_opcode_t *slot);
void jprobe_return_end(void);
/* copy of the kernel stack at the probe fire time */
@@ -274,48 +272,23 @@ static void prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
regs->rip = (unsigned long)p->ainsn.insn;
}
-struct task_struct *arch_get_kprobe_task(void *ptr)
-{
- return ((struct thread_info *) (((unsigned long) ptr) &
- (~(THREAD_SIZE -1))))->task;
-}
-
void arch_prepare_kretprobe(struct kretprobe *rp, struct pt_regs *regs)
{
unsigned long *sara = (unsigned long *)regs->rsp;
- struct kretprobe_instance *ri;
- static void *orig_ret_addr;
+ struct kretprobe_instance *ri;
+
+ if ((ri = get_free_rp_inst(rp)) != NULL) {
+ ri->rp = rp;
+ ri->task = current;
+ ri->ret_addr = (kprobe_opcode_t *) *sara;
- /*
- * Save the return address when the return probe hits
- * the first time, and use it to populate the (krprobe
- * instance)->ret_addr for subsequent return probes at
- * the same addrress since stack address would have
- * the kretprobe_trampoline by then.
- */
- if (((void*) *sara) != kretprobe_trampoline)
- orig_ret_addr = (void*) *sara;
-
- if ((ri = get_free_rp_inst(rp)) != NULL) {
- ri->rp = rp;
- ri->stack_addr = sara;
- ri->ret_addr = orig_ret_addr;
- add_rp_inst(ri);
/* Replace the return addr with trampoline addr */
*sara = (unsigned long) &kretprobe_trampoline;
- } else {
- rp->nmissed++;
- }
-}
-void arch_kprobe_flush_task(struct task_struct *tk)
-{
- struct kretprobe_instance *ri;
- while ((ri = get_rp_inst_tsk(tk)) != NULL) {
- *((unsigned long *)(ri->stack_addr)) =
- (unsigned long) ri->ret_addr;
- recycle_rp_inst(ri);
- }
+ add_rp_inst(ri);
+ } else {
+ rp->nmissed++;
+ }
}
/*
@@ -428,36 +401,59 @@ no_kprobe:
*/
int trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
{
- struct task_struct *tsk;
- struct kretprobe_instance *ri;
- struct hlist_head *head;
- struct hlist_node *node;
- unsigned long *sara = (unsigned long *)regs->rsp - 1;
-
- tsk = arch_get_kprobe_task(sara);
- head = kretprobe_inst_table_head(tsk);
-
- hlist_for_each_entry(ri, node, head, hlist) {
- if (ri->stack_addr == sara && ri->rp) {
- if (ri->rp->handler)
- ri->rp->handler(ri, regs);
- }
- }
- return 0;
-}
+ struct kretprobe_instance *ri = NULL;
+ struct hlist_head *head;
+ struct hlist_node *node, *tmp;
+ unsigned long orig_ret_address = 0;
+ unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline;
-void trampoline_post_handler(struct kprobe *p, struct pt_regs *regs,
- unsigned long flags)
-{
- struct kretprobe_instance *ri;
- /* RA already popped */
- unsigned long *sara = ((unsigned long *)regs->rsp) - 1;
+ head = kretprobe_inst_table_head(current);
- while ((ri = get_rp_inst(sara))) {
- regs->rip = (unsigned long)ri->ret_addr;
+ /*
+ * It is possible to have multiple instances associated with a given
+ * task either because multiple functions in the call path
+ * have a return probe installed on them, and/or more than one
+ * return probe was registered for a target function.
+ *
+ * We can handle this because:
+ * - instances are always inserted at the head of the list
+ * - when multiple return probes are registered for the same
+ * function, the first instance's ret_addr will point to the
+ * real return address, and all the rest will point to
+ * kretprobe_trampoline
+ */
+ hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
+ if (ri->task != current)
+ /* another task is sharing our hash bucket */
+ continue;
+
+ if (ri->rp && ri->rp->handler)
+ ri->rp->handler(ri, regs);
+
+ orig_ret_address = (unsigned long)ri->ret_addr;
recycle_rp_inst(ri);
+
+ if (orig_ret_address != trampoline_address)
+ /*
+ * This is the real return address. Any other
+ * instances associated with this task are for
+ * other calls deeper on the call stack
+ */
+ break;
}
- regs->eflags &= ~TF_MASK;
+
+ BUG_ON(!orig_ret_address || (orig_ret_address == trampoline_address));
+ regs->rip = orig_ret_address;
+
+ unlock_kprobes();
+ preempt_enable_no_resched();
+
+ /*
+ * By returning a non-zero value, we are telling
+ * kprobe_handler() that we have handled unlocking
+ * and re-enabling preemption.
+ */
+ return 1;
}
/*
@@ -550,8 +546,7 @@ int post_kprobe_handler(struct pt_regs *regs)
current_kprobe->post_handler(current_kprobe, regs, 0);
}
- if (current_kprobe->post_handler != trampoline_post_handler)
- resume_execution(current_kprobe, regs);
+ resume_execution(current_kprobe, regs);
regs->eflags |= kprobe_saved_rflags;
/* Restore the original saved kprobes variables and continue. */
@@ -682,111 +677,12 @@ int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
return 0;
}
-/*
- * kprobe->ainsn.insn points to the copy of the instruction to be single-stepped.
- * By default on x86_64, pages we get from kmalloc or vmalloc are not
- * executable. Single-stepping an instruction on such a page yields an
- * oops. So instead of storing the instruction copies in their respective
- * kprobe objects, we allocate a page, map it executable, and store all the
- * instruction copies there. (We can allocate additional pages if somebody
- * inserts a huge number of probes.) Each page can hold up to INSNS_PER_PAGE
- * instruction slots, each of which is MAX_INSN_SIZE*sizeof(kprobe_opcode_t)
- * bytes.
- */
-#define INSNS_PER_PAGE (PAGE_SIZE/(MAX_INSN_SIZE*sizeof(kprobe_opcode_t)))
-struct kprobe_insn_page {
- struct hlist_node hlist;
- kprobe_opcode_t *insns; /* page of instruction slots */
- char slot_used[INSNS_PER_PAGE];
- int nused;
+static struct kprobe trampoline_p = {
+ .addr = (kprobe_opcode_t *) &kretprobe_trampoline,
+ .pre_handler = trampoline_probe_handler
};
-static struct hlist_head kprobe_insn_pages;
-
-/**
- * get_insn_slot() - Find a slot on an executable page for an instruction.
- * We allocate an executable page if there's no room on existing ones.
- */
-static kprobe_opcode_t *get_insn_slot(void)
-{
- struct kprobe_insn_page *kip;
- struct hlist_node *pos;
-
- hlist_for_each(pos, &kprobe_insn_pages) {
- kip = hlist_entry(pos, struct kprobe_insn_page, hlist);
- if (kip->nused < INSNS_PER_PAGE) {
- int i;
- for (i = 0; i < INSNS_PER_PAGE; i++) {
- if (!kip->slot_used[i]) {
- kip->slot_used[i] = 1;
- kip->nused++;
- return kip->insns + (i*MAX_INSN_SIZE);
- }
- }
- /* Surprise! No unused slots. Fix kip->nused. */
- kip->nused = INSNS_PER_PAGE;
- }
- }
-
- /* All out of space. Need to allocate a new page. Use slot 0.*/
- kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL);
- if (!kip) {
- return NULL;
- }
-
- /*
- * For the %rip-relative displacement fixups to be doable, we
- * need our instruction copy to be within +/- 2GB of any data it
- * might access via %rip. That is, within 2GB of where the
- * kernel image and loaded module images reside. So we allocate
- * a page in the module loading area.
- */
- kip->insns = module_alloc(PAGE_SIZE);
- if (!kip->insns) {
- kfree(kip);
- return NULL;
- }
- INIT_HLIST_NODE(&kip->hlist);
- hlist_add_head(&kip->hlist, &kprobe_insn_pages);
- memset(kip->slot_used, 0, INSNS_PER_PAGE);
- kip->slot_used[0] = 1;
- kip->nused = 1;
- return kip->insns;
-}
-
-/**
- * free_insn_slot() - Free instruction slot obtained from get_insn_slot().
- */
-static void free_insn_slot(kprobe_opcode_t *slot)
+int __init arch_init_kprobes(void)
{
- struct kprobe_insn_page *kip;
- struct hlist_node *pos;
-
- hlist_for_each(pos, &kprobe_insn_pages) {
- kip = hlist_entry(pos, struct kprobe_insn_page, hlist);
- if (kip->insns <= slot
- && slot < kip->insns+(INSNS_PER_PAGE*MAX_INSN_SIZE)) {
- int i = (slot - kip->insns) / MAX_INSN_SIZE;
- kip->slot_used[i] = 0;
- kip->nused--;
- if (kip->nused == 0) {
- /*
- * Page is no longer in use. Free it unless
- * it's the last one. We keep the last one
- * so as not to have to set it up again the
- * next time somebody inserts a probe.
- */
- hlist_del(&kip->hlist);
- if (hlist_empty(&kprobe_insn_pages)) {
- INIT_HLIST_NODE(&kip->hlist);
- hlist_add_head(&kip->hlist,
- &kprobe_insn_pages);
- } else {
- module_free(NULL, kip->insns);
- kfree(kip);
- }
- }
- return;
- }
- }
+ return register_kprobe(&trampoline_p);
}
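The trampoline rework above is what lets stacked return probes unwind correctly. For orientation, a minimal user of the kretprobe API of this period looks roughly like the following sketch; target_function is a hypothetical placeholder for any kernel text address the module can resolve:

    #include <linux/module.h>
    #include <linux/kprobes.h>

    extern void target_function(void);  /* hypothetical probe target */

    static int ret_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
    {
        /* on x86-64 the return value comes back in %rax */
        printk("target_function returned %ld\n", (long)regs->rax);
        return 0;
    }

    static struct kretprobe my_rp = {
        .handler   = ret_handler,
        .maxactive = 20,    /* instances kept for concurrent calls */
    };

    static int __init rp_init(void)
    {
        my_rp.kp.addr = (kprobe_opcode_t *)target_function;
        return register_kretprobe(&my_rp);
    }

    static void __exit rp_exit(void)
    {
        unregister_kretprobe(&my_rp);
    }

    module_init(rp_init);
    module_exit(rp_exit);
    MODULE_LICENSE("GPL");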
diff --git a/arch/x86_64/kernel/machine_kexec.c b/arch/x86_64/kernel/machine_kexec.c
new file mode 100644
index 00000000000..60d1eff4156
--- /dev/null
+++ b/arch/x86_64/kernel/machine_kexec.c
@@ -0,0 +1,250 @@
+/*
+ * machine_kexec.c - handle transition of Linux booting another kernel
+ * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ */
+
+#include <linux/mm.h>
+#include <linux/kexec.h>
+#include <linux/delay.h>
+#include <linux/string.h>
+#include <linux/reboot.h>
+#include <asm/pda.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
+#include <asm/mmu_context.h>
+#include <asm/io.h>
+#include <asm/apic.h>
+#include <asm/cpufeature.h>
+#include <asm/hw_irq.h>
+
+#define LEVEL0_SIZE (1UL << 12UL)
+#define LEVEL1_SIZE (1UL << 21UL)
+#define LEVEL2_SIZE (1UL << 30UL)
+#define LEVEL3_SIZE (1UL << 39UL)
+#define LEVEL4_SIZE (1UL << 48UL)
+
+#define L0_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+#define L1_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE)
+#define L2_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+#define L3_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+
+static void init_level2_page(u64 *level2p, unsigned long addr)
+{
+ unsigned long end_addr;
+
+ addr &= PAGE_MASK;
+ end_addr = addr + LEVEL2_SIZE;
+ while (addr < end_addr) {
+ *(level2p++) = addr | L1_ATTR;
+ addr += LEVEL1_SIZE;
+ }
+}
+
+static int init_level3_page(struct kimage *image, u64 *level3p,
+ unsigned long addr, unsigned long last_addr)
+{
+ unsigned long end_addr;
+ int result;
+
+ result = 0;
+ addr &= PAGE_MASK;
+ end_addr = addr + LEVEL3_SIZE;
+ while ((addr < last_addr) && (addr < end_addr)) {
+ struct page *page;
+ u64 *level2p;
+
+ page = kimage_alloc_control_pages(image, 0);
+ if (!page) {
+ result = -ENOMEM;
+ goto out;
+ }
+ level2p = (u64 *)page_address(page);
+ init_level2_page(level2p, addr);
+ *(level3p++) = __pa(level2p) | L2_ATTR;
+ addr += LEVEL2_SIZE;
+ }
+ /* clear the unused entries */
+ while (addr < end_addr) {
+ *(level3p++) = 0;
+ addr += LEVEL2_SIZE;
+ }
+out:
+ return result;
+}
+
+
+static int init_level4_page(struct kimage *image, u64 *level4p,
+ unsigned long addr, unsigned long last_addr)
+{
+ unsigned long end_addr;
+ int result;
+
+ result = 0;
+ addr &= PAGE_MASK;
+ end_addr = addr + LEVEL4_SIZE;
+ while ((addr < last_addr) && (addr < end_addr)) {
+ struct page *page;
+ u64 *level3p;
+
+ page = kimage_alloc_control_pages(image, 0);
+ if (!page) {
+ result = -ENOMEM;
+ goto out;
+ }
+ level3p = (u64 *)page_address(page);
+ result = init_level3_page(image, level3p, addr, last_addr);
+ if (result) {
+ goto out;
+ }
+ *(level4p++) = __pa(level3p) | L3_ATTR;
+ addr += LEVEL3_SIZE;
+ }
+ /* clear the unused entries */
+ while (addr < end_addr) {
+ *(level4p++) = 0;
+ addr += LEVEL3_SIZE;
+ }
+out:
+ return result;
+}
+
+
+static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
+{
+ u64 *level4p;
+ level4p = (u64 *)__va(start_pgtable);
+ return init_level4_page(image, level4p, 0, end_pfn << PAGE_SHIFT);
+}
+
+static void set_idt(void *newidt, u16 limit)
+{
+ unsigned char curidt[10];
+
+ /* x86-64 supports unaligned loads & stores */
+ (*(u16 *)(curidt)) = limit;
+ (*(u64 *)(curidt +2)) = (unsigned long)(newidt);
+
+ __asm__ __volatile__ (
+ "lidt %0\n"
+ : "=m" (curidt)
+ );
+}
+
+
+static void set_gdt(void *newgdt, u16 limit)
+{
+ unsigned char curgdt[10];
+
+ /* x86-64 supports unaligned loads & stores */
+ (*(u16 *)(curgdt)) = limit;
+ (*(u64 *)(curgdt +2)) = (unsigned long)(newgdt);
+
+ __asm__ __volatile__ (
+ "lgdt %0\n"
+ : "=m" (curgdt)
+ );
+}
+
+#define __STR(X) #X
+#define STR(X) __STR(X)
+
+static void load_segments(void)
+{
+ __asm__ __volatile__ (
+ "\tmovl $"STR(__KERNEL_DS)",%eax\n"
+ "\tmovl %eax,%ds\n"
+ "\tmovl %eax,%es\n"
+ "\tmovl %eax,%ss\n"
+ "\tmovl %eax,%fs\n"
+ "\tmovl %eax,%gs\n"
+ );
+#undef STR
+#undef __STR
+}
+
+typedef NORET_TYPE void (*relocate_new_kernel_t)(unsigned long indirection_page,
+ unsigned long control_code_buffer,
+ unsigned long start_address,
+ unsigned long pgtable) ATTRIB_NORET;
+
+extern const unsigned char relocate_new_kernel[];
+extern const unsigned long relocate_new_kernel_size;
+
+int machine_kexec_prepare(struct kimage *image)
+{
+ unsigned long start_pgtable, control_code_buffer;
+ int result;
+
+ /* Calculate the offsets */
+ start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
+ control_code_buffer = start_pgtable + 4096UL;
+
+ /* Setup the identity mapped 64bit page table */
+ result = init_pgtable(image, start_pgtable);
+ if (result)
+ return result;
+
+ /* Place the code in the reboot code buffer */
+ memcpy(__va(control_code_buffer), relocate_new_kernel,
+ relocate_new_kernel_size);
+
+ return 0;
+}
+
+void machine_kexec_cleanup(struct kimage *image)
+{
+ return;
+}
+
+/*
+ * Do not allocate memory (or fail in any way) in machine_kexec().
+ * We are past the point of no return, committed to rebooting now.
+ */
+NORET_TYPE void machine_kexec(struct kimage *image)
+{
+ unsigned long page_list;
+ unsigned long control_code_buffer;
+ unsigned long start_pgtable;
+ relocate_new_kernel_t rnk;
+
+ /* Interrupts aren't acceptable while we reboot */
+ local_irq_disable();
+
+ /* Calculate the offsets */
+ page_list = image->head;
+ start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
+ control_code_buffer = start_pgtable + 4096UL;
+
+ /* Set the low half of the page table to my identity mapped
+ * page table for kexec. Leave the high half pointing at the
+ * kernel pages. Don't bother to flush the global pages
+ * as that will happen when I fully switch to my identity mapped
+ * page table anyway.
+ */
+ memcpy(__va(read_cr3()), __va(start_pgtable), PAGE_SIZE/2);
+ __flush_tlb();
+
+
+ /* The segment registers are funny things, they are
+ * automatically loaded from a table, in memory wherever you
+ * set them to a specific selector, but this table is never
+ * accessed again unless you set the segment to a different selector.
+ *
+ * The more common model is a cache where the behind-the-scenes
+ * work is done, but is also dropped at arbitrary
+ * times.
+ *
+ * I take advantage of this here by force loading the
+ * segments, before I zap the gdt with an invalid value.
+ */
+ load_segments();
+ /* The gdt & idt are now invalid.
+ * If you want to load them you must set up your own idt & gdt.
+ */
+ set_gdt(phys_to_virt(0),0);
+ set_idt(phys_to_virt(0),0);
+ /* now call it */
+ rnk = (relocate_new_kernel_t) control_code_buffer;
+ (*rnk)(page_list, control_code_buffer, image->start, start_pgtable);
+}
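
As a reading aid for the hunk above: the single control area holds the
identity-mapped page-table root in its first 4 KB page and the copied
relocation stub right behind it. A minimal C sketch of those offsets
(the helper name is hypothetical, not part of this patch):

	/* Hypothetical helper restating the offsets used by
	 * machine_kexec_prepare() and machine_kexec() above. */
	static unsigned long control_area(struct kimage *image,
					  unsigned long *code_buffer)
	{
		unsigned long pgtable = page_to_pfn(image->control_code_page)
							<< PAGE_SHIFT;

		*code_buffer = pgtable + 4096UL; /* relocate_new_kernel copy */
		return pgtable;                  /* identity-mapped PML4 root */
	}
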
diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c
index 3a89d735a4f..21e70625a49 100644
--- a/arch/x86_64/kernel/mce.c
+++ b/arch/x86_64/kernel/mce.c
@@ -327,7 +327,7 @@ static void mce_init(void *dummy)
}
/* Add per CPU specific workarounds here */
-static void __init mce_cpu_quirks(struct cpuinfo_x86 *c)
+static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
{
/* This should be disabled by the BIOS, but isn't always */
if (c->x86_vendor == X86_VENDOR_AMD && c->x86 == 15) {
@@ -337,7 +337,7 @@ static void __init mce_cpu_quirks(struct cpuinfo_x86 *c)
}
}
-static void __init mce_cpu_features(struct cpuinfo_x86 *c)
+static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c)
{
switch (c->x86_vendor) {
case X86_VENDOR_INTEL:
@@ -352,7 +352,7 @@ static void __init mce_cpu_features(struct cpuinfo_x86 *c)
* Called for each booted CPU to set up machine checks.
* Must be called with preempt off.
*/
-void __init mcheck_init(struct cpuinfo_x86 *c)
+void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
{
static cpumask_t mce_cpus __initdata = CPU_MASK_NONE;
@@ -411,7 +411,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff
memset(mcelog.entry, 0, next * sizeof(struct mce));
mcelog.next = 0;
- synchronize_kernel();
+ synchronize_sched();
/* Collect entries that were still getting written before the synchronize. */
@@ -542,7 +542,7 @@ ACCESSOR(bank4ctl,bank[4],mce_restart())
ACCESSOR(tolerant,tolerant,)
ACCESSOR(check_interval,check_interval,mce_restart())
-static __init int mce_init_device(void)
+static __cpuinit int mce_init_device(void)
{
int err;
if (!mce_available(&boot_cpu_data))
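
The __init to __cpuinit conversions in this hunk matter because __init
code is discarded after boot, while a CPU brought online later must
still be able to run these setup paths. A sketch of the convention
(assuming the <linux/init.h> definition of this era, not a verbatim
copy):

	#ifdef CONFIG_HOTPLUG_CPU
	#define __cpuinit		/* keep: hot-added CPUs run this */
	#else
	#define __cpuinit __init	/* discard after boot, as before */
	#endif
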
diff --git a/arch/x86_64/kernel/mce_intel.c b/arch/x86_64/kernel/mce_intel.c
index 4db9a640069..0be0a795981 100644
--- a/arch/x86_64/kernel/mce_intel.c
+++ b/arch/x86_64/kernel/mce_intel.c
@@ -42,7 +42,7 @@ done:
irq_exit();
}
-static void __init intel_init_thermal(struct cpuinfo_x86 *c)
+static void __cpuinit intel_init_thermal(struct cpuinfo_x86 *c)
{
u32 l, h;
int tm2 = 0;
@@ -93,7 +93,7 @@ static void __init intel_init_thermal(struct cpuinfo_x86 *c)
return;
}
-void __init mce_intel_feature_init(struct cpuinfo_x86 *c)
+void __cpuinit mce_intel_feature_init(struct cpuinfo_x86 *c)
{
intel_init_thermal(c);
}
diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c
index 31c0f2e6ac9..4e44d6e6b7e 100644
--- a/arch/x86_64/kernel/nmi.c
+++ b/arch/x86_64/kernel/nmi.c
@@ -98,7 +98,7 @@ static unsigned int nmi_p4_cccr_val;
(P4_CCCR_OVF_PMI0|P4_CCCR_THRESHOLD(15)|P4_CCCR_COMPLEMENT| \
P4_CCCR_COMPARE|P4_CCCR_REQUIRED|P4_CCCR_ESCR_SELECT(4)|P4_CCCR_ENABLE)
-static __init inline int nmi_known_cpu(void)
+static __cpuinit inline int nmi_known_cpu(void)
{
switch (boot_cpu_data.x86_vendor) {
case X86_VENDOR_AMD:
@@ -110,7 +110,7 @@ static __init inline int nmi_known_cpu(void)
}
/* Run after command line and cpu_init init, but before all other checks */
-void __init nmi_watchdog_default(void)
+void __cpuinit nmi_watchdog_default(void)
{
if (nmi_watchdog != NMI_DEFAULT)
return;
diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c
index e59d1f9d616..7577f9d7a75 100644
--- a/arch/x86_64/kernel/process.c
+++ b/arch/x86_64/kernel/process.c
@@ -8,7 +8,8 @@
*
* X86-64 port
* Andi Kleen.
- *
+ *
+ * CPU hotplug support - ashok.raj@intel.com
* $Id: process.c,v 1.38 2002/01/15 10:08:03 ak Exp $
*/
@@ -18,6 +19,7 @@
#include <stdarg.h>
+#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
@@ -154,6 +156,29 @@ void cpu_idle_wait(void)
}
EXPORT_SYMBOL_GPL(cpu_idle_wait);
+#ifdef CONFIG_HOTPLUG_CPU
+DECLARE_PER_CPU(int, cpu_state);
+
+#include <asm/nmi.h>
+/* We don't actually take CPU down, just spin without interrupts. */
+static inline void play_dead(void)
+{
+ idle_task_exit();
+ wbinvd();
+ mb();
+ /* Ack it */
+ __get_cpu_var(cpu_state) = CPU_DEAD;
+
+ while (1)
+ safe_halt();
+}
+#else
+static inline void play_dead(void)
+{
+ BUG();
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
/*
* The idle thread. There's no useful work to be
* done, so just try to conserve power and have a
@@ -174,6 +199,8 @@ void cpu_idle (void)
idle = pm_idle;
if (!idle)
idle = default_idle;
+ if (cpu_is_offline(smp_processor_id()))
+ play_dead();
idle();
}
@@ -204,7 +231,7 @@ static void mwait_idle(void)
}
}
-void __init select_idle_routine(const struct cpuinfo_x86 *c)
+void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
{
static int printed;
if (cpu_has(c, X86_FEATURE_MWAIT)) {
@@ -455,6 +482,33 @@ out:
}
/*
+ * This function selects if the context switch from prev to next
+ * has to tweak the TSC disable bit in the cr4.
+ */
+static inline void disable_tsc(struct task_struct *prev_p,
+ struct task_struct *next_p)
+{
+ struct thread_info *prev, *next;
+
+ /*
+ * gcc should eliminate the ->thread_info dereference if
+ * has_secure_computing returns 0 at compile time (SECCOMP=n).
+ */
+ prev = prev_p->thread_info;
+ next = next_p->thread_info;
+
+ if (has_secure_computing(prev) || has_secure_computing(next)) {
+ /* slow path here */
+ if (has_secure_computing(prev) &&
+ !has_secure_computing(next)) {
+ write_cr4(read_cr4() & ~X86_CR4_TSD);
+ } else if (!has_secure_computing(prev) &&
+ has_secure_computing(next))
+ write_cr4(read_cr4() | X86_CR4_TSD);
+ }
+}
+
+/*
* This special macro can be used to load a debugging register
*/
#define loaddebug(thread,r) set_debug(thread->debugreg ## r, r)
@@ -572,6 +626,8 @@ struct task_struct *__switch_to(struct task_struct *prev_p, struct task_struct *
}
}
+ disable_tsc(prev_p, next_p);
+
return prev_p;
}
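
Since disable_tsc() above only toggles CR4.TSD, a CPL-3 RDTSC raises
#GP (delivered to the task as SIGSEGV) while a seccomp task is running.
A hedged user-space probe of that behaviour; the seccomp-enable step is
left as a placeholder since its interface is not part of this patch:

	#include <signal.h>
	#include <stdio.h>
	#include <stdlib.h>

	static void on_segv(int sig)
	{
		(void)sig;
		/* write()/exit() remain allowed under seccomp mode 1 */
		printf("rdtsc faulted: TSC disabled for this task\n");
		exit(0);
	}

	int main(void)
	{
		unsigned int lo, hi;

		signal(SIGSEGV, on_segv);
		/* ... enable seccomp for this task here ... */
		asm volatile("rdtsc" : "=a" (lo), "=d" (hi));
		printf("tsc = %#llx\n", ((unsigned long long)hi << 32) | lo);
		return 0;
	}
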
diff --git a/arch/x86_64/kernel/reboot.c b/arch/x86_64/kernel/reboot.c
index be4b36f762c..57e71dbdfd6 100644
--- a/arch/x86_64/kernel/reboot.c
+++ b/arch/x86_64/kernel/reboot.c
@@ -66,41 +66,47 @@ static int __init reboot_setup(char *str)
__setup("reboot=", reboot_setup);
-#ifdef CONFIG_SMP
-static void smp_halt(void)
+static inline void kb_wait(void)
{
- int cpuid = safe_smp_processor_id();
- static int first_entry = 1;
+ int i;
- if (reboot_force)
- return;
+ for (i=0; i<0x10000; i++)
+ if ((inb_p(0x64) & 0x02) == 0)
+ break;
+}
- if (first_entry) {
- first_entry = 0;
- smp_call_function((void *)machine_restart, NULL, 1, 0);
- }
-
- smp_stop_cpu();
+void machine_shutdown(void)
+{
+ /* Stop the cpus and apics */
+#ifdef CONFIG_SMP
+ int reboot_cpu_id;
- /* AP calling this. Just halt */
- if (cpuid != boot_cpu_id) {
- for (;;)
- asm("hlt");
+ /* The boot cpu is always logical cpu 0 */
+ reboot_cpu_id = 0;
+
+ /* Make certain the cpu I'm about to reboot on is online */
+ if (!cpu_isset(reboot_cpu_id, cpu_online_map)) {
+ reboot_cpu_id = smp_processor_id();
}
- /* Wait for all other CPUs to have run smp_stop_cpu */
- while (!cpus_empty(cpu_online_map))
- rep_nop();
-}
+ /* Make certain I only run on the appropriate processor */
+ set_cpus_allowed(current, cpumask_of_cpu(reboot_cpu_id));
+
+ /* O.K., now that I'm on the appropriate processor,
+ * stop all of the others.
+ */
+ smp_send_stop();
#endif
-static inline void kb_wait(void)
-{
- int i;
+ local_irq_disable();
- for (i=0; i<0x10000; i++)
- if ((inb_p(0x64) & 0x02) == 0)
- break;
+#ifndef CONFIG_SMP
+ disable_local_APIC();
+#endif
+
+ disable_IO_APIC();
+
+ local_irq_enable();
}
void machine_restart(char * __unused)
@@ -109,9 +115,7 @@ void machine_restart(char * __unused)
printk("machine restart\n");
-#ifdef CONFIG_SMP
- smp_halt();
-#endif
+ machine_shutdown();
if (!reboot_force) {
local_irq_disable();
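
kb_wait() above polls the i8042 status port (0x64) until bit 1, the
input-buffer-full flag, clears, so that a following command byte is
accepted. Its classic use is the keyboard-controller reset pulse; a
hedged sketch (the actual machine_restart() path in this file does
more):

	static void kbc_pulse_reset(void)
	{
		kb_wait();		/* wait for input buffer to drain */
		outb(0xfe, 0x64);	/* command 0xFE: pulse CPU RESET# */
		for (;;)
			asm volatile("hlt");
	}
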
diff --git a/arch/x86_64/kernel/relocate_kernel.S b/arch/x86_64/kernel/relocate_kernel.S
new file mode 100644
index 00000000000..d24fa9b72a2
--- /dev/null
+++ b/arch/x86_64/kernel/relocate_kernel.S
@@ -0,0 +1,143 @@
+/*
+ * relocate_kernel.S - put the kernel image in place to boot
+ * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ */
+
+#include <linux/linkage.h>
+
+ /*
+ * Must be relocatable PIC code callable as a C function; once
+ * it starts it cannot use the previous process's stack.
+ */
+ .globl relocate_new_kernel
+ .code64
+relocate_new_kernel:
+ /* %rdi page_list
+ * %rsi reboot_code_buffer
+ * %rdx start address
+ * %rcx page_table
+ * %r8 arg5
+ * %r9 arg6
+ */
+
+ /* zero out flags, and disable interrupts */
+ pushq $0
+ popfq
+
+ /* set a new stack at the bottom of our page... */
+ lea 4096(%rsi), %rsp
+
+ /* store the parameters back on the stack */
+ pushq %rdx /* store the start address */
+
+ /* Set cr0 to a known state:
+ * 31 1 == Paging enabled
+ * 18 0 == Alignment check disabled
+ * 16 0 == Write protect disabled
+ * 3 0 == No task switch
+ * 2 0 == Don't do FP software emulation.
+ * 0 1 == Protected mode enabled
+ */
+ movq %cr0, %rax
+ andq $~((1<<18)|(1<<16)|(1<<3)|(1<<2)), %rax
+ orl $((1<<31)|(1<<0)), %eax
+ movq %rax, %cr0
+
+ /* Set cr4 to a known state:
+ * 10 0 == xmm exceptions disabled
+ * 9 0 == xmm registers instructions disabled
+ * 8 0 == performance monitoring counter disabled
+ * 7 0 == page global disabled
+ * 6 0 == machine check exceptions disabled
+ * 5 1 == physical address extension enabled
+ * 4 0 == page size extensions disabled
+ * 3 0 == Debug extensions disabled
+ * 2 0 == Time stamp disable (disabled)
+ * 1 0 == Protected mode virtual interrupts disabled
+ * 0 0 == VME disabled
+ */
+
+ movq $((1<<5)), %rax
+ movq %rax, %cr4
+
+ jmp 1f
+1:
+
+ /* Switch to the identity mapped page tables,
+ * and flush the TLB.
+ */
+ movq %rcx, %cr3
+
+ /* Do the copies */
+ movq %rdi, %rcx /* Put the page_list in %rcx */
+ xorq %rdi, %rdi
+ xorq %rsi, %rsi
+ jmp 1f
+
+0: /* top, read another word for the indirection page */
+
+ movq (%rbx), %rcx
+ addq $8, %rbx
+1:
+ testq $0x1, %rcx /* is it a destination page? */
+ jz 2f
+ movq %rcx, %rdi
+ andq $0xfffffffffffff000, %rdi
+ jmp 0b
+2:
+ testq $0x2, %rcx /* is it an indirection page? */
+ jz 2f
+ movq %rcx, %rbx
+ andq $0xfffffffffffff000, %rbx
+ jmp 0b
+2:
+ testq $0x4, %rcx /* is it the done indicator? */
+ jz 2f
+ jmp 3f
+2:
+ testq $0x8, %rcx /* is it the source indicator? */
+ jz 0b /* Ignore it otherwise */
+ movq %rcx, %rsi /* For every source page do a copy */
+ andq $0xfffffffffffff000, %rsi
+
+ movq $512, %rcx
+ rep ; movsq
+ jmp 0b
+3:
+
+ /* To be certain of avoiding problems with self-modifying code
+ * I need to execute a serializing instruction here.
+ * So I flush the TLB by reloading %cr3 here; it's handy
+ * and not processor dependent.
+ */
+ movq %cr3, %rax
+ movq %rax, %cr3
+
+ /* set all of the registers to known values */
+ /* leave %rsp alone */
+
+ xorq %rax, %rax
+ xorq %rbx, %rbx
+ xorq %rcx, %rcx
+ xorq %rdx, %rdx
+ xorq %rsi, %rsi
+ xorq %rdi, %rdi
+ xorq %rbp, %rbp
+ xorq %r8, %r8
+ xorq %r9, %r9
+ xorq %r10, %r10
+ xorq %r11, %r11
+ xorq %r12, %r12
+ xorq %r13, %r13
+ xorq %r14, %r14
+ xorq %r15, %r15
+
+ ret
+relocate_new_kernel_end:
+
+ .globl relocate_new_kernel_size
+relocate_new_kernel_size:
+ .quad relocate_new_kernel_end - relocate_new_kernel
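
The tag bits tested in the loop above (0x1 destination, 0x2
indirection, 0x4 done, 0x8 source, with the low 12 bits masked off)
form the kexec indirection-page encoding. The same walk, rendered as a
C sketch:

	#include <string.h>

	#define IND_DESTINATION	0x1UL
	#define IND_INDIRECTION	0x2UL
	#define IND_DONE	0x4UL
	#define IND_SOURCE	0x8UL
	#define ENTRY_MASK	0xfffffffffffff000UL

	/* entry starts out as page_list (image->head in machine_kexec()) */
	static void walk_indirection_pages(unsigned long entry)
	{
		unsigned long *ind = NULL;
		char *dst = NULL;

		for (;;) {
			if (entry & IND_DESTINATION)
				dst = (char *)(entry & ENTRY_MASK);
			else if (entry & IND_INDIRECTION)
				ind = (unsigned long *)(entry & ENTRY_MASK);
			else if (entry & IND_DONE)
				return;
			else if (entry & IND_SOURCE) {
				memcpy(dst, (void *)(entry & ENTRY_MASK), 4096);
				dst += 4096;	/* rep movsq advances %rdi too */
			}
			entry = *ind++;
		}
	}
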
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index 000015dd5a8..5fd03225058 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -41,6 +41,7 @@
#include <linux/kallsyms.h>
#include <linux/edd.h>
#include <linux/mmzone.h>
+#include <linux/kexec.h>
#include <asm/mtrr.h>
#include <asm/uaccess.h>
@@ -367,6 +368,27 @@ static __init void parse_cmdline_early (char ** cmdline_p)
if (!memcmp(from, "noexec=", 7))
nonx_setup(from + 7);
+#ifdef CONFIG_KEXEC
+ /* crashkernel=size@addr specifies the location to reserve for
+ * a crash kernel. By reserving this memory we guarantee
+ * that Linux never sets it up as a DMA target.
+ * Useful for holding code to do something appropriate
+ * after a kernel panic.
+ */
+ else if (!memcmp(from, "crashkernel=", 12)) {
+ unsigned long size, base;
+ size = memparse(from+12, &from);
+ if (*from == '@') {
+ base = memparse(from+1, &from);
+ /* FIXME: Do I want a sanity check
+ * to validate the memory range?
+ */
+ crashk_res.start = base;
+ crashk_res.end = base + size - 1;
+ }
+ }
+#endif
+
next_char:
c = *(from++);
if (!c)
@@ -625,6 +647,13 @@ void __init setup_arch(char **cmdline_p)
#endif
sparse_init();
+
+#ifdef CONFIG_KEXEC
+ if (crashk_res.start != crashk_res.end) {
+ reserve_bootmem(crashk_res.start,
+ crashk_res.end - crashk_res.start + 1);
+ }
+#endif
paging_init();
check_ioapic();
@@ -676,7 +705,7 @@ void __init setup_arch(char **cmdline_p)
#endif
}
-static int __init get_model_name(struct cpuinfo_x86 *c)
+static int __cpuinit get_model_name(struct cpuinfo_x86 *c)
{
unsigned int *v;
@@ -692,7 +721,7 @@ static int __init get_model_name(struct cpuinfo_x86 *c)
}
-static void __init display_cacheinfo(struct cpuinfo_x86 *c)
+static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
{
unsigned int n, dummy, eax, ebx, ecx, edx;
@@ -803,7 +832,7 @@ static int __init init_amd(struct cpuinfo_x86 *c)
return r;
}
-static void __init detect_ht(struct cpuinfo_x86 *c)
+static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_SMP
u32 eax, ebx, ecx, edx;
@@ -864,7 +893,7 @@ static void __init detect_ht(struct cpuinfo_x86 *c)
/*
* find out the number of processor cores on the die
*/
-static int __init intel_num_cpu_cores(struct cpuinfo_x86 *c)
+static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
{
unsigned int eax;
@@ -882,7 +911,7 @@ static int __init intel_num_cpu_cores(struct cpuinfo_x86 *c)
return 1;
}
-static void __init init_intel(struct cpuinfo_x86 *c)
+static void __cpuinit init_intel(struct cpuinfo_x86 *c)
{
/* Cache sizes */
unsigned n;
@@ -902,7 +931,7 @@ static void __init init_intel(struct cpuinfo_x86 *c)
c->x86_num_cores = intel_num_cpu_cores(c);
}
-void __init get_cpu_vendor(struct cpuinfo_x86 *c)
+void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
{
char *v = c->x86_vendor_id;
@@ -923,7 +952,7 @@ struct cpu_model_info {
/* Do some early cpuid on the boot CPU to get some parameter that are
needed before check_bugs. Everything advanced is in identify_cpu
below. */
-void __init early_identify_cpu(struct cpuinfo_x86 *c)
+void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
{
u32 tfms;
@@ -977,7 +1006,7 @@ void __init early_identify_cpu(struct cpuinfo_x86 *c)
/*
* This does the hard work of actually picking apart the CPU stuff...
*/
-void __init identify_cpu(struct cpuinfo_x86 *c)
+void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
{
int i;
u32 xlvl;
@@ -1047,6 +1076,10 @@ void __init identify_cpu(struct cpuinfo_x86 *c)
#ifdef CONFIG_X86_MCE
mcheck_init(c);
#endif
+ if (c == &boot_cpu_data)
+ mtrr_bp_init();
+ else
+ mtrr_ap_init();
#ifdef CONFIG_NUMA
if (c != &boot_cpu_data)
numa_add_cpu(c - cpu_data);
@@ -1054,7 +1087,7 @@ void __init identify_cpu(struct cpuinfo_x86 *c)
}
-void __init print_cpu_info(struct cpuinfo_x86 *c)
+void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
{
if (c->x86_model_id[0])
printk("%s", c->x86_model_id);
diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c
index 678b7ac33b8..f2b9c6bc999 100644
--- a/arch/x86_64/kernel/setup64.c
+++ b/arch/x86_64/kernel/setup64.c
@@ -29,7 +29,7 @@
char x86_boot_params[BOOT_PARAM_SIZE] __initdata = {0,};
-cpumask_t cpu_initialized __initdata = CPU_MASK_NONE;
+cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
struct x8664_pda cpu_pda[NR_CPUS] __cacheline_aligned;
@@ -171,7 +171,7 @@ void syscall_init(void)
wrmsrl(MSR_SYSCALL_MASK, EF_TF|EF_DF|EF_IE|0x3000);
}
-void __init check_efer(void)
+void __cpuinit check_efer(void)
{
unsigned long efer;
@@ -188,7 +188,7 @@ void __init check_efer(void)
* 'CPU state barrier', nothing should get across.
* A lot of state is already set up in PDA init.
*/
-void __init cpu_init (void)
+void __cpuinit cpu_init (void)
{
#ifdef CONFIG_SMP
int cpu = stack_smp_processor_id();
diff --git a/arch/x86_64/kernel/signal.c b/arch/x86_64/kernel/signal.c
index 98b7ba95d58..98590a989f3 100644
--- a/arch/x86_64/kernel/signal.c
+++ b/arch/x86_64/kernel/signal.c
@@ -425,7 +425,7 @@ int do_signal(struct pt_regs *regs, sigset_t *oldset)
if (!user_mode(regs))
return 1;
- if (try_to_freeze(0))
+ if (try_to_freeze())
goto no_signal;
if (!oldset)
diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c
index 1e379ed17b1..ccae392886a 100644
--- a/arch/x86_64/kernel/smp.c
+++ b/arch/x86_64/kernel/smp.c
@@ -283,6 +283,16 @@ struct call_data_struct {
static struct call_data_struct * call_data;
+void lock_ipi_call_lock(void)
+{
+ spin_lock_irq(&call_lock);
+}
+
+void unlock_ipi_call_lock(void)
+{
+ spin_unlock_irq(&call_lock);
+}
+
/*
* this function sends a 'generic call function' IPI to all other CPUs
* in the system.
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c
index f1ec0f34594..b969ee12872 100644
--- a/arch/x86_64/kernel/smpboot.c
+++ b/arch/x86_64/kernel/smpboot.c
@@ -34,6 +34,7 @@
* Andi Kleen : Converted to new state machine.
* Various cleanups.
* Probably mostly hotplug CPU ready now.
+ * Ashok Raj : CPU hotplug support
*/
@@ -58,11 +59,6 @@
#include <asm/proto.h>
#include <asm/nmi.h>
-/* Change for real CPU hotplug. Note other files need to be fixed
- first too. */
-#define __cpuinit __init
-#define __cpuinitdata __initdata
-
/* Number of siblings per CPU package */
int smp_num_siblings = 1;
/* Package ID of each logical CPU */
@@ -103,6 +99,37 @@ EXPORT_SYMBOL(cpu_core_map);
extern unsigned char trampoline_data[];
extern unsigned char trampoline_end[];
+/* State of each CPU */
+DEFINE_PER_CPU(int, cpu_state) = { 0 };
+
+/*
+ * Store all idle threads; these can be reused instead of creating
+ * a new thread. This also avoids the complicated thread-destroy
+ * functionality for idle threads.
+ */
+struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ;
+
+#define get_idle_for_cpu(x) (idle_thread_array[(x)])
+#define set_idle_for_cpu(x,p) (idle_thread_array[(x)] = (p))
+
+/*
+ * cpu_possible_map should be static: it cannot change as CPUs
+ * are onlined or offlined, because some modules allocate per-cpu
+ * data structures at init time and do not expect to redo this
+ * dynamically on CPU arrival/departure.
+ * cpu_present_map, on the other hand, can change dynamically.
+ * When CPU hotplug is not compiled in, we keep the current
+ * behaviour, which is cpu_possible == cpu_present.
+ * When CPU hotplug is supported, we must preallocate for all
+ * NR_CPUS, so cpu_possible_map covers the entire NR_CPUS range.
+ * - Ashok Raj
+ */
+#ifdef CONFIG_HOTPLUG_CPU
+#define fixup_cpu_possible_map(x) cpu_set((x), cpu_possible_map)
+#else
+#define fixup_cpu_possible_map(x)
+#endif
+
/*
* Currently trivial. Write the real->protected mode
* bootstrap into the page concerned. The caller
@@ -418,6 +445,33 @@ void __cpuinit smp_callin(void)
cpu_set(cpuid, cpu_callin_map);
}
+static inline void set_cpu_sibling_map(int cpu)
+{
+ int i;
+
+ if (smp_num_siblings > 1) {
+ for_each_cpu(i) {
+ if (cpu_core_id[cpu] == cpu_core_id[i]) {
+ cpu_set(i, cpu_sibling_map[cpu]);
+ cpu_set(cpu, cpu_sibling_map[i]);
+ }
+ }
+ } else {
+ cpu_set(cpu, cpu_sibling_map[cpu]);
+ }
+
+ if (current_cpu_data.x86_num_cores > 1) {
+ for_each_cpu(i) {
+ if (phys_proc_id[cpu] == phys_proc_id[i]) {
+ cpu_set(i, cpu_core_map[cpu]);
+ cpu_set(cpu, cpu_core_map[i]);
+ }
+ }
+ } else {
+ cpu_core_map[cpu] = cpu_sibling_map[cpu];
+ }
+}
+
/*
* Setup code on secondary processor (after coming out of the trampoline)
*/
@@ -448,9 +502,28 @@ void __cpuinit start_secondary(void)
enable_APIC_timer();
/*
+ * The sibling maps must be set before turning the online map on
+ * for this cpu
+ */
+ set_cpu_sibling_map(smp_processor_id());
+
+ /*
+ * We need to hold call_lock so that there is no inconsistency
+ * between the time smp_call_function() determines the number of
+ * IPI recipients and the time the IPI target cpus are chosen in
+ * genapic_flat.c. Holding the lock keeps this cpu out of any
+ * smp_call_function() that is currently in progress.
+ */
+ lock_ipi_call_lock();
+
+ /*
* Allow the master to continue.
*/
cpu_set(smp_processor_id(), cpu_online_map);
+ per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
+ unlock_ipi_call_lock();
+
mb();
/* Wait for TSC sync to not schedule things before.
@@ -628,33 +701,77 @@ static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int sta
return (send_status | accept_status);
}
+struct create_idle {
+ struct task_struct *idle;
+ struct completion done;
+ int cpu;
+};
+
+void do_fork_idle(void *_c_idle)
+{
+ struct create_idle *c_idle = _c_idle;
+
+ c_idle->idle = fork_idle(c_idle->cpu);
+ complete(&c_idle->done);
+}
+
/*
* Boot one CPU.
*/
static int __cpuinit do_boot_cpu(int cpu, int apicid)
{
- struct task_struct *idle;
unsigned long boot_error;
int timeout;
unsigned long start_rip;
+ struct create_idle c_idle = {
+ .cpu = cpu,
+ .done = COMPLETION_INITIALIZER(c_idle.done),
+ };
+ DECLARE_WORK(work, do_fork_idle, &c_idle);
+
+ c_idle.idle = get_idle_for_cpu(cpu);
+
+ if (c_idle.idle) {
+ c_idle.idle->thread.rsp = (unsigned long) (((struct pt_regs *)
+ (THREAD_SIZE + (unsigned long) c_idle.idle->thread_info)) - 1);
+ init_idle(c_idle.idle, cpu);
+ goto do_rest;
+ }
+
/*
- * We can't use kernel_thread since we must avoid to
- * reschedule the child.
+ * During the cold boot process the keventd thread is not up yet.
+ * On CPU hot-add we create idle threads on the fly, and they must
+ * not inherit attributes from the calling context; the clean way
+ * to create such kernel threads is therefore from keventd().
+ * The current_is_keventd() check exists because the ACPI notifier
+ * also queues work to keventd(): if the caller is already running
+ * in keventd() context, waiting on it would deadlock the keventd
+ * thread.
*/
- idle = fork_idle(cpu);
- if (IS_ERR(idle)) {
+ if (!keventd_up() || current_is_keventd())
+ work.func(work.data);
+ else {
+ schedule_work(&work);
+ wait_for_completion(&c_idle.done);
+ }
+
+ if (IS_ERR(c_idle.idle)) {
printk("failed fork for CPU %d\n", cpu);
- return PTR_ERR(idle);
+ return PTR_ERR(c_idle.idle);
}
- cpu_pda[cpu].pcurrent = idle;
+ set_idle_for_cpu(cpu, c_idle.idle);
+
+do_rest:
+
+ cpu_pda[cpu].pcurrent = c_idle.idle;
start_rip = setup_trampoline();
- init_rsp = idle->thread.rsp;
+ init_rsp = c_idle.idle->thread.rsp;
per_cpu(init_tss,cpu).rsp0 = init_rsp;
initial_code = start_secondary;
- clear_ti_thread_flag(idle->thread_info, TIF_FORK);
+ clear_ti_thread_flag(c_idle.idle->thread_info, TIF_FORK);
printk(KERN_INFO "Booting processor %d/%d rip %lx rsp %lx\n", cpu, apicid,
start_rip, init_rsp);
@@ -746,51 +863,6 @@ cycles_t cacheflush_time;
unsigned long cache_decay_ticks;
/*
- * Construct cpu_sibling_map[], so that we can tell the sibling CPU
- * on SMT systems efficiently.
- */
-static __cpuinit void detect_siblings(void)
-{
- int cpu;
-
- for (cpu = 0; cpu < NR_CPUS; cpu++) {
- cpus_clear(cpu_sibling_map[cpu]);
- cpus_clear(cpu_core_map[cpu]);
- }
-
- for_each_online_cpu (cpu) {
- struct cpuinfo_x86 *c = cpu_data + cpu;
- int siblings = 0;
- int i;
- if (smp_num_siblings > 1) {
- for_each_online_cpu (i) {
- if (cpu_core_id[cpu] == cpu_core_id[i]) {
- siblings++;
- cpu_set(i, cpu_sibling_map[cpu]);
- }
- }
- } else {
- siblings++;
- cpu_set(cpu, cpu_sibling_map[cpu]);
- }
-
- if (siblings != smp_num_siblings) {
- printk(KERN_WARNING
- "WARNING: %d siblings found for CPU%d, should be %d\n",
- siblings, cpu, smp_num_siblings);
- smp_num_siblings = siblings;
- }
- if (c->x86_num_cores > 1) {
- for_each_online_cpu(i) {
- if (phys_proc_id[cpu] == phys_proc_id[i])
- cpu_set(i, cpu_core_map[cpu]);
- }
- } else
- cpu_core_map[cpu] = cpu_sibling_map[cpu];
- }
-}
-
-/*
* Cleanup possible dangling ends...
*/
static __cpuinit void smp_cleanup_boot(void)
@@ -823,7 +895,7 @@ static __cpuinit void smp_cleanup_boot(void)
*
* RED-PEN audit/test this more. I bet there is more state messed up here.
*/
-static __cpuinit void disable_smp(void)
+static __init void disable_smp(void)
{
cpu_present_map = cpumask_of_cpu(0);
cpu_possible_map = cpumask_of_cpu(0);
@@ -838,7 +910,7 @@ static __cpuinit void disable_smp(void)
/*
* Handle user cpus=... parameter.
*/
-static __cpuinit void enforce_max_cpus(unsigned max_cpus)
+static __init void enforce_max_cpus(unsigned max_cpus)
{
int i, k;
k = 0;
@@ -855,7 +927,7 @@ static __cpuinit void enforce_max_cpus(unsigned max_cpus)
/*
* Various sanity checks.
*/
-static int __cpuinit smp_sanity_check(unsigned max_cpus)
+static int __init smp_sanity_check(unsigned max_cpus)
{
if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) {
printk("weird, boot CPU (#%d) not listed by the BIOS.\n",
@@ -913,7 +985,7 @@ static int __cpuinit smp_sanity_check(unsigned max_cpus)
* Prepare for SMP bootup. The MP table or ACPI has been read
* earlier. Just do some sanity checking here and enable APIC mode.
*/
-void __cpuinit smp_prepare_cpus(unsigned int max_cpus)
+void __init smp_prepare_cpus(unsigned int max_cpus)
{
int i;
@@ -930,10 +1002,9 @@ void __cpuinit smp_prepare_cpus(unsigned int max_cpus)
int apicid = cpu_present_to_apicid(i);
if (physid_isset(apicid, phys_cpu_present_map)) {
cpu_set(i, cpu_present_map);
- /* possible map would be different if we supported real
- CPU hotplug. */
cpu_set(i, cpu_possible_map);
}
+ fixup_cpu_possible_map(i);
}
if (smp_sanity_check(max_cpus) < 0) {
@@ -978,13 +1049,13 @@ void __init smp_prepare_boot_cpu(void)
int me = smp_processor_id();
cpu_set(me, cpu_online_map);
cpu_set(me, cpu_callout_map);
+ cpu_set(0, cpu_sibling_map[0]);
+ cpu_set(0, cpu_core_map[0]);
+ per_cpu(cpu_state, me) = CPU_ONLINE;
}
/*
* Entry point to boot a CPU.
- *
- * This is all __cpuinit, not __devinit for now because we don't support
- * CPU hotplug (yet).
*/
int __cpuinit __cpu_up(unsigned int cpu)
{
@@ -1001,6 +1072,15 @@ int __cpuinit __cpu_up(unsigned int cpu)
return -EINVAL;
}
+ /*
+ * Already booted CPU?
+ */
+ if (cpu_isset(cpu, cpu_callin_map)) {
+ Dprintk("do_boot_cpu %d Already started\n", cpu);
+ return -ENOSYS;
+ }
+
+ per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
/* Boot it! */
err = do_boot_cpu(cpu, apicid);
if (err < 0) {
@@ -1013,23 +1093,118 @@ int __cpuinit __cpu_up(unsigned int cpu)
while (!cpu_isset(cpu, cpu_online_map))
cpu_relax();
- return 0;
+ err = 0;
+
+ return err;
}
/*
* Finish the SMP boot.
*/
-void __cpuinit smp_cpus_done(unsigned int max_cpus)
+void __init smp_cpus_done(unsigned int max_cpus)
{
+#ifndef CONFIG_HOTPLUG_CPU
zap_low_mappings();
+#endif
smp_cleanup_boot();
#ifdef CONFIG_X86_IO_APIC
setup_ioapic_dest();
#endif
- detect_siblings();
time_init_gtod();
check_nmi_watchdog();
}
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+static void remove_siblinginfo(int cpu)
+{
+ int sibling;
+
+ for_each_cpu_mask(sibling, cpu_sibling_map[cpu])
+ cpu_clear(cpu, cpu_sibling_map[sibling]);
+ for_each_cpu_mask(sibling, cpu_core_map[cpu])
+ cpu_clear(cpu, cpu_core_map[sibling]);
+ cpus_clear(cpu_sibling_map[cpu]);
+ cpus_clear(cpu_core_map[cpu]);
+ phys_proc_id[cpu] = BAD_APICID;
+ cpu_core_id[cpu] = BAD_APICID;
+}
+
+void remove_cpu_from_maps(void)
+{
+ int cpu = smp_processor_id();
+
+ cpu_clear(cpu, cpu_callout_map);
+ cpu_clear(cpu, cpu_callin_map);
+ clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */
+}
+
+int __cpu_disable(void)
+{
+ int cpu = smp_processor_id();
+
+ /*
+ * Perhaps use cpufreq to drop frequency, but that could go
+ * into generic code.
+ *
+ * We won't take down the boot processor on i386 because some
+ * interrupts can only be serviced by the BSP, especially so
+ * if we're not using an IOAPIC. -zwane
+ */
+ if (cpu == 0)
+ return -EBUSY;
+
+ disable_APIC_timer();
+
+ /*
+ * HACK:
+ * Allow any queued timer interrupts to get serviced
+ * This is only a temporary solution until we cleanup
+ * fixup_irqs as we do for IA64.
+ */
+ local_irq_enable();
+ mdelay(1);
+
+ local_irq_disable();
+ remove_siblinginfo(cpu);
+
+ /* It's now safe to remove this processor from the online map */
+ cpu_clear(cpu, cpu_online_map);
+ remove_cpu_from_maps();
+ fixup_irqs(cpu_online_map);
+ return 0;
+}
+
+void __cpu_die(unsigned int cpu)
+{
+ /* We don't do anything here: idle task is faking death itself. */
+ unsigned int i;
+
+ for (i = 0; i < 10; i++) {
+ /* They ack this in play_dead by setting CPU_DEAD */
+ if (per_cpu(cpu_state, cpu) == CPU_DEAD) {
+ printk ("CPU %d is now offline\n", cpu);
+ return;
+ }
+ current->state = TASK_UNINTERRUPTIBLE;
+ schedule_timeout(HZ/10);
+ }
+ printk(KERN_ERR "CPU %u didn't die...\n", cpu);
+}
+
+#else /* ... !CONFIG_HOTPLUG_CPU */
+
+int __cpu_disable(void)
+{
+ return -ENOSYS;
+}
+
+void __cpu_die(unsigned int cpu)
+{
+ /* We said "no" in __cpu_disable */
+ BUG();
+}
+#endif /* CONFIG_HOTPLUG_CPU */
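
With __cpu_disable()/__cpu_die() above wired up, a CPU can be taken
down from user space through the generic sysfs hotplug attribute (the
path is assumed from the common hotplug interface; it is not part of
this patch):

	#include <fcntl.h>
	#include <unistd.h>

	int main(void)
	{
		/* offline CPU 2; the request ends up in __cpu_disable(),
		 * and __cpu_die() then waits for play_dead()'s CPU_DEAD ack */
		int fd = open("/sys/devices/system/cpu/cpu2/online", O_WRONLY);

		if (fd < 0)
			return 1;
		write(fd, "0", 1);	/* write "1" to bring it back online */
		close(fd);
		return 0;
	}
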
diff --git a/arch/x86_64/kernel/suspend.c b/arch/x86_64/kernel/suspend.c
index ebaa1e37d65..0612640d91b 100644
--- a/arch/x86_64/kernel/suspend.c
+++ b/arch/x86_64/kernel/suspend.c
@@ -44,7 +44,6 @@ void __save_processor_state(struct saved_context *ctxt)
*/
asm volatile ("sgdt %0" : "=m" (ctxt->gdt_limit));
asm volatile ("sidt %0" : "=m" (ctxt->idt_limit));
- asm volatile ("sldt %0" : "=m" (ctxt->ldt));
asm volatile ("str %0" : "=m" (ctxt->tr));
/* XMM0..XMM15 should be handled by kernel_fpu_begin(). */
@@ -69,6 +68,7 @@ void __save_processor_state(struct saved_context *ctxt)
asm volatile ("movq %%cr2, %0" : "=r" (ctxt->cr2));
asm volatile ("movq %%cr3, %0" : "=r" (ctxt->cr3));
asm volatile ("movq %%cr4, %0" : "=r" (ctxt->cr4));
+ asm volatile ("movq %%cr8, %0" : "=r" (ctxt->cr8));
}
void save_processor_state(void)
@@ -90,12 +90,20 @@ void __restore_processor_state(struct saved_context *ctxt)
/*
* control registers
*/
+ asm volatile ("movq %0, %%cr8" :: "r" (ctxt->cr8));
asm volatile ("movq %0, %%cr4" :: "r" (ctxt->cr4));
asm volatile ("movq %0, %%cr3" :: "r" (ctxt->cr3));
asm volatile ("movq %0, %%cr2" :: "r" (ctxt->cr2));
asm volatile ("movq %0, %%cr0" :: "r" (ctxt->cr0));
/*
+ * now restore the descriptor tables to their proper values
+ * ltr is done in fix_processor_context().
+ */
+ asm volatile ("lgdt %0" :: "m" (ctxt->gdt_limit));
+ asm volatile ("lidt %0" :: "m" (ctxt->idt_limit));
+
+ /*
* segment registers
*/
asm volatile ("movw %0, %%ds" :: "r" (ctxt->ds));
@@ -108,17 +116,10 @@ void __restore_processor_state(struct saved_context *ctxt)
wrmsrl(MSR_GS_BASE, ctxt->gs_base);
wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base);
- /*
- * now restore the descriptor tables to their proper values
- * ltr is done i fix_processor_context().
- */
- asm volatile ("lgdt %0" :: "m" (ctxt->gdt_limit));
- asm volatile ("lidt %0" :: "m" (ctxt->idt_limit));
- asm volatile ("lldt %0" :: "m" (ctxt->ldt));
-
fix_processor_context();
do_fpu_end();
+ mtrr_ap_init();
}
void restore_processor_state(void)
diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
index 121646fc43f..10273663000 100644
--- a/arch/x86_64/kernel/traps.c
+++ b/arch/x86_64/kernel/traps.c
@@ -586,11 +586,17 @@ static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
asmlinkage void default_do_nmi(struct pt_regs *regs)
{
unsigned char reason = 0;
+ int cpu;
+
+ cpu = smp_processor_id();
/* Only the BSP gets external NMIs from the system. */
- if (!smp_processor_id())
+ if (!cpu)
reason = get_nmi_reason();
+ if (!cpu_online(cpu))
+ return;
+
if (!(reason & 0xc0)) {
if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 0, SIGINT)
== NOTIFY_STOP)
diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S
index 59ebd5beda8..2a94f9b60b2 100644
--- a/arch/x86_64/kernel/vmlinux.lds.S
+++ b/arch/x86_64/kernel/vmlinux.lds.S
@@ -2,7 +2,10 @@
* Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>;
*/
+#define LOAD_OFFSET __START_KERNEL_map
+
#include <asm-generic/vmlinux.lds.h>
+#include <asm/page.h>
#include <linux/config.h>
OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64")
@@ -11,28 +14,30 @@ ENTRY(phys_startup_64)
jiffies_64 = jiffies;
SECTIONS
{
- . = 0xffffffff80100000;
+ . = __START_KERNEL;
phys_startup_64 = startup_64 - LOAD_OFFSET;
_text = .; /* Text and read-only data */
- .text : {
+ .text : AT(ADDR(.text) - LOAD_OFFSET) {
*(.text)
SCHED_TEXT
LOCK_TEXT
*(.fixup)
*(.gnu.warning)
} = 0x9090
- .text.lock : { *(.text.lock) } /* out-of-line lock text */
+ /* out-of-line lock text */
+ .text.lock : AT(ADDR(.text.lock) - LOAD_OFFSET) { *(.text.lock) }
_etext = .; /* End of text section */
. = ALIGN(16); /* Exception table */
__start___ex_table = .;
- __ex_table : { *(__ex_table) }
+ __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { *(__ex_table) }
__stop___ex_table = .;
RODATA
- .data : { /* Data */
+ /* Data */
+ .data : AT(ADDR(.data) - LOAD_OFFSET) {
*(.data)
CONSTRUCTORS
}
@@ -40,62 +45,99 @@ SECTIONS
_edata = .; /* End of data section */
__bss_start = .; /* BSS */
- .bss : {
+ .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
*(.bss.page_aligned)
*(.bss)
}
__bss_end = .;
+ . = ALIGN(PAGE_SIZE);
+ . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
+ .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
+ *(.data.cacheline_aligned)
+ }
. = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
- .data.cacheline_aligned : { *(.data.cacheline_aligned) }
+ .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
+ *(.data.read_mostly)
+ }
+
+#define VSYSCALL_ADDR (-10*1024*1024)
+#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095))
+#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095))
+
+#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR)
+#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET)
-#define AFTER(x) BINALIGN(LOADADDR(x) + SIZEOF(x), 16)
-#define BINALIGN(x,y) (((x) + (y) - 1) & ~((y) - 1))
-#define CACHE_ALIGN(x) BINALIGN(x, CONFIG_X86_L1_CACHE_BYTES)
+#define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR)
+#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET)
+
+ . = VSYSCALL_ADDR;
+ .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { *(.vsyscall_0) }
+ __vsyscall_0 = VSYSCALL_VIRT_ADDR;
- .vsyscall_0 -10*1024*1024: AT ((LOADADDR(.data.cacheline_aligned) + SIZEOF(.data.cacheline_aligned) + 4095) & ~(4095)) { *(.vsyscall_0) }
- __vsyscall_0 = LOADADDR(.vsyscall_0);
. = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
- .xtime_lock : AT CACHE_ALIGN(AFTER(.vsyscall_0)) { *(.xtime_lock) }
- xtime_lock = LOADADDR(.xtime_lock);
- .vxtime : AT AFTER(.xtime_lock) { *(.vxtime) }
- vxtime = LOADADDR(.vxtime);
- .wall_jiffies : AT AFTER(.vxtime) { *(.wall_jiffies) }
- wall_jiffies = LOADADDR(.wall_jiffies);
- .sys_tz : AT AFTER(.wall_jiffies) { *(.sys_tz) }
- sys_tz = LOADADDR(.sys_tz);
- .sysctl_vsyscall : AT AFTER(.sys_tz) { *(.sysctl_vsyscall) }
- sysctl_vsyscall = LOADADDR(.sysctl_vsyscall);
- .xtime : AT AFTER(.sysctl_vsyscall) { *(.xtime) }
- xtime = LOADADDR(.xtime);
+ .xtime_lock : AT(VLOAD(.xtime_lock)) { *(.xtime_lock) }
+ xtime_lock = VVIRT(.xtime_lock);
+
+ .vxtime : AT(VLOAD(.vxtime)) { *(.vxtime) }
+ vxtime = VVIRT(.vxtime);
+
+ .wall_jiffies : AT(VLOAD(.wall_jiffies)) { *(.wall_jiffies) }
+ wall_jiffies = VVIRT(.wall_jiffies);
+
+ .sys_tz : AT(VLOAD(.sys_tz)) { *(.sys_tz) }
+ sys_tz = VVIRT(.sys_tz);
+
+ .sysctl_vsyscall : AT(VLOAD(.sysctl_vsyscall)) { *(.sysctl_vsyscall) }
+ sysctl_vsyscall = VVIRT(.sysctl_vsyscall);
+
+ .xtime : AT(VLOAD(.xtime)) { *(.xtime) }
+ xtime = VVIRT(.xtime);
+
. = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
- .jiffies : AT CACHE_ALIGN(AFTER(.xtime)) { *(.jiffies) }
- jiffies = LOADADDR(.jiffies);
- .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT (LOADADDR(.vsyscall_0) + 1024) { *(.vsyscall_1) }
- . = LOADADDR(.vsyscall_0) + 4096;
+ .jiffies : AT(VLOAD(.jiffies)) { *(.jiffies) }
+ jiffies = VVIRT(.jiffies);
+
+ .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) { *(.vsyscall_1) }
+ .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) { *(.vsyscall_2) }
+ .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) { *(.vsyscall_3) }
+
+ . = VSYSCALL_VIRT_ADDR + 4096;
+
+#undef VSYSCALL_ADDR
+#undef VSYSCALL_PHYS_ADDR
+#undef VSYSCALL_VIRT_ADDR
+#undef VLOAD_OFFSET
+#undef VLOAD
+#undef VVIRT_OFFSET
+#undef VVIRT
. = ALIGN(8192); /* init_task */
- .data.init_task : { *(.data.init_task) }
+ .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
+ *(.data.init_task)
+ }
. = ALIGN(4096);
- .data.page_aligned : { *(.data.page_aligned) }
+ .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
+ *(.data.page_aligned)
+ }
. = ALIGN(4096); /* Init code and data */
__init_begin = .;
- .init.text : {
+ .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
_sinittext = .;
*(.init.text)
_einittext = .;
}
__initdata_begin = .;
- .init.data : { *(.init.data) }
+ .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { *(.init.data) }
__initdata_end = .;
. = ALIGN(16);
__setup_start = .;
- .init.setup : { *(.init.setup) }
+ .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { *(.init.setup) }
__setup_end = .;
__initcall_start = .;
- .initcall.init : {
+ .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
*(.initcall1.init)
*(.initcall2.init)
*(.initcall3.init)
@@ -106,32 +148,38 @@ SECTIONS
}
__initcall_end = .;
__con_initcall_start = .;
- .con_initcall.init : { *(.con_initcall.init) }
+ .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
+ *(.con_initcall.init)
+ }
__con_initcall_end = .;
SECURITY_INIT
. = ALIGN(8);
__alt_instructions = .;
- .altinstructions : { *(.altinstructions) }
+ .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
+ *(.altinstructions)
+ }
__alt_instructions_end = .;
- .altinstr_replacement : { *(.altinstr_replacement) }
+ .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
+ *(.altinstr_replacement)
+ }
/* .exit.text is discard at runtime, not link time, to deal with references
from .altinstructions and .eh_frame */
- .exit.text : { *(.exit.text) }
- .exit.data : { *(.exit.data) }
+ .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) }
+ .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { *(.exit.data) }
. = ALIGN(4096);
__initramfs_start = .;
- .init.ramfs : { *(.init.ramfs) }
+ .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { *(.init.ramfs) }
__initramfs_end = .;
. = ALIGN(32);
__per_cpu_start = .;
- .data.percpu : { *(.data.percpu) }
+ .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { *(.data.percpu) }
__per_cpu_end = .;
. = ALIGN(4096);
__init_end = .;
. = ALIGN(4096);
__nosave_begin = .;
- .data_nosave : { *(.data.nosave) }
+ .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { *(.data.nosave) }
. = ALIGN(4096);
__nosave_end = .;
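
The VLOAD()/VVIRT() macros above just shift between the vsyscall
page's fixed virtual address (-10 MB, i.e. 0xffffffffff600000) and
wherever it lands physically after .data.read_mostly. A small worked
sketch, with a hypothetical physical placement:

	#include <stdio.h>

	int main(void)
	{
		unsigned long vsyscall_virt = -10UL * 1024 * 1024; /* ...ff600000 */
		unsigned long vsyscall_phys = 0x5a3000;  /* assumed placement */
		unsigned long vload_off = vsyscall_virt - vsyscall_phys;

		/* .vsyscall_1 is linked at vsyscall_virt + 1024; AT(VLOAD(x))
		 * subtracts the constant offset to get its load address. */
		printf("load addr of .vsyscall_1 = %#lx\n",
		       (vsyscall_virt + 1024) - vload_off);  /* phys + 1024 */
		return 0;
	}
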
diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c
index 84cde796ecb..ac61c186eb0 100644
--- a/arch/x86_64/mm/numa.c
+++ b/arch/x86_64/mm/numa.c
@@ -251,7 +251,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
}
-__init void numa_add_cpu(int cpu)
+__cpuinit void numa_add_cpu(int cpu)
{
/* BP is initialized elsewhere */
if (cpu)
diff --git a/arch/x86_64/pci/mmconfig.c b/arch/x86_64/pci/mmconfig.c
index b693c232fd0..657e88aa090 100644
--- a/arch/x86_64/pci/mmconfig.c
+++ b/arch/x86_64/pci/mmconfig.c
@@ -7,25 +7,50 @@
#include <linux/pci.h>
#include <linux/init.h>
+#include <linux/acpi.h>
#include "pci.h"
#define MMCONFIG_APER_SIZE (256*1024*1024)
-/* The physical address of the MMCONFIG aperture. Set from ACPI tables. */
-u32 pci_mmcfg_base_addr;
-
/* Static virtual mapping of the MMCONFIG aperture */
-char *pci_mmcfg_virt;
+struct mmcfg_virt {
+ struct acpi_table_mcfg_config *cfg;
+ char *virt;
+};
+static struct mmcfg_virt *pci_mmcfg_virt;
-static inline char *pci_dev_base(unsigned int bus, unsigned int devfn)
+static char *get_virt(unsigned int seg, int bus)
{
- return pci_mmcfg_virt + ((bus << 20) | (devfn << 12));
+ int cfg_num = -1;
+ struct acpi_table_mcfg_config *cfg;
+
+ while (1) {
+ ++cfg_num;
+ if (cfg_num >= pci_mmcfg_config_num) {
+ /* Something bad is going on: no matching cfg table
+ * was found, so fall back to the old behaviour and
+ * just rely on the first entry being correct. */
+ return pci_mmcfg_virt[0].virt;
+ }
+ cfg = pci_mmcfg_virt[cfg_num].cfg;
+ if (cfg->pci_segment_group_number != seg)
+ continue;
+ if ((cfg->start_bus_number <= bus) &&
+ (cfg->end_bus_number >= bus))
+ return pci_mmcfg_virt[cfg_num].virt;
+ }
+}
+
+static inline char *pci_dev_base(unsigned int seg, unsigned int bus, unsigned int devfn)
+{
+ return get_virt(seg, bus) + ((bus << 20) | (devfn << 12));
}
static int pci_mmcfg_read(unsigned int seg, unsigned int bus,
unsigned int devfn, int reg, int len, u32 *value)
{
- char *addr = pci_dev_base(bus, devfn);
+ char *addr = pci_dev_base(seg, bus, devfn);
if (unlikely(!value || (bus > 255) || (devfn > 255) || (reg > 4095)))
return -EINVAL;
@@ -48,7 +73,7 @@ static int pci_mmcfg_read(unsigned int seg, unsigned int bus,
static int pci_mmcfg_write(unsigned int seg, unsigned int bus,
unsigned int devfn, int reg, int len, u32 value)
{
- char *addr = pci_dev_base(bus,devfn);
+ char *addr = pci_dev_base(seg, bus, devfn);
if (unlikely((bus > 255) || (devfn > 255) || (reg > 4095)))
return -EINVAL;
@@ -75,9 +100,15 @@ static struct pci_raw_ops pci_mmcfg = {
static int __init pci_mmcfg_init(void)
{
+ int i;
+
if ((pci_probe & PCI_PROBE_MMCONF) == 0)
return 0;
- if (!pci_mmcfg_base_addr)
+
+ acpi_table_parse(ACPI_MCFG, acpi_parse_mcfg);
+ if ((pci_mmcfg_config_num == 0) ||
+ (pci_mmcfg_config == NULL) ||
+ (pci_mmcfg_config[0].base_address == 0))
return 0;
/* Kludge for now. Don't use mmconfig on AMD systems because
@@ -88,13 +119,22 @@ static int __init pci_mmcfg_init(void)
return 0;
/* RED-PEN i386 doesn't do _nocache right now */
- pci_mmcfg_virt = ioremap_nocache(pci_mmcfg_base_addr, MMCONFIG_APER_SIZE);
- if (!pci_mmcfg_virt) {
- printk("PCI: Cannot map mmconfig aperture\n");
+ pci_mmcfg_virt = kmalloc(sizeof(*pci_mmcfg_virt) * pci_mmcfg_config_num, GFP_KERNEL);
+ if (pci_mmcfg_virt == NULL) {
+ printk("PCI: Can not allocate memory for mmconfig structures\n");
return 0;
- }
+ }
+ for (i = 0; i < pci_mmcfg_config_num; ++i) {
+ pci_mmcfg_virt[i].cfg = &pci_mmcfg_config[i];
+ pci_mmcfg_virt[i].virt = ioremap_nocache(pci_mmcfg_config[i].base_address, MMCONFIG_APER_SIZE);
+ if (!pci_mmcfg_virt[i].virt) {
+ printk("PCI: Cannot map mmconfig aperture for segment %d\n",
+ pci_mmcfg_config[i].pci_segment_group_number);
+ return 0;
+ }
+ printk(KERN_INFO "PCI: Using MMCONFIG at %x\n", pci_mmcfg_config[i].base_address);
+ }
- printk(KERN_INFO "PCI: Using MMCONFIG at %x\n", pci_mmcfg_base_addr);
raw_pci_ops = &pci_mmcfg;
pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF;
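
pci_dev_base() above encodes the MMCONFIG (ECAM) layout: one 4 KB
configuration page per function, 8 functions per device, 32 devices
per bus, so each bus consumes 1 MB and the 256 MB aperture covers
buses 0-255. The offset arithmetic as a sketch:

	/* ECAM offset as used by pci_dev_base()/pci_mmcfg_read() above */
	static inline unsigned long ecam_offset(unsigned int bus,
						unsigned int dev,
						unsigned int fn,
						unsigned int reg)
	{
		unsigned int devfn = (dev << 3) | fn;	/* cf. PCI_DEVFN() */

		return ((unsigned long)bus << 20) | (devfn << 12) | reg;
	}
	/* e.g. bus 1, dev 2, fn 0, reg 0x10 (BAR0) => offset 0x110010 */
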