diff options
Diffstat (limited to 'arch/x86_64')
40 files changed, 1869 insertions, 446 deletions
diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig index db259757dc8..4b8326177c5 100644 --- a/arch/x86_64/Kconfig +++ b/arch/x86_64/Kconfig @@ -207,33 +207,6 @@ config SMP If you don't know what to do here, say N. -config PREEMPT - bool "Preemptible Kernel" - ---help--- - This option reduces the latency of the kernel when reacting to - real-time or interactive events by allowing a low priority process to - be preempted even if it is in kernel mode executing a system call. - This allows applications to run more reliably even when the system is - under load. On contrary it may also break your drivers and add - priority inheritance problems to your system. Don't select it if - you rely on a stable system or have slightly obscure hardware. - It's also not very well tested on x86-64 currently. - You have been warned. - - Say Y here if you are feeling brave and building a kernel for a - desktop, embedded or real-time system. Say N if you are unsure. - -config PREEMPT_BKL - bool "Preempt The Big Kernel Lock" - depends on PREEMPT - default y - help - This option reduces the latency of the kernel by making the - big kernel lock preemptible. - - Say Y here if you are building a kernel for a desktop system. - Say N if you are unsure. - config SCHED_SMT bool "SMT (Hyperthreading) scheduler support" depends on SMP @@ -244,6 +217,8 @@ config SCHED_SMT cost of slightly increased overhead in some places. If unsure say N here. +source "kernel/Kconfig.preempt" + config K8_NUMA bool "K8 NUMA support" select NUMA @@ -313,6 +288,15 @@ config NR_CPUS This is purely to save memory - each supported CPU requires memory in the static kernel configuration. +config HOTPLUG_CPU + bool "Support for hot-pluggable CPUs (EXPERIMENTAL)" + depends on SMP && HOTPLUG && EXPERIMENTAL + help + Say Y here to experiment with turning CPUs off and on. CPUs + can be controlled through /sys/devices/system/cpu/cpu#. + Say N if you want to disable CPU hotplug. 
+ + config HPET_TIMER bool default y @@ -385,6 +369,34 @@ config X86_MCE_INTEL Additional support for intel specific MCE features such as the thermal monitor. +config PHYSICAL_START + hex "Physical address where the kernel is loaded" if EMBEDDED + default "0x100000" + help + This gives the physical address where the kernel is loaded. + Primarily used in the case of kexec on panic where the + fail safe kernel needs to run at a different address than + the panic-ed kernel. + + Don't change this unless you know what you are doing. + +config KEXEC + bool "kexec system call (EXPERIMENTAL)" + depends on EXPERIMENTAL + help + kexec is a system call that implements the ability to shutdown your + current kernel, and to start another kernel. It is like a reboot + but it is independent of the system firmware. And like a reboot + you can start any kernel with it, not just Linux. + + The name comes from the similarity to the exec system call. + + It is an ongoing process to be certain the hardware in a machine + is properly shutdown, so do not be surprised if this code does not + initially work for you. It may help to enable device hotplugging + support. As of this writing the exact hardware interface is + strongly in flux, so no good recommendation can be made. 
+ config SECCOMP bool "Enable seccomp to safely compute untrusted bytecode" depends on PROC_FS @@ -503,6 +515,8 @@ config UID16 endmenu +source "net/Kconfig" + source drivers/Kconfig source "drivers/firmware/Kconfig" diff --git a/arch/x86_64/Makefile b/arch/x86_64/Makefile index 6f90c246c41..42891569767 100644 --- a/arch/x86_64/Makefile +++ b/arch/x86_64/Makefile @@ -35,7 +35,7 @@ export IA32_CC IA32_LD IA32_AS IA32_OBJCOPY IA32_CPP LDFLAGS := -m elf_x86_64 OBJCOPYFLAGS := -O binary -R .note -R .comment -S -LDFLAGS_vmlinux := -e stext +LDFLAGS_vmlinux := CHECKFLAGS += -D__x86_64__ -m64 @@ -65,7 +65,9 @@ CFLAGS += $(call cc-option,-mno-sse -mno-mmx -mno-sse2 -mno-3dnow,) head-y := arch/x86_64/kernel/head.o arch/x86_64/kernel/head64.o arch/x86_64/kernel/init_task.o libs-y += arch/x86_64/lib/ -core-y += arch/x86_64/kernel/ arch/x86_64/mm/ +core-y += arch/x86_64/kernel/ \ + arch/x86_64/mm/ \ + arch/x86_64/crypto/ core-$(CONFIG_IA32_EMULATION) += arch/x86_64/ia32/ drivers-$(CONFIG_PCI) += arch/x86_64/pci/ drivers-$(CONFIG_OPROFILE) += arch/x86_64/oprofile/ diff --git a/arch/x86_64/boot/compressed/head.S b/arch/x86_64/boot/compressed/head.S index 27264dbd575..6f55565e4d4 100644 --- a/arch/x86_64/boot/compressed/head.S +++ b/arch/x86_64/boot/compressed/head.S @@ -2,8 +2,6 @@ * linux/boot/head.S * * Copyright (C) 1991, 1992, 1993 Linus Torvalds - * - * $Id: head.S,v 1.3 2001/04/20 00:59:28 ak Exp $ */ /* @@ -21,13 +19,14 @@ */ /* - * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996 + * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996 */ .code32 .text #include <linux/linkage.h> #include <asm/segment.h> +#include <asm/page.h> .code32 .globl startup_32 @@ -77,7 +76,7 @@ startup_32: jnz 3f addl $8,%esp xorl %ebx,%ebx - ljmp $(__KERNEL_CS), $0x100000 + ljmp $(__KERNEL_CS), $__PHYSICAL_START /* * We come here, if we were loaded high. 
@@ -103,7 +102,7 @@ startup_32: popl %ecx # lcount popl %edx # high_buffer_start popl %eax # hcount - movl $0x100000,%edi + movl $__PHYSICAL_START,%edi cli # make sure we don't get interrupted ljmp $(__KERNEL_CS), $0x1000 # and jump to the move routine @@ -128,7 +127,7 @@ move_routine_start: movsl movl %ebx,%esi # Restore setup pointer xorl %ebx,%ebx - ljmp $(__KERNEL_CS), $0x100000 + ljmp $(__KERNEL_CS), $__PHYSICAL_START move_routine_end: diff --git a/arch/x86_64/boot/compressed/misc.c b/arch/x86_64/boot/compressed/misc.c index c8b9216f9e6..b38d5b8b5fb 100644 --- a/arch/x86_64/boot/compressed/misc.c +++ b/arch/x86_64/boot/compressed/misc.c @@ -11,6 +11,7 @@ #include "miscsetup.h" #include <asm/io.h> +#include <asm/page.h> /* * gzip declarations @@ -92,8 +93,11 @@ static unsigned long output_ptr = 0; static void *malloc(int size); static void free(void *where); +void* memset(void* s, int c, unsigned n); +void* memcpy(void* dest, const void* src, unsigned n); + static void putstr(const char *); - + extern int end; static long free_mem_ptr = (long)&end; static long free_mem_end_ptr; @@ -284,7 +288,7 @@ void setup_normal_output_buffer(void) #else if ((ALT_MEM_K > EXT_MEM_K ? 
ALT_MEM_K : EXT_MEM_K) < 1024) error("Less than 2MB of memory"); #endif - output_data = (char *)0x100000; /* Points to 1M */ + output_data = (char *)__PHYSICAL_START; /* Normally Points to 1M */ free_mem_end_ptr = (long)real_mode; } @@ -307,8 +311,8 @@ void setup_output_buffer_if_we_run_high(struct moveparams *mv) low_buffer_size = low_buffer_end - LOW_BUFFER_START; high_loaded = 1; free_mem_end_ptr = (long)high_buffer_start; - if ( (0x100000 + low_buffer_size) > ((ulg)high_buffer_start)) { - high_buffer_start = (uch *)(0x100000 + low_buffer_size); + if ( (__PHYSICAL_START + low_buffer_size) > ((ulg)high_buffer_start)) { + high_buffer_start = (uch *)(__PHYSICAL_START + low_buffer_size); mv->hcount = 0; /* say: we need not to move high_buffer */ } else mv->hcount = -1; diff --git a/arch/x86_64/boot/install.sh b/arch/x86_64/boot/install.sh index f17b40dfc0f..198af15a775 100644 --- a/arch/x86_64/boot/install.sh +++ b/arch/x86_64/boot/install.sh @@ -1,6 +1,6 @@ #!/bin/sh # -# arch/i386/boot/install.sh +# arch/x86_64/boot/install.sh # # This file is subject to the terms and conditions of the GNU General Public # License. See the file "COPYING" in the main directory of this archive diff --git a/arch/x86_64/boot/setup.S b/arch/x86_64/boot/setup.S index 75d4d2ad93b..ff58b2832b7 100644 --- a/arch/x86_64/boot/setup.S +++ b/arch/x86_64/boot/setup.S @@ -33,7 +33,7 @@ * Transcribed from Intel (as86) -> AT&T (gas) by Chris Noe, May 1999. * <stiker@northlink.com> * - * Fix to work around buggy BIOSes which dont use carry bit correctly + * Fix to work around buggy BIOSes which don't use carry bit correctly * and/or report extended memory in CX/DX for e801h memory size detection * call. As a result the kernel got wrong figures. The int15/e801h docs * from Ralf Brown interrupt list seem to indicate AX/BX should be used @@ -383,7 +383,7 @@ sse_ok: # a whole bunch of different types, and allows memory holes and # everything. 
We scan through this memory map and build a list # of the first 32 memory areas, which we return at [E820MAP]. -# This is documented at http://www.teleport.com/~acpi/acpihtml/topic245.htm +# This is documented at http://www.acpi.info/, in the ACPI 2.0 specification. #define SMAP 0x534d4150 @@ -436,7 +436,7 @@ bail820: meme801: stc # fix to work around buggy - xorw %cx,%cx # BIOSes which dont clear/set + xorw %cx,%cx # BIOSes which don't clear/set xorw %dx,%dx # carry on pass/error of # e801h memory size call # or merely pass cx,dx though @@ -733,7 +733,7 @@ flush_instr: # # but we yet haven't reloaded the CS register, so the default size # of the target offset still is 16 bit. -# However, using an operant prefix (0x66), the CPU will properly +# However, using an operand prefix (0x66), the CPU will properly # take our 48 bit far pointer. (INTeL 80386 Programmer's Reference # Manual, Mixing 16-bit and 32-bit code, page 16-6) diff --git a/arch/x86_64/boot/tools/build.c b/arch/x86_64/boot/tools/build.c index c2fa6631317..18b5bac1c42 100644 --- a/arch/x86_64/boot/tools/build.c +++ b/arch/x86_64/boot/tools/build.c @@ -1,6 +1,4 @@ /* - * $Id: build.c,v 1.3 2001/06/26 15:14:50 pavel Exp $ - * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 1997 Martin Mares */ @@ -8,7 +6,8 @@ /* * This file builds a disk-image from three different files: * - * - bootsect: exactly 512 bytes of 8086 machine code, loads the rest + * - bootsect: compatibility mbr which prints an error message if + * someone tries to boot the kernel directly. * - setup: 8086 machine code, sets up system parm * - system: 80386 code for actual system * diff --git a/arch/x86_64/crypto/Makefile b/arch/x86_64/crypto/Makefile new file mode 100644 index 00000000000..426d20f4b72 --- /dev/null +++ b/arch/x86_64/crypto/Makefile @@ -0,0 +1,9 @@ +# +# x86_64/crypto/Makefile +# +# Arch-specific CryptoAPI modules. 
+# + +obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o + +aes-x86_64-y := aes-x86_64-asm.o aes.o diff --git a/arch/x86_64/crypto/aes-x86_64-asm.S b/arch/x86_64/crypto/aes-x86_64-asm.S new file mode 100644 index 00000000000..483cbb23ab8 --- /dev/null +++ b/arch/x86_64/crypto/aes-x86_64-asm.S @@ -0,0 +1,186 @@ +/* AES (Rijndael) implementation (FIPS PUB 197) for x86_64 + * + * Copyright (C) 2005 Andreas Steinmetz, <ast@domdv.de> + * + * License: + * This code can be distributed under the terms of the GNU General Public + * License (GPL) Version 2 provided that the above header down to and + * including this sentence is retained in full. + */ + +.extern aes_ft_tab +.extern aes_it_tab +.extern aes_fl_tab +.extern aes_il_tab + +.text + +#define R1 %rax +#define R1E %eax +#define R1X %ax +#define R1H %ah +#define R1L %al +#define R2 %rbx +#define R2E %ebx +#define R2X %bx +#define R2H %bh +#define R2L %bl +#define R3 %rcx +#define R3E %ecx +#define R3X %cx +#define R3H %ch +#define R3L %cl +#define R4 %rdx +#define R4E %edx +#define R4X %dx +#define R4H %dh +#define R4L %dl +#define R5 %rsi +#define R5E %esi +#define R6 %rdi +#define R6E %edi +#define R7 %rbp +#define R7E %ebp +#define R8 %r8 +#define R9 %r9 +#define R10 %r10 +#define R11 %r11 + +#define prologue(FUNC,BASE,B128,B192,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11) \ + .global FUNC; \ + .type FUNC,@function; \ + .align 8; \ +FUNC: movq r1,r2; \ + movq r3,r4; \ + leaq BASE+52(r8),r9; \ + movq r10,r11; \ + movl (r7),r5 ## E; \ + movl 4(r7),r1 ## E; \ + movl 8(r7),r6 ## E; \ + movl 12(r7),r7 ## E; \ + movl (r8),r10 ## E; \ + xorl -48(r9),r5 ## E; \ + xorl -44(r9),r1 ## E; \ + xorl -40(r9),r6 ## E; \ + xorl -36(r9),r7 ## E; \ + cmpl $24,r10 ## E; \ + jb B128; \ + leaq 32(r9),r9; \ + je B192; \ + leaq 32(r9),r9; + +#define epilogue(r1,r2,r3,r4,r5,r6,r7,r8,r9) \ + movq r1,r2; \ + movq r3,r4; \ + movl r5 ## E,(r9); \ + movl r6 ## E,4(r9); \ + movl r7 ## E,8(r9); \ + movl r8 ## E,12(r9); \ + ret; + +#define 
round(TAB,OFFSET,r1,r2,r3,r4,r5,r6,r7,r8,ra,rb,rc,rd) \ + movzbl r2 ## H,r5 ## E; \ + movzbl r2 ## L,r6 ## E; \ + movl TAB+1024(,r5,4),r5 ## E;\ + movw r4 ## X,r2 ## X; \ + movl TAB(,r6,4),r6 ## E; \ + roll $16,r2 ## E; \ + shrl $16,r4 ## E; \ + movzbl r4 ## H,r7 ## E; \ + movzbl r4 ## L,r4 ## E; \ + xorl OFFSET(r8),ra ## E; \ + xorl OFFSET+4(r8),rb ## E; \ + xorl TAB+3072(,r7,4),r5 ## E;\ + xorl TAB+2048(,r4,4),r6 ## E;\ + movzbl r1 ## L,r7 ## E; \ + movzbl r1 ## H,r4 ## E; \ + movl TAB+1024(,r4,4),r4 ## E;\ + movw r3 ## X,r1 ## X; \ + roll $16,r1 ## E; \ + shrl $16,r3 ## E; \ + xorl TAB(,r7,4),r5 ## E; \ + movzbl r3 ## H,r7 ## E; \ + movzbl r3 ## L,r3 ## E; \ + xorl TAB+3072(,r7,4),r4 ## E;\ + xorl TAB+2048(,r3,4),r5 ## E;\ + movzbl r1 ## H,r7 ## E; \ + movzbl r1 ## L,r3 ## E; \ + shrl $16,r1 ## E; \ + xorl TAB+3072(,r7,4),r6 ## E;\ + movl TAB+2048(,r3,4),r3 ## E;\ + movzbl r1 ## H,r7 ## E; \ + movzbl r1 ## L,r1 ## E; \ + xorl TAB+1024(,r7,4),r6 ## E;\ + xorl TAB(,r1,4),r3 ## E; \ + movzbl r2 ## H,r1 ## E; \ + movzbl r2 ## L,r7 ## E; \ + shrl $16,r2 ## E; \ + xorl TAB+3072(,r1,4),r3 ## E;\ + xorl TAB+2048(,r7,4),r4 ## E;\ + movzbl r2 ## H,r1 ## E; \ + movzbl r2 ## L,r2 ## E; \ + xorl OFFSET+8(r8),rc ## E; \ + xorl OFFSET+12(r8),rd ## E; \ + xorl TAB+1024(,r1,4),r3 ## E;\ + xorl TAB(,r2,4),r4 ## E; + +#define move_regs(r1,r2,r3,r4) \ + movl r3 ## E,r1 ## E; \ + movl r4 ## E,r2 ## E; + +#define entry(FUNC,BASE,B128,B192) \ + prologue(FUNC,BASE,B128,B192,R2,R8,R7,R9,R1,R3,R4,R6,R10,R5,R11) + +#define return epilogue(R8,R2,R9,R7,R5,R6,R3,R4,R11) + +#define encrypt_round(TAB,OFFSET) \ + round(TAB,OFFSET,R1,R2,R3,R4,R5,R6,R7,R10,R5,R6,R3,R4) \ + move_regs(R1,R2,R5,R6) + +#define encrypt_final(TAB,OFFSET) \ + round(TAB,OFFSET,R1,R2,R3,R4,R5,R6,R7,R10,R5,R6,R3,R4) + +#define decrypt_round(TAB,OFFSET) \ + round(TAB,OFFSET,R2,R1,R4,R3,R6,R5,R7,R10,R5,R6,R3,R4) \ + move_regs(R1,R2,R5,R6) + +#define decrypt_final(TAB,OFFSET) \ + 
round(TAB,OFFSET,R2,R1,R4,R3,R6,R5,R7,R10,R5,R6,R3,R4) + +/* void aes_encrypt(void *ctx, u8 *out, const u8 *in) */ + + entry(aes_encrypt,0,enc128,enc192) + encrypt_round(aes_ft_tab,-96) + encrypt_round(aes_ft_tab,-80) +enc192: encrypt_round(aes_ft_tab,-64) + encrypt_round(aes_ft_tab,-48) +enc128: encrypt_round(aes_ft_tab,-32) + encrypt_round(aes_ft_tab,-16) + encrypt_round(aes_ft_tab, 0) + encrypt_round(aes_ft_tab, 16) + encrypt_round(aes_ft_tab, 32) + encrypt_round(aes_ft_tab, 48) + encrypt_round(aes_ft_tab, 64) + encrypt_round(aes_ft_tab, 80) + encrypt_round(aes_ft_tab, 96) + encrypt_final(aes_fl_tab,112) + return + +/* void aes_decrypt(void *ctx, u8 *out, const u8 *in) */ + + entry(aes_decrypt,240,dec128,dec192) + decrypt_round(aes_it_tab,-96) + decrypt_round(aes_it_tab,-80) +dec192: decrypt_round(aes_it_tab,-64) + decrypt_round(aes_it_tab,-48) +dec128: decrypt_round(aes_it_tab,-32) + decrypt_round(aes_it_tab,-16) + decrypt_round(aes_it_tab, 0) + decrypt_round(aes_it_tab, 16) + decrypt_round(aes_it_tab, 32) + decrypt_round(aes_it_tab, 48) + decrypt_round(aes_it_tab, 64) + decrypt_round(aes_it_tab, 80) + decrypt_round(aes_it_tab, 96) + decrypt_final(aes_il_tab,112) + return diff --git a/arch/x86_64/crypto/aes.c b/arch/x86_64/crypto/aes.c new file mode 100644 index 00000000000..2b5c4010ce3 --- /dev/null +++ b/arch/x86_64/crypto/aes.c @@ -0,0 +1,324 @@ +/* + * Cryptographic API. + * + * AES Cipher Algorithm. + * + * Based on Brian Gladman's code. + * + * Linux developers: + * Alexander Kjeldaas <astor@fast.no> + * Herbert Valerio Riedel <hvr@hvrlab.org> + * Kyle McMartin <kyle@debian.org> + * Adam J. Richter <adam@yggdrasil.com> (conversion to 2.5 API). 
+ * Andreas Steinmetz <ast@domdv.de> (adapted to x86_64 assembler) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * --------------------------------------------------------------------------- + * Copyright (c) 2002, Dr Brian Gladman <brg@gladman.me.uk>, Worcester, UK. + * All rights reserved. + * + * LICENSE TERMS + * + * The free distribution and use of this software in both source and binary + * form is allowed (with or without changes) provided that: + * + * 1. distributions of this source code include the above copyright + * notice, this list of conditions and the following disclaimer; + * + * 2. distributions in binary form include the above copyright + * notice, this list of conditions and the following disclaimer + * in the documentation and/or other associated materials; + * + * 3. the copyright holder's name is not used to endorse products + * built using this software without specific written permission. + * + * ALTERNATIVELY, provided that this notice is retained in full, this product + * may be distributed under the terms of the GNU General Public License (GPL), + * in which case the provisions of the GPL apply INSTEAD OF those given above. + * + * DISCLAIMER + * + * This software is provided 'as is' with no explicit or implied warranties + * in respect of its properties, including, but not limited to, correctness + * and/or fitness for purpose. 
+ * --------------------------------------------------------------------------- + */ + +/* Some changes from the Gladman version: + s/RIJNDAEL(e_key)/E_KEY/g + s/RIJNDAEL(d_key)/D_KEY/g +*/ + +#include <asm/byteorder.h> +#include <linux/bitops.h> +#include <linux/crypto.h> +#include <linux/errno.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/types.h> + +#define AES_MIN_KEY_SIZE 16 +#define AES_MAX_KEY_SIZE 32 + +#define AES_BLOCK_SIZE 16 + +/* + * #define byte(x, nr) ((unsigned char)((x) >> (nr*8))) + */ +static inline u8 byte(const u32 x, const unsigned n) +{ + return x >> (n << 3); +} + +#define u32_in(x) le32_to_cpu(*(const __le32 *)(x)) + +struct aes_ctx +{ + u32 key_length; + u32 E[60]; + u32 D[60]; +}; + +#define E_KEY ctx->E +#define D_KEY ctx->D + +static u8 pow_tab[256] __initdata; +static u8 log_tab[256] __initdata; +static u8 sbx_tab[256] __initdata; +static u8 isb_tab[256] __initdata; +static u32 rco_tab[10]; +u32 aes_ft_tab[4][256]; +u32 aes_it_tab[4][256]; + +u32 aes_fl_tab[4][256]; +u32 aes_il_tab[4][256]; + +static inline u8 f_mult(u8 a, u8 b) +{ + u8 aa = log_tab[a], cc = aa + log_tab[b]; + + return pow_tab[cc + (cc < aa ? 1 : 0)]; +} + +#define ff_mult(a, b) (a && b ? f_mult(a, b) : 0) + +#define ls_box(x) \ + (aes_fl_tab[0][byte(x, 0)] ^ \ + aes_fl_tab[1][byte(x, 1)] ^ \ + aes_fl_tab[2][byte(x, 2)] ^ \ + aes_fl_tab[3][byte(x, 3)]) + +static void __init gen_tabs(void) +{ + u32 i, t; + u8 p, q; + + /* log and power tables for GF(2**8) finite field with + 0x011b as modular polynomial - the simplest primitive + root is 0x03, used here to generate the tables */ + + for (i = 0, p = 1; i < 256; ++i) { + pow_tab[i] = (u8)p; + log_tab[p] = (u8)i; + + p ^= (p << 1) ^ (p & 0x80 ? 0x01b : 0); + } + + log_tab[1] = 0; + + for (i = 0, p = 1; i < 10; ++i) { + rco_tab[i] = p; + + p = (p << 1) ^ (p & 0x80 ? 0x01b : 0); + } + + for (i = 0; i < 256; ++i) { + p = (i ? 
pow_tab[255 - log_tab[i]] : 0); + q = ((p >> 7) | (p << 1)) ^ ((p >> 6) | (p << 2)); + p ^= 0x63 ^ q ^ ((q >> 6) | (q << 2)); + sbx_tab[i] = p; + isb_tab[p] = (u8)i; + } + + for (i = 0; i < 256; ++i) { + p = sbx_tab[i]; + + t = p; + aes_fl_tab[0][i] = t; + aes_fl_tab[1][i] = rol32(t, 8); + aes_fl_tab[2][i] = rol32(t, 16); + aes_fl_tab[3][i] = rol32(t, 24); + + t = ((u32)ff_mult(2, p)) | + ((u32)p << 8) | + ((u32)p << 16) | ((u32)ff_mult(3, p) << 24); + + aes_ft_tab[0][i] = t; + aes_ft_tab[1][i] = rol32(t, 8); + aes_ft_tab[2][i] = rol32(t, 16); + aes_ft_tab[3][i] = rol32(t, 24); + + p = isb_tab[i]; + + t = p; + aes_il_tab[0][i] = t; + aes_il_tab[1][i] = rol32(t, 8); + aes_il_tab[2][i] = rol32(t, 16); + aes_il_tab[3][i] = rol32(t, 24); + + t = ((u32)ff_mult(14, p)) | + ((u32)ff_mult(9, p) << 8) | + ((u32)ff_mult(13, p) << 16) | + ((u32)ff_mult(11, p) << 24); + + aes_it_tab[0][i] = t; + aes_it_tab[1][i] = rol32(t, 8); + aes_it_tab[2][i] = rol32(t, 16); + aes_it_tab[3][i] = rol32(t, 24); + } +} + +#define star_x(x) (((x) & 0x7f7f7f7f) << 1) ^ ((((x) & 0x80808080) >> 7) * 0x1b) + +#define imix_col(y, x) \ + u = star_x(x); \ + v = star_x(u); \ + w = star_x(v); \ + t = w ^ (x); \ + (y) = u ^ v ^ w; \ + (y) ^= ror32(u ^ t, 8) ^ \ + ror32(v ^ t, 16) ^ \ + ror32(t, 24) + +/* initialise the key schedule from the user supplied key */ + +#define loop4(i) \ +{ \ + t = ror32(t, 8); t = ls_box(t) ^ rco_tab[i]; \ + t ^= E_KEY[4 * i]; E_KEY[4 * i + 4] = t; \ + t ^= E_KEY[4 * i + 1]; E_KEY[4 * i + 5] = t; \ + t ^= E_KEY[4 * i + 2]; E_KEY[4 * i + 6] = t; \ + t ^= E_KEY[4 * i + 3]; E_KEY[4 * i + 7] = t; \ +} + +#define loop6(i) \ +{ \ + t = ror32(t, 8); t = ls_box(t) ^ rco_tab[i]; \ + t ^= E_KEY[6 * i]; E_KEY[6 * i + 6] = t; \ + t ^= E_KEY[6 * i + 1]; E_KEY[6 * i + 7] = t; \ + t ^= E_KEY[6 * i + 2]; E_KEY[6 * i + 8] = t; \ + t ^= E_KEY[6 * i + 3]; E_KEY[6 * i + 9] = t; \ + t ^= E_KEY[6 * i + 4]; E_KEY[6 * i + 10] = t; \ + t ^= E_KEY[6 * i + 5]; E_KEY[6 * i + 11] = t; \ +} + +#define 
loop8(i) \ +{ \ + t = ror32(t, 8); ; t = ls_box(t) ^ rco_tab[i]; \ + t ^= E_KEY[8 * i]; E_KEY[8 * i + 8] = t; \ + t ^= E_KEY[8 * i + 1]; E_KEY[8 * i + 9] = t; \ + t ^= E_KEY[8 * i + 2]; E_KEY[8 * i + 10] = t; \ + t ^= E_KEY[8 * i + 3]; E_KEY[8 * i + 11] = t; \ + t = E_KEY[8 * i + 4] ^ ls_box(t); \ + E_KEY[8 * i + 12] = t; \ + t ^= E_KEY[8 * i + 5]; E_KEY[8 * i + 13] = t; \ + t ^= E_KEY[8 * i + 6]; E_KEY[8 * i + 14] = t; \ + t ^= E_KEY[8 * i + 7]; E_KEY[8 * i + 15] = t; \ +} + +static int aes_set_key(void *ctx_arg, const u8 *in_key, unsigned int key_len, + u32 *flags) +{ + struct aes_ctx *ctx = ctx_arg; + u32 i, j, t, u, v, w; + + if (key_len != 16 && key_len != 24 && key_len != 32) { + *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; + return -EINVAL; + } + + ctx->key_length = key_len; + + D_KEY[key_len + 24] = E_KEY[0] = u32_in(in_key); + D_KEY[key_len + 25] = E_KEY[1] = u32_in(in_key + 4); + D_KEY[key_len + 26] = E_KEY[2] = u32_in(in_key + 8); + D_KEY[key_len + 27] = E_KEY[3] = u32_in(in_key + 12); + + switch (key_len) { + case 16: + t = E_KEY[3]; + for (i = 0; i < 10; ++i) + loop4(i); + break; + + case 24: + E_KEY[4] = u32_in(in_key + 16); + t = E_KEY[5] = u32_in(in_key + 20); + for (i = 0; i < 8; ++i) + loop6 (i); + break; + + case 32: + E_KEY[4] = u32_in(in_key + 16); + E_KEY[5] = u32_in(in_key + 20); + E_KEY[6] = u32_in(in_key + 24); + t = E_KEY[7] = u32_in(in_key + 28); + for (i = 0; i < 7; ++i) + loop8(i); + break; + } + + D_KEY[0] = E_KEY[key_len + 24]; + D_KEY[1] = E_KEY[key_len + 25]; + D_KEY[2] = E_KEY[key_len + 26]; + D_KEY[3] = E_KEY[key_len + 27]; + + for (i = 4; i < key_len + 24; ++i) { + j = key_len + 24 - (i & ~3) + (i & 3); + imix_col(D_KEY[j], E_KEY[i]); + } + + return 0; +} + +extern void aes_encrypt(void *ctx_arg, u8 *out, const u8 *in); +extern void aes_decrypt(void *ctx_arg, u8 *out, const u8 *in); + +static struct crypto_alg aes_alg = { + .cra_name = "aes", + .cra_flags = CRYPTO_ALG_TYPE_CIPHER, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = 
sizeof(struct aes_ctx), + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(aes_alg.cra_list), + .cra_u = { + .cipher = { + .cia_min_keysize = AES_MIN_KEY_SIZE, + .cia_max_keysize = AES_MAX_KEY_SIZE, + .cia_setkey = aes_set_key, + .cia_encrypt = aes_encrypt, + .cia_decrypt = aes_decrypt + } + } +}; + +static int __init aes_init(void) +{ + gen_tabs(); + return crypto_register_alg(&aes_alg); +} + +static void __exit aes_fini(void) +{ + crypto_unregister_alg(&aes_alg); +} + +module_init(aes_init); +module_exit(aes_fini); + +MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm"); +MODULE_LICENSE("GPL"); diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S index f3ca0db85b5..cc935427d53 100644 --- a/arch/x86_64/ia32/ia32entry.S +++ b/arch/x86_64/ia32/ia32entry.S @@ -589,7 +589,7 @@ ia32_sys_call_table: .quad compat_sys_mq_timedreceive /* 280 */ .quad compat_sys_mq_notify .quad compat_sys_mq_getsetattr - .quad quiet_ni_syscall /* reserved for kexec */ + .quad compat_sys_kexec_load /* reserved for kexec */ .quad compat_sys_waitid .quad quiet_ni_syscall /* sys_altroot */ .quad sys_add_key diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile index 5ca4a4598fd..48f9e2c19cd 100644 --- a/arch/x86_64/kernel/Makefile +++ b/arch/x86_64/kernel/Makefile @@ -20,6 +20,7 @@ obj-$(CONFIG_SMP) += smp.o smpboot.o trampoline.o obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o mpparse.o \ genapic.o genapic_cluster.o genapic_flat.o +obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o crash.o obj-$(CONFIG_PM) += suspend.o obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend_asm.o obj-$(CONFIG_CPU_FREQ) += cpufreq/ diff --git a/arch/x86_64/kernel/acpi/wakeup.S b/arch/x86_64/kernel/acpi/wakeup.S index a4c630034cd..185faa911db 100644 --- a/arch/x86_64/kernel/acpi/wakeup.S +++ b/arch/x86_64/kernel/acpi/wakeup.S @@ -67,7 +67,7 @@ wakeup_code: shll $4, %eax addl $(gdta - wakeup_code), %eax movl %eax, gdt_48a +2 - wakeup_code - 
lgdt %ds:gdt_48a - wakeup_code # load gdt with whatever is + lgdtl %ds:gdt_48a - wakeup_code # load gdt with whatever is # appropriate movl $1, %eax # protected mode (PE) bit diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c index f8e6cc4fecd..375d369570c 100644 --- a/arch/x86_64/kernel/apic.c +++ b/arch/x86_64/kernel/apic.c @@ -133,7 +133,7 @@ void __init connect_bsp_APIC(void) } } -void disconnect_bsp_APIC(void) +void disconnect_bsp_APIC(int virt_wire_setup) { if (pic_mode) { /* @@ -146,6 +146,42 @@ void disconnect_bsp_APIC(void) outb(0x70, 0x22); outb(0x00, 0x23); } + else { + /* Go back to Virtual Wire compatibility mode */ + unsigned long value; + + /* For the spurious interrupt use vector F, and enable it */ + value = apic_read(APIC_SPIV); + value &= ~APIC_VECTOR_MASK; + value |= APIC_SPIV_APIC_ENABLED; + value |= 0xf; + apic_write_around(APIC_SPIV, value); + + if (!virt_wire_setup) { + /* For LVT0 make it edge triggered, active high, external and enabled */ + value = apic_read(APIC_LVT0); + value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | + APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | + APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED ); + value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; + value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); + apic_write_around(APIC_LVT0, value); + } + else { + /* Disable LVT0 */ + apic_write_around(APIC_LVT0, APIC_LVT_MASKED); + } + + /* For LVT1 make it edge triggered, active high, nmi and enabled */ + value = apic_read(APIC_LVT1); + value &= ~( + APIC_MODE_MASK | APIC_SEND_PENDING | + APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | + APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); + value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; + value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); + apic_write_around(APIC_LVT1, value); + } } void disable_local_APIC(void) @@ -285,7 +321,7 @@ void __init init_bsp_APIC(void) apic_write_around(APIC_LVT1, value); } -void __init setup_local_APIC (void) +void __cpuinit setup_local_APIC 
(void) { unsigned int value, ver, maxlvt; @@ -534,7 +570,7 @@ static struct sys_device device_lapic = { .cls = &lapic_sysclass, }; -static void __init apic_pm_activate(void) +static void __cpuinit apic_pm_activate(void) { apic_pm_state.active = 1; } @@ -774,14 +810,14 @@ void __init setup_boot_APIC_clock (void) local_irq_enable(); } -void __init setup_secondary_APIC_clock(void) +void __cpuinit setup_secondary_APIC_clock(void) { local_irq_disable(); /* FIXME: Do we need this? --RR */ setup_APIC_timer(calibration_result); local_irq_enable(); } -void __init disable_APIC_timer(void) +void __cpuinit disable_APIC_timer(void) { if (using_apic_timer) { unsigned long v; diff --git a/arch/x86_64/kernel/crash.c b/arch/x86_64/kernel/crash.c new file mode 100644 index 00000000000..d7fa4248501 --- /dev/null +++ b/arch/x86_64/kernel/crash.c @@ -0,0 +1,35 @@ +/* + * Architecture specific (x86_64) functions for kexec based crash dumps. + * + * Created by: Hariprasad Nellitheertha (hari@in.ibm.com) + * + * Copyright (C) IBM Corporation, 2004. All rights reserved. + * + */ + +#include <linux/init.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/smp.h> +#include <linux/irq.h> +#include <linux/reboot.h> +#include <linux/kexec.h> + +#include <asm/processor.h> +#include <asm/hardirq.h> +#include <asm/nmi.h> +#include <asm/hw_irq.h> + +note_buf_t crash_notes[NR_CPUS]; + +void machine_crash_shutdown(struct pt_regs *regs) +{ + /* This function is only called after the system + * has paniced or is otherwise in a critical state. + * The minimum amount of code to allow a kexec'd kernel + * to run successfully needs to happen here. + * + * In practice this means shooting down the other cpus in + * an SMP system. 
+ */ +} diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c index 7c154dfff64..6ded3a50dfe 100644 --- a/arch/x86_64/kernel/e820.c +++ b/arch/x86_64/kernel/e820.c @@ -16,6 +16,7 @@ #include <linux/bootmem.h> #include <linux/ioport.h> #include <linux/string.h> +#include <linux/kexec.h> #include <asm/page.h> #include <asm/e820.h> #include <asm/proto.h> @@ -191,8 +192,6 @@ void __init e820_reserve_resources(void) int i; for (i = 0; i < e820.nr_map; i++) { struct resource *res; - if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL) - continue; res = alloc_bootmem_low(sizeof(struct resource)); switch (e820.map[i].type) { case E820_RAM: res->name = "System RAM"; break; @@ -212,6 +211,9 @@ void __init e820_reserve_resources(void) */ request_resource(res, &code_resource); request_resource(res, &data_resource); +#ifdef CONFIG_KEXEC + request_resource(res, &crashk_res); +#endif } } } diff --git a/arch/x86_64/kernel/genapic_flat.c b/arch/x86_64/kernel/genapic_flat.c index b4cbbad0422..28284696508 100644 --- a/arch/x86_64/kernel/genapic_flat.c +++ b/arch/x86_64/kernel/genapic_flat.c @@ -7,6 +7,8 @@ * Hacked for x86-64 by James Cleverdon from i386 architecture code by * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and * James Cleverdon. + * Ashok Raj <ashok.raj@intel.com> + * Removed IPI broadcast shortcut to support CPU hotplug */ #include <linux/config.h> #include <linux/threads.h> @@ -18,6 +20,46 @@ #include <asm/smp.h> #include <asm/ipi.h> +/* + * The following permit choosing broadcast IPI shortcut v.s sending IPI only + * to online cpus via the send_IPI_mask variant. + * The mask version is my preferred option, since it eliminates a lot of + * other extra code that would need to be written to cleanup intrs sent + * to a CPU while offline. + * + * Sending broadcast introduces lots of trouble in CPU hotplug situations. 
+ * These IPI's are delivered to cpu's irrespective of their offline status + * and could pick up stale intr data when these CPUS are turned online. + * + * Not using broadcast is a cleaner approach IMO, but Andi Kleen disagrees with + * the idea of not using broadcast IPI's anymore. Hence the run time check + * is introduced, on his request so we can choose an alternate mechanism. + * + * Initial wacky performance tests that collect cycle counts show + * no increase in using mask v.s broadcast version. In fact they seem + * identical in terms of cycle counts. + * + * if we need to use broadcast, we need to do the following. + * + * cli; + * hold call_lock; + * clear any pending IPI, just ack and clear all pending intr + * set cpu_online_map; + * release call_lock; + * sti; + * + * The complicated dummy irq processing shown above is not required if + * we didn't send IPI's to wrong CPU's in the first place. + * + * - Ashok Raj <ashok.raj@intel.com> + */ +#ifdef CONFIG_HOTPLUG_CPU +#define DEFAULT_SEND_IPI (1) +#else +#define DEFAULT_SEND_IPI (0) +#endif + +static int no_broadcast=DEFAULT_SEND_IPI; static cpumask_t flat_target_cpus(void) { @@ -45,22 +87,6 @@ static void flat_init_apic_ldr(void) apic_write_around(APIC_LDR, val); } -static void flat_send_IPI_allbutself(int vector) -{ - /* - * if there are no other CPUs in the system then - * we get an APIC send error if we try to broadcast. - * thus we have to avoid sending IPIs in this case. 
- */ - if (num_online_cpus() > 1) - __send_IPI_shortcut(APIC_DEST_ALLBUT, vector, APIC_DEST_LOGICAL); -} - -static void flat_send_IPI_all(int vector) -{ - __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL); -} - static void flat_send_IPI_mask(cpumask_t cpumask, int vector) { unsigned long mask = cpus_addr(cpumask)[0]; @@ -93,6 +119,39 @@ static void flat_send_IPI_mask(cpumask_t cpumask, int vector) local_irq_restore(flags); } +static inline void __local_flat_send_IPI_allbutself(int vector) +{ + if (no_broadcast) { + cpumask_t mask = cpu_online_map; + int this_cpu = get_cpu(); + + cpu_clear(this_cpu, mask); + flat_send_IPI_mask(mask, vector); + put_cpu(); + } + else + __send_IPI_shortcut(APIC_DEST_ALLBUT, vector, APIC_DEST_LOGICAL); +} + +static inline void __local_flat_send_IPI_all(int vector) +{ + if (no_broadcast) + flat_send_IPI_mask(cpu_online_map, vector); + else + __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL); +} + +static void flat_send_IPI_allbutself(int vector) +{ + if (((num_online_cpus()) - 1) >= 1) + __local_flat_send_IPI_allbutself(vector); +} + +static void flat_send_IPI_all(int vector) +{ + __local_flat_send_IPI_all(vector); +} + static int flat_apic_id_registered(void) { return physid_isset(GET_APIC_ID(apic_read(APIC_ID)), phys_cpu_present_map); @@ -111,6 +170,16 @@ static unsigned int phys_pkg_id(int index_msb) return ((ebx >> 24) & 0xFF) >> index_msb; } +static __init int no_ipi_broadcast(char *str) +{ + get_option(&str, &no_broadcast); + printk ("Using %s mode\n", no_broadcast ? "No IPI Broadcast" : + "IPI Broadcast"); + return 1; +} + +__setup("no_ipi_broadcast", no_ipi_broadcast); + struct genapic apic_flat = { .name = "flat", .int_delivery_mode = dest_LowestPrio, @@ -125,3 +194,12 @@ struct genapic apic_flat = { .cpu_mask_to_apicid = flat_cpu_mask_to_apicid, .phys_pkg_id = phys_pkg_id, }; + +static int __init print_ipi_mode(void) +{ + printk ("Using IPI %s mode\n", no_broadcast ? 
"No-Shortcut" : + "Shortcut"); + return 0; +} + +late_initcall(print_ipi_mode); diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S index 9bd2e7a4b81..8d765aa77a2 100644 --- a/arch/x86_64/kernel/head.S +++ b/arch/x86_64/kernel/head.S @@ -248,23 +248,23 @@ ENTRY(_stext) */ .org 0x1000 ENTRY(init_level4_pgt) - .quad 0x0000000000102007 /* -> level3_ident_pgt */ + .quad 0x0000000000002007 + __PHYSICAL_START /* -> level3_ident_pgt */ .fill 255,8,0 - .quad 0x000000000010a007 + .quad 0x000000000000a007 + __PHYSICAL_START .fill 254,8,0 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ - .quad 0x0000000000103007 /* -> level3_kernel_pgt */ + .quad 0x0000000000003007 + __PHYSICAL_START /* -> level3_kernel_pgt */ .org 0x2000 ENTRY(level3_ident_pgt) - .quad 0x0000000000104007 + .quad 0x0000000000004007 + __PHYSICAL_START .fill 511,8,0 .org 0x3000 ENTRY(level3_kernel_pgt) .fill 510,8,0 /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */ - .quad 0x0000000000105007 /* -> level2_kernel_pgt */ + .quad 0x0000000000005007 + __PHYSICAL_START /* -> level2_kernel_pgt */ .fill 1,8,0 .org 0x4000 @@ -337,17 +337,17 @@ ENTRY(empty_bad_pmd_table) .org 0xa000 ENTRY(level3_physmem_pgt) - .quad 0x0000000000105007 /* -> level2_kernel_pgt (so that __va works even before pagetable_init) */ + .quad 0x0000000000005007 + __PHYSICAL_START /* -> level2_kernel_pgt (so that __va works even before pagetable_init) */ .org 0xb000 #ifdef CONFIG_ACPI_SLEEP ENTRY(wakeup_level4_pgt) - .quad 0x0000000000102007 /* -> level3_ident_pgt */ + .quad 0x0000000000002007 + __PHYSICAL_START /* -> level3_ident_pgt */ .fill 255,8,0 - .quad 0x000000000010a007 + .quad 0x000000000000a007 + __PHYSICAL_START .fill 254,8,0 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ - .quad 0x0000000000103007 /* -> level3_kernel_pgt */ + .quad 0x0000000000003007 + __PHYSICAL_START /* -> level3_kernel_pgt */ #endif .data diff --git a/arch/x86_64/kernel/i387.c b/arch/x86_64/kernel/i387.c index ba139cac57c..d9b22b633e3 100644 --- 
a/arch/x86_64/kernel/i387.c +++ b/arch/x86_64/kernel/i387.c @@ -42,7 +42,7 @@ void mxcsr_feature_mask_init(void) * Called at bootup to set up the initial FPU state that is later cloned * into all processes. */ -void __init fpu_init(void) +void __cpuinit fpu_init(void) { unsigned long oldcr0 = read_cr0(); extern void __bad_fxsave_alignment(void); diff --git a/arch/x86_64/kernel/i8259.c b/arch/x86_64/kernel/i8259.c index 19eafa0aa95..a8916909512 100644 --- a/arch/x86_64/kernel/i8259.c +++ b/arch/x86_64/kernel/i8259.c @@ -414,10 +414,22 @@ static int i8259A_suspend(struct sys_device *dev, pm_message_t state) return 0; } +static int i8259A_shutdown(struct sys_device *dev) +{ + /* Put the i8259A into a quiescent state that + * the kernel initialization code can get it + * out of. + */ + outb(0xff, 0x21); /* mask all of 8259A-1 */ + outb(0xff, 0xA1); /* mask all of 8259A-2 */ + return 0; +} + static struct sysdev_class i8259_sysdev_class = { set_kset_name("i8259"), .suspend = i8259A_suspend, .resume = i8259A_resume, + .shutdown = i8259A_shutdown, }; static struct sys_device device_i8259A = { diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c index afd87e64d0a..d206d7e49cf 100644 --- a/arch/x86_64/kernel/io_apic.c +++ b/arch/x86_64/kernel/io_apic.c @@ -330,7 +330,7 @@ static int find_irq_entry(int apic, int pin, int type) /* * Find the pin to which IRQ[irq] (ISA) is connected */ -static int __init find_isa_irq_pin(int irq, int type) +static int find_isa_irq_pin(int irq, int type) { int i; @@ -1064,7 +1064,6 @@ void print_all_local_APICs (void) void __apicdebuginit print_PIC(void) { - extern spinlock_t i8259A_lock; unsigned int v; unsigned long flags; @@ -1132,12 +1131,44 @@ static void __init enable_IO_APIC(void) */ void disable_IO_APIC(void) { + int pin; /* * Clear the IO-APIC before rebooting: */ clear_IO_APIC(); - disconnect_bsp_APIC(); + /* + * If the i8259 is routed through an IOAPIC + * Put that IOAPIC in virtual wire mode + * so legacy 
interrupts can be delivered. + */ + pin = find_isa_irq_pin(0, mp_ExtINT); + if (pin != -1) { + struct IO_APIC_route_entry entry; + unsigned long flags; + + memset(&entry, 0, sizeof(entry)); + entry.mask = 0; /* Enabled */ + entry.trigger = 0; /* Edge */ + entry.irr = 0; + entry.polarity = 0; /* High */ + entry.delivery_status = 0; + entry.dest_mode = 0; /* Physical */ + entry.delivery_mode = 7; /* ExtInt */ + entry.vector = 0; + entry.dest.physical.physical_dest = 0; + + + /* + * Add it to the IO-APIC irq-routing table: + */ + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(0, 0x11+2*pin, *(((int *)&entry)+1)); + io_apic_write(0, 0x10+2*pin, *(((int *)&entry)+0)); + spin_unlock_irqrestore(&ioapic_lock, flags); + } + + disconnect_bsp_APIC(pin != -1); } /* diff --git a/arch/x86_64/kernel/irq.c b/arch/x86_64/kernel/irq.c index 62b112e4deb..cc3fb85f514 100644 --- a/arch/x86_64/kernel/irq.c +++ b/arch/x86_64/kernel/irq.c @@ -14,6 +14,7 @@ #include <linux/interrupt.h> #include <linux/seq_file.h> #include <linux/module.h> +#include <linux/delay.h> #include <asm/uaccess.h> #include <asm/io_apic.h> @@ -106,3 +107,31 @@ asmlinkage unsigned int do_IRQ(struct pt_regs *regs) return 1; } +#ifdef CONFIG_HOTPLUG_CPU +void fixup_irqs(cpumask_t map) +{ + unsigned int irq; + static int warned; + + for (irq = 0; irq < NR_IRQS; irq++) { + cpumask_t mask; + if (irq == 2) + continue; + + cpus_and(mask, irq_affinity[irq], map); + if (any_online_cpu(mask) == NR_CPUS) { + printk("Breaking affinity for irq %i\n", irq); + mask = map; + } + if (irq_desc[irq].handler->set_affinity) + irq_desc[irq].handler->set_affinity(irq, mask); + else if (irq_desc[irq].action && !(warned++)) + printk("Cannot set affinity for irq %i\n", irq); + } + + /* That doesn't seem sufficient. Give it 1ms. 
*/ + local_irq_enable(); + mdelay(1); + local_irq_disable(); +} +#endif diff --git a/arch/x86_64/kernel/kprobes.c b/arch/x86_64/kernel/kprobes.c index 4e680f87a75..5c6dc705148 100644 --- a/arch/x86_64/kernel/kprobes.c +++ b/arch/x86_64/kernel/kprobes.c @@ -38,7 +38,7 @@ #include <linux/string.h> #include <linux/slab.h> #include <linux/preempt.h> -#include <linux/moduleloader.h> + #include <asm/cacheflush.h> #include <asm/pgtable.h> #include <asm/kdebug.h> @@ -51,8 +51,6 @@ static struct kprobe *kprobe_prev; static unsigned long kprobe_status_prev, kprobe_old_rflags_prev, kprobe_saved_rflags_prev; static struct pt_regs jprobe_saved_regs; static long *jprobe_saved_rsp; -static kprobe_opcode_t *get_insn_slot(void); -static void free_insn_slot(kprobe_opcode_t *slot); void jprobe_return_end(void); /* copy of the kernel stack at the probe fire time */ @@ -274,48 +272,23 @@ static void prepare_singlestep(struct kprobe *p, struct pt_regs *regs) regs->rip = (unsigned long)p->ainsn.insn; } -struct task_struct *arch_get_kprobe_task(void *ptr) -{ - return ((struct thread_info *) (((unsigned long) ptr) & - (~(THREAD_SIZE -1))))->task; -} - void arch_prepare_kretprobe(struct kretprobe *rp, struct pt_regs *regs) { unsigned long *sara = (unsigned long *)regs->rsp; - struct kretprobe_instance *ri; - static void *orig_ret_addr; + struct kretprobe_instance *ri; + + if ((ri = get_free_rp_inst(rp)) != NULL) { + ri->rp = rp; + ri->task = current; + ri->ret_addr = (kprobe_opcode_t *) *sara; - /* - * Save the return address when the return probe hits - * the first time, and use it to populate the (krprobe - * instance)->ret_addr for subsequent return probes at - * the same addrress since stack address would have - * the kretprobe_trampoline by then. 
- */ - if (((void*) *sara) != kretprobe_trampoline) - orig_ret_addr = (void*) *sara; - - if ((ri = get_free_rp_inst(rp)) != NULL) { - ri->rp = rp; - ri->stack_addr = sara; - ri->ret_addr = orig_ret_addr; - add_rp_inst(ri); /* Replace the return addr with trampoline addr */ *sara = (unsigned long) &kretprobe_trampoline; - } else { - rp->nmissed++; - } -} -void arch_kprobe_flush_task(struct task_struct *tk) -{ - struct kretprobe_instance *ri; - while ((ri = get_rp_inst_tsk(tk)) != NULL) { - *((unsigned long *)(ri->stack_addr)) = - (unsigned long) ri->ret_addr; - recycle_rp_inst(ri); - } + add_rp_inst(ri); + } else { + rp->nmissed++; + } } /* @@ -428,36 +401,59 @@ no_kprobe: */ int trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) { - struct task_struct *tsk; - struct kretprobe_instance *ri; - struct hlist_head *head; - struct hlist_node *node; - unsigned long *sara = (unsigned long *)regs->rsp - 1; - - tsk = arch_get_kprobe_task(sara); - head = kretprobe_inst_table_head(tsk); - - hlist_for_each_entry(ri, node, head, hlist) { - if (ri->stack_addr == sara && ri->rp) { - if (ri->rp->handler) - ri->rp->handler(ri, regs); - } - } - return 0; -} + struct kretprobe_instance *ri = NULL; + struct hlist_head *head; + struct hlist_node *node, *tmp; + unsigned long orig_ret_address = 0; + unsigned long trampoline_address =(unsigned long)&kretprobe_trampoline; -void trampoline_post_handler(struct kprobe *p, struct pt_regs *regs, - unsigned long flags) -{ - struct kretprobe_instance *ri; - /* RA already popped */ - unsigned long *sara = ((unsigned long *)regs->rsp) - 1; + head = kretprobe_inst_table_head(current); - while ((ri = get_rp_inst(sara))) { - regs->rip = (unsigned long)ri->ret_addr; + /* + * It is possible to have multiple instances associated with a given + * task either because multiple functions in the call path + * have a return probe installed on them, and/or more than one + * return probe was registered for a target function. 
+ * + * We can handle this because: + * - instances are always inserted at the head of the list + * - when multiple return probes are registered for the same + * function, the first instance's ret_addr will point to the + * real return address, and all the rest will point to + * kretprobe_trampoline + */ + hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { + if (ri->task != current) + /* another task is sharing our hash bucket */ + continue; + + if (ri->rp && ri->rp->handler) + ri->rp->handler(ri, regs); + + orig_ret_address = (unsigned long)ri->ret_addr; recycle_rp_inst(ri); + + if (orig_ret_address != trampoline_address) + /* + * This is the real return address. Any other + * instances associated with this task are for + * other calls deeper on the call stack + */ + break; } - regs->eflags &= ~TF_MASK; + + BUG_ON(!orig_ret_address || (orig_ret_address == trampoline_address)); + regs->rip = orig_ret_address; + + unlock_kprobes(); + preempt_enable_no_resched(); + + /* + * By returning a non-zero value, we are telling + * kprobe_handler() that we have handled unlocking + * and re-enabling preemption. + */ + return 1; } /* @@ -550,8 +546,7 @@ int post_kprobe_handler(struct pt_regs *regs) current_kprobe->post_handler(current_kprobe, regs, 0); } - if (current_kprobe->post_handler != trampoline_post_handler) - resume_execution(current_kprobe, regs); + resume_execution(current_kprobe, regs); regs->eflags |= kprobe_saved_rflags; /* Restore the original saved kprobes variables and continue. */ @@ -682,111 +677,12 @@ int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) return 0; } -/* - * kprobe->ainsn.insn points to the copy of the instruction to be single-stepped. - * By default on x86_64, pages we get from kmalloc or vmalloc are not - * executable. Single-stepping an instruction on such a page yields an - * oops. 
So instead of storing the instruction copies in their respective - * kprobe objects, we allocate a page, map it executable, and store all the - * instruction copies there. (We can allocate additional pages if somebody - * inserts a huge number of probes.) Each page can hold up to INSNS_PER_PAGE - * instruction slots, each of which is MAX_INSN_SIZE*sizeof(kprobe_opcode_t) - * bytes. - */ -#define INSNS_PER_PAGE (PAGE_SIZE/(MAX_INSN_SIZE*sizeof(kprobe_opcode_t))) -struct kprobe_insn_page { - struct hlist_node hlist; - kprobe_opcode_t *insns; /* page of instruction slots */ - char slot_used[INSNS_PER_PAGE]; - int nused; +static struct kprobe trampoline_p = { + .addr = (kprobe_opcode_t *) &kretprobe_trampoline, + .pre_handler = trampoline_probe_handler }; -static struct hlist_head kprobe_insn_pages; - -/** - * get_insn_slot() - Find a slot on an executable page for an instruction. - * We allocate an executable page if there's no room on existing ones. - */ -static kprobe_opcode_t *get_insn_slot(void) -{ - struct kprobe_insn_page *kip; - struct hlist_node *pos; - - hlist_for_each(pos, &kprobe_insn_pages) { - kip = hlist_entry(pos, struct kprobe_insn_page, hlist); - if (kip->nused < INSNS_PER_PAGE) { - int i; - for (i = 0; i < INSNS_PER_PAGE; i++) { - if (!kip->slot_used[i]) { - kip->slot_used[i] = 1; - kip->nused++; - return kip->insns + (i*MAX_INSN_SIZE); - } - } - /* Surprise! No unused slots. Fix kip->nused. */ - kip->nused = INSNS_PER_PAGE; - } - } - - /* All out of space. Need to allocate a new page. Use slot 0.*/ - kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL); - if (!kip) { - return NULL; - } - - /* - * For the %rip-relative displacement fixups to be doable, we - * need our instruction copy to be within +/- 2GB of any data it - * might access via %rip. That is, within 2GB of where the - * kernel image and loaded module images reside. So we allocate - * a page in the module loading area. 
- */ - kip->insns = module_alloc(PAGE_SIZE); - if (!kip->insns) { - kfree(kip); - return NULL; - } - INIT_HLIST_NODE(&kip->hlist); - hlist_add_head(&kip->hlist, &kprobe_insn_pages); - memset(kip->slot_used, 0, INSNS_PER_PAGE); - kip->slot_used[0] = 1; - kip->nused = 1; - return kip->insns; -} - -/** - * free_insn_slot() - Free instruction slot obtained from get_insn_slot(). - */ -static void free_insn_slot(kprobe_opcode_t *slot) +int __init arch_init_kprobes(void) { - struct kprobe_insn_page *kip; - struct hlist_node *pos; - - hlist_for_each(pos, &kprobe_insn_pages) { - kip = hlist_entry(pos, struct kprobe_insn_page, hlist); - if (kip->insns <= slot - && slot < kip->insns+(INSNS_PER_PAGE*MAX_INSN_SIZE)) { - int i = (slot - kip->insns) / MAX_INSN_SIZE; - kip->slot_used[i] = 0; - kip->nused--; - if (kip->nused == 0) { - /* - * Page is no longer in use. Free it unless - * it's the last one. We keep the last one - * so as not to have to set it up again the - * next time somebody inserts a probe. - */ - hlist_del(&kip->hlist); - if (hlist_empty(&kprobe_insn_pages)) { - INIT_HLIST_NODE(&kip->hlist); - hlist_add_head(&kip->hlist, - &kprobe_insn_pages); - } else { - module_free(NULL, kip->insns); - kfree(kip); - } - } - return; - } - } + return register_kprobe(&trampoline_p); } diff --git a/arch/x86_64/kernel/machine_kexec.c b/arch/x86_64/kernel/machine_kexec.c new file mode 100644 index 00000000000..60d1eff4156 --- /dev/null +++ b/arch/x86_64/kernel/machine_kexec.c @@ -0,0 +1,250 @@ +/* + * machine_kexec.c - handle transition of Linux booting another kernel + * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com> + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. 
+ */ + +#include <linux/mm.h> +#include <linux/kexec.h> +#include <linux/delay.h> +#include <linux/string.h> +#include <linux/reboot.h> +#include <asm/pda.h> +#include <asm/pgtable.h> +#include <asm/pgalloc.h> +#include <asm/tlbflush.h> +#include <asm/mmu_context.h> +#include <asm/io.h> +#include <asm/apic.h> +#include <asm/cpufeature.h> +#include <asm/hw_irq.h> + +#define LEVEL0_SIZE (1UL << 12UL) +#define LEVEL1_SIZE (1UL << 21UL) +#define LEVEL2_SIZE (1UL << 30UL) +#define LEVEL3_SIZE (1UL << 39UL) +#define LEVEL4_SIZE (1UL << 48UL) + +#define L0_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) +#define L1_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE) +#define L2_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) +#define L3_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) + +static void init_level2_page(u64 *level2p, unsigned long addr) +{ + unsigned long end_addr; + + addr &= PAGE_MASK; + end_addr = addr + LEVEL2_SIZE; + while (addr < end_addr) { + *(level2p++) = addr | L1_ATTR; + addr += LEVEL1_SIZE; + } +} + +static int init_level3_page(struct kimage *image, u64 *level3p, + unsigned long addr, unsigned long last_addr) +{ + unsigned long end_addr; + int result; + + result = 0; + addr &= PAGE_MASK; + end_addr = addr + LEVEL3_SIZE; + while ((addr < last_addr) && (addr < end_addr)) { + struct page *page; + u64 *level2p; + + page = kimage_alloc_control_pages(image, 0); + if (!page) { + result = -ENOMEM; + goto out; + } + level2p = (u64 *)page_address(page); + init_level2_page(level2p, addr); + *(level3p++) = __pa(level2p) | L2_ATTR; + addr += LEVEL2_SIZE; + } + /* clear the unused entries */ + while (addr < end_addr) { + *(level3p++) = 0; + addr += LEVEL2_SIZE; + } +out: + return result; +} + + +static int init_level4_page(struct kimage *image, u64 *level4p, + unsigned long addr, unsigned long last_addr) +{ + unsigned long end_addr; + int result; + + result = 0; + addr &= PAGE_MASK; + end_addr = 
addr + LEVEL4_SIZE; + while ((addr < last_addr) && (addr < end_addr)) { + struct page *page; + u64 *level3p; + + page = kimage_alloc_control_pages(image, 0); + if (!page) { + result = -ENOMEM; + goto out; + } + level3p = (u64 *)page_address(page); + result = init_level3_page(image, level3p, addr, last_addr); + if (result) { + goto out; + } + *(level4p++) = __pa(level3p) | L3_ATTR; + addr += LEVEL3_SIZE; + } + /* clear the unused entries */ + while (addr < end_addr) { + *(level4p++) = 0; + addr += LEVEL3_SIZE; + } +out: + return result; +} + + +static int init_pgtable(struct kimage *image, unsigned long start_pgtable) +{ + u64 *level4p; + level4p = (u64 *)__va(start_pgtable); + return init_level4_page(image, level4p, 0, end_pfn << PAGE_SHIFT); +} + +static void set_idt(void *newidt, u16 limit) +{ + unsigned char curidt[10]; + + /* x86-64 supports unaligned loads & stores */ + (*(u16 *)(curidt)) = limit; + (*(u64 *)(curidt +2)) = (unsigned long)(newidt); + + __asm__ __volatile__ ( + "lidt %0\n" + : "=m" (curidt) + ); +}; + + +static void set_gdt(void *newgdt, u16 limit) +{ + unsigned char curgdt[10]; + + /* x86-64 supports unaligned loads & stores */ + (*(u16 *)(curgdt)) = limit; + (*(u64 *)(curgdt +2)) = (unsigned long)(newgdt); + + __asm__ __volatile__ ( + "lgdt %0\n" + : "=m" (curgdt) + ); +}; + +static void load_segments(void) +{ + __asm__ __volatile__ ( + "\tmovl $"STR(__KERNEL_DS)",%eax\n" + "\tmovl %eax,%ds\n" + "\tmovl %eax,%es\n" + "\tmovl %eax,%ss\n" + "\tmovl %eax,%fs\n" + "\tmovl %eax,%gs\n" + ); +#undef STR +#undef __STR +} + +typedef NORET_TYPE void (*relocate_new_kernel_t)(unsigned long indirection_page, + unsigned long control_code_buffer, + unsigned long start_address, + unsigned long pgtable) ATTRIB_NORET; + +const extern unsigned char relocate_new_kernel[]; +const extern unsigned long relocate_new_kernel_size; + +int machine_kexec_prepare(struct kimage *image) +{ + unsigned long start_pgtable, control_code_buffer; + int result; + + /* Calculate the 
offsets */ + start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT; + control_code_buffer = start_pgtable + 4096UL; + + /* Setup the identity mapped 64bit page table */ + result = init_pgtable(image, start_pgtable); + if (result) + return result; + + /* Place the code in the reboot code buffer */ + memcpy(__va(control_code_buffer), relocate_new_kernel, + relocate_new_kernel_size); + + return 0; +} + +void machine_kexec_cleanup(struct kimage *image) +{ + return; +} + +/* + * Do not allocate memory (or fail in any way) in machine_kexec(). + * We are past the point of no return, committed to rebooting now. + */ +NORET_TYPE void machine_kexec(struct kimage *image) +{ + unsigned long page_list; + unsigned long control_code_buffer; + unsigned long start_pgtable; + relocate_new_kernel_t rnk; + + /* Interrupts aren't acceptable while we reboot */ + local_irq_disable(); + + /* Calculate the offsets */ + page_list = image->head; + start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT; + control_code_buffer = start_pgtable + 4096UL; + + /* Set the low half of the page table to my identity mapped + * page table for kexec. Leave the high half pointing at the + * kernel pages. Don't bother to flush the global pages + * as that will happen when I fully switch to my identity mapped + * page table anyway. + */ + memcpy(__va(read_cr3()), __va(start_pgtable), PAGE_SIZE/2); + __flush_tlb(); + + + /* The segment registers are funny things, they are + * automatically loaded from a table, in memory wherever you + * set them to a specific selector, but this table is never + * accessed again unless you set the segment to a different selector. + * + * The more common model are caches where the behind + * the scenes work is done, but is also dropped at arbitrary + * times. + * + * I take advantage of this here by force loading the + * segments, before I zap the gdt with an invalid value. + */ + load_segments(); + /* The gdt & idt are now invalid. 
+ * If you want to load them you must set up your own idt & gdt. + */ + set_gdt(phys_to_virt(0),0); + set_idt(phys_to_virt(0),0); + /* now call it */ + rnk = (relocate_new_kernel_t) control_code_buffer; + (*rnk)(page_list, control_code_buffer, image->start, start_pgtable); +} diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c index 3a89d735a4f..21e70625a49 100644 --- a/arch/x86_64/kernel/mce.c +++ b/arch/x86_64/kernel/mce.c @@ -327,7 +327,7 @@ static void mce_init(void *dummy) } /* Add per CPU specific workarounds here */ -static void __init mce_cpu_quirks(struct cpuinfo_x86 *c) +static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c) { /* This should be disabled by the BIOS, but isn't always */ if (c->x86_vendor == X86_VENDOR_AMD && c->x86 == 15) { @@ -337,7 +337,7 @@ static void __init mce_cpu_quirks(struct cpuinfo_x86 *c) } } -static void __init mce_cpu_features(struct cpuinfo_x86 *c) +static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c) { switch (c->x86_vendor) { case X86_VENDOR_INTEL: @@ -352,7 +352,7 @@ static void __init mce_cpu_features(struct cpuinfo_x86 *c) * Called for each booted CPU to set up machine checks. * Must be called with preempt off. */ -void __init mcheck_init(struct cpuinfo_x86 *c) +void __cpuinit mcheck_init(struct cpuinfo_x86 *c) { static cpumask_t mce_cpus __initdata = CPU_MASK_NONE; @@ -411,7 +411,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff memset(mcelog.entry, 0, next * sizeof(struct mce)); mcelog.next = 0; - synchronize_kernel(); + synchronize_sched(); /* Collect entries that were still getting written before the synchronize. 
*/ @@ -542,7 +542,7 @@ ACCESSOR(bank4ctl,bank[4],mce_restart()) ACCESSOR(tolerant,tolerant,) ACCESSOR(check_interval,check_interval,mce_restart()) -static __init int mce_init_device(void) +static __cpuinit int mce_init_device(void) { int err; if (!mce_available(&boot_cpu_data)) diff --git a/arch/x86_64/kernel/mce_intel.c b/arch/x86_64/kernel/mce_intel.c index 4db9a640069..0be0a795981 100644 --- a/arch/x86_64/kernel/mce_intel.c +++ b/arch/x86_64/kernel/mce_intel.c @@ -42,7 +42,7 @@ done: irq_exit(); } -static void __init intel_init_thermal(struct cpuinfo_x86 *c) +static void __cpuinit intel_init_thermal(struct cpuinfo_x86 *c) { u32 l, h; int tm2 = 0; @@ -93,7 +93,7 @@ static void __init intel_init_thermal(struct cpuinfo_x86 *c) return; } -void __init mce_intel_feature_init(struct cpuinfo_x86 *c) +void __cpuinit mce_intel_feature_init(struct cpuinfo_x86 *c) { intel_init_thermal(c); } diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c index 31c0f2e6ac9..4e44d6e6b7e 100644 --- a/arch/x86_64/kernel/nmi.c +++ b/arch/x86_64/kernel/nmi.c @@ -98,7 +98,7 @@ static unsigned int nmi_p4_cccr_val; (P4_CCCR_OVF_PMI0|P4_CCCR_THRESHOLD(15)|P4_CCCR_COMPLEMENT| \ P4_CCCR_COMPARE|P4_CCCR_REQUIRED|P4_CCCR_ESCR_SELECT(4)|P4_CCCR_ENABLE) -static __init inline int nmi_known_cpu(void) +static __cpuinit inline int nmi_known_cpu(void) { switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_AMD: @@ -110,7 +110,7 @@ static __init inline int nmi_known_cpu(void) } /* Run after command line and cpu_init init, but before all other checks */ -void __init nmi_watchdog_default(void) +void __cpuinit nmi_watchdog_default(void) { if (nmi_watchdog != NMI_DEFAULT) return; diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c index e59d1f9d616..7577f9d7a75 100644 --- a/arch/x86_64/kernel/process.c +++ b/arch/x86_64/kernel/process.c @@ -8,7 +8,8 @@ * * X86-64 port * Andi Kleen. 
- * + * + * CPU hotplug support - ashok.raj@intel.com * $Id: process.c,v 1.38 2002/01/15 10:08:03 ak Exp $ */ @@ -18,6 +19,7 @@ #include <stdarg.h> +#include <linux/cpu.h> #include <linux/errno.h> #include <linux/sched.h> #include <linux/kernel.h> @@ -154,6 +156,29 @@ void cpu_idle_wait(void) } EXPORT_SYMBOL_GPL(cpu_idle_wait); +#ifdef CONFIG_HOTPLUG_CPU +DECLARE_PER_CPU(int, cpu_state); + +#include <asm/nmi.h> +/* We don't actually take CPU down, just spin without interrupts. */ +static inline void play_dead(void) +{ + idle_task_exit(); + wbinvd(); + mb(); + /* Ack it */ + __get_cpu_var(cpu_state) = CPU_DEAD; + + while (1) + safe_halt(); +} +#else +static inline void play_dead(void) +{ + BUG(); +} +#endif /* CONFIG_HOTPLUG_CPU */ + /* * The idle thread. There's no useful work to be * done, so just try to conserve power and have a @@ -174,6 +199,8 @@ void cpu_idle (void) idle = pm_idle; if (!idle) idle = default_idle; + if (cpu_is_offline(smp_processor_id())) + play_dead(); idle(); } @@ -204,7 +231,7 @@ static void mwait_idle(void) } } -void __init select_idle_routine(const struct cpuinfo_x86 *c) +void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) { static int printed; if (cpu_has(c, X86_FEATURE_MWAIT)) { @@ -455,6 +482,33 @@ out: } /* + * This function selects if the context switch from prev to next + * has to tweak the TSC disable bit in the cr4. + */ +static inline void disable_tsc(struct task_struct *prev_p, + struct task_struct *next_p) +{ + struct thread_info *prev, *next; + + /* + * gcc should eliminate the ->thread_info dereference if + * has_secure_computing returns 0 at compile time (SECCOMP=n). 
+ */ + prev = prev_p->thread_info; + next = next_p->thread_info; + + if (has_secure_computing(prev) || has_secure_computing(next)) { + /* slow path here */ + if (has_secure_computing(prev) && + !has_secure_computing(next)) { + write_cr4(read_cr4() & ~X86_CR4_TSD); + } else if (!has_secure_computing(prev) && + has_secure_computing(next)) + write_cr4(read_cr4() | X86_CR4_TSD); + } +} + +/* * This special macro can be used to load a debugging register */ #define loaddebug(thread,r) set_debug(thread->debugreg ## r, r) @@ -572,6 +626,8 @@ struct task_struct *__switch_to(struct task_struct *prev_p, struct task_struct * } } + disable_tsc(prev_p, next_p); + return prev_p; } diff --git a/arch/x86_64/kernel/reboot.c b/arch/x86_64/kernel/reboot.c index be4b36f762c..57e71dbdfd6 100644 --- a/arch/x86_64/kernel/reboot.c +++ b/arch/x86_64/kernel/reboot.c @@ -66,41 +66,47 @@ static int __init reboot_setup(char *str) __setup("reboot=", reboot_setup); -#ifdef CONFIG_SMP -static void smp_halt(void) +static inline void kb_wait(void) { - int cpuid = safe_smp_processor_id(); - static int first_entry = 1; + int i; - if (reboot_force) - return; + for (i=0; i<0x10000; i++) + if ((inb_p(0x64) & 0x02) == 0) + break; +} - if (first_entry) { - first_entry = 0; - smp_call_function((void *)machine_restart, NULL, 1, 0); - } - - smp_stop_cpu(); +void machine_shutdown(void) +{ + /* Stop the cpus and apics */ +#ifdef CONFIG_SMP + int reboot_cpu_id; - /* AP calling this. 
Just halt */ - if (cpuid != boot_cpu_id) { - for (;;) - asm("hlt"); + /* The boot cpu is always logical cpu 0 */ + reboot_cpu_id = 0; + + /* Make certain the cpu I'm about to reboot on is online */ + if (!cpu_isset(reboot_cpu_id, cpu_online_map)) { + reboot_cpu_id = smp_processor_id(); } - /* Wait for all other CPUs to have run smp_stop_cpu */ - while (!cpus_empty(cpu_online_map)) - rep_nop(); -} + /* Make certain I only run on the appropriate processor */ + set_cpus_allowed(current, cpumask_of_cpu(reboot_cpu_id)); + + /* O.K Now that I'm on the appropriate processor, + * stop all of the others. + */ + smp_send_stop(); #endif -static inline void kb_wait(void) -{ - int i; + local_irq_disable(); - for (i=0; i<0x10000; i++) - if ((inb_p(0x64) & 0x02) == 0) - break; +#ifndef CONFIG_SMP + disable_local_APIC(); +#endif + + disable_IO_APIC(); + + local_irq_enable(); } void machine_restart(char * __unused) @@ -109,9 +115,7 @@ void machine_restart(char * __unused) printk("machine restart\n"); -#ifdef CONFIG_SMP - smp_halt(); -#endif + machine_shutdown(); if (!reboot_force) { local_irq_disable(); diff --git a/arch/x86_64/kernel/relocate_kernel.S b/arch/x86_64/kernel/relocate_kernel.S new file mode 100644 index 00000000000..d24fa9b72a2 --- /dev/null +++ b/arch/x86_64/kernel/relocate_kernel.S @@ -0,0 +1,143 @@ +/* + * relocate_kernel.S - put the kernel image in place to boot + * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com> + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + +#include <linux/linkage.h> + + /* + * Must be relocatable PIC code callable as a C function, that once + * it starts can not use the previous processes stack. 
+ */ + .globl relocate_new_kernel + .code64 +relocate_new_kernel: + /* %rdi page_list + * %rsi reboot_code_buffer + * %rdx start address + * %rcx page_table + * %r8 arg5 + * %r9 arg6 + */ + + /* zero out flags, and disable interrupts */ + pushq $0 + popfq + + /* set a new stack at the bottom of our page... */ + lea 4096(%rsi), %rsp + + /* store the parameters back on the stack */ + pushq %rdx /* store the start address */ + + /* Set cr0 to a known state: + * 31 1 == Paging enabled + * 18 0 == Alignment check disabled + * 16 0 == Write protect disabled + * 3 0 == No task switch + * 2 0 == Don't do FP software emulation. + * 0 1 == Protected mode enabled + */ + movq %cr0, %rax + andq $~((1<<18)|(1<<16)|(1<<3)|(1<<2)), %rax + orl $((1<<31)|(1<<0)), %eax + movq %rax, %cr0 + + /* Set cr4 to a known state: + * 10 0 == xmm exceptions disabled + * 9 0 == xmm registers instructions disabled + * 8 0 == performance monitoring counter disabled + * 7 0 == page global disabled + * 6 0 == machine check exceptions disabled + * 5 1 == physical address extension enabled + * 4 0 == page size extensions disabled + * 3 0 == Debug extensions disabled + * 2 0 == Time stamp disable (disabled) + * 1 0 == Protected mode virtual interrupts disabled + * 0 0 == VME disabled + */ + + movq $((1<<5)), %rax + movq %rax, %cr4 + + jmp 1f +1: + + /* Switch to the identity mapped page tables, + * and flush the TLB. + */ + movq %rcx, %cr3 + + /* Do the copies */ + movq %rdi, %rcx /* Put the page_list in %rcx */ + xorq %rdi, %rdi + xorq %rsi, %rsi + jmp 1f + +0: /* top, read another word for the indirection page */ + + movq (%rbx), %rcx + addq $8, %rbx +1: + testq $0x1, %rcx /* is it a destination page? */ + jz 2f + movq %rcx, %rdi + andq $0xfffffffffffff000, %rdi + jmp 0b +2: + testq $0x2, %rcx /* is it an indirection page? */ + jz 2f + movq %rcx, %rbx + andq $0xfffffffffffff000, %rbx + jmp 0b +2: + testq $0x4, %rcx /* is it the done indicator? 
*/ + jz 2f + jmp 3f +2: + testq $0x8, %rcx /* is it the source indicator? */ + jz 0b /* Ignore it otherwise */ + movq %rcx, %rsi /* For every source page do a copy */ + andq $0xfffffffffffff000, %rsi + + movq $512, %rcx + rep ; movsq + jmp 0b +3: + + /* To be certain of avoiding problems with self-modifying code + * I need to execute a serializing instruction here. + * So I flush the TLB by reloading %cr3 here, it's handy, + * and not processor dependent. + */ + movq %cr3, %rax + movq %rax, %cr3 + + /* set all of the registers to known values */ + /* leave %rsp alone */ + + xorq %rax, %rax + xorq %rbx, %rbx + xorq %rcx, %rcx + xorq %rdx, %rdx + xorq %rsi, %rsi + xorq %rdi, %rdi + xorq %rbp, %rbp + xorq %r8, %r8 + xorq %r9, %r9 + xorq %r10, %r10 + xorq %r11, %r11 + xorq %r12, %r12 + xorq %r13, %r13 + xorq %r14, %r14 + xorq %r15, %r15 + + ret +relocate_new_kernel_end: + + .globl relocate_new_kernel_size +relocate_new_kernel_size: + .quad relocate_new_kernel_end - relocate_new_kernel diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 000015dd5a8..5fd03225058 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -41,6 +41,7 @@ #include <linux/kallsyms.h> #include <linux/edd.h> #include <linux/mmzone.h> +#include <linux/kexec.h> #include <asm/mtrr.h> #include <asm/uaccess.h> @@ -367,6 +368,27 @@ static __init void parse_cmdline_early (char ** cmdline_p) if (!memcmp(from, "noexec=", 7)) nonx_setup(from + 7); +#ifdef CONFIG_KEXEC + /* crashkernel=size@addr specifies the location to reserve for + * a crash kernel. By reserving this memory we guarantee + * that linux never sets it up as a DMA target. + * Useful for holding code to do something appropriate + * after a kernel panic. + */ + else if (!memcmp(from, "crashkernel=", 12)) { + unsigned long size, base; + size = memparse(from+12, &from); + if (*from == '@') { + base = memparse(from+1, &from); + /* FIXME: Do I want a sanity check + * to validate the memory range?
+ */ + crashk_res.start = base; + crashk_res.end = base + size - 1; + } + } +#endif + next_char: c = *(from++); if (!c) @@ -625,6 +647,13 @@ void __init setup_arch(char **cmdline_p) #endif sparse_init(); + +#ifdef CONFIG_KEXEC + if (crashk_res.start != crashk_res.end) { + reserve_bootmem(crashk_res.start, + crashk_res.end - crashk_res.start + 1); + } +#endif paging_init(); check_ioapic(); @@ -676,7 +705,7 @@ void __init setup_arch(char **cmdline_p) #endif } -static int __init get_model_name(struct cpuinfo_x86 *c) +static int __cpuinit get_model_name(struct cpuinfo_x86 *c) { unsigned int *v; @@ -692,7 +721,7 @@ static int __init get_model_name(struct cpuinfo_x86 *c) } -static void __init display_cacheinfo(struct cpuinfo_x86 *c) +static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) { unsigned int n, dummy, eax, ebx, ecx, edx; @@ -803,7 +832,7 @@ static int __init init_amd(struct cpuinfo_x86 *c) return r; } -static void __init detect_ht(struct cpuinfo_x86 *c) +static void __cpuinit detect_ht(struct cpuinfo_x86 *c) { #ifdef CONFIG_SMP u32 eax, ebx, ecx, edx; @@ -864,7 +893,7 @@ static void __init detect_ht(struct cpuinfo_x86 *c) /* * find out the number of processor cores on the die */ -static int __init intel_num_cpu_cores(struct cpuinfo_x86 *c) +static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c) { unsigned int eax; @@ -882,7 +911,7 @@ static int __init intel_num_cpu_cores(struct cpuinfo_x86 *c) return 1; } -static void __init init_intel(struct cpuinfo_x86 *c) +static void __cpuinit init_intel(struct cpuinfo_x86 *c) { /* Cache sizes */ unsigned n; @@ -902,7 +931,7 @@ static void __init init_intel(struct cpuinfo_x86 *c) c->x86_num_cores = intel_num_cpu_cores(c); } -void __init get_cpu_vendor(struct cpuinfo_x86 *c) +void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) { char *v = c->x86_vendor_id; @@ -923,7 +952,7 @@ struct cpu_model_info { /* Do some early cpuid on the boot CPU to get some parameter that are needed before check_bugs. 
Everything advanced is in identify_cpu below. */ -void __init early_identify_cpu(struct cpuinfo_x86 *c) +void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) { u32 tfms; @@ -977,7 +1006,7 @@ void __init early_identify_cpu(struct cpuinfo_x86 *c) /* * This does the hard work of actually picking apart the CPU stuff... */ -void __init identify_cpu(struct cpuinfo_x86 *c) +void __cpuinit identify_cpu(struct cpuinfo_x86 *c) { int i; u32 xlvl; @@ -1047,6 +1076,10 @@ void __init identify_cpu(struct cpuinfo_x86 *c) #ifdef CONFIG_X86_MCE mcheck_init(c); #endif + if (c == &boot_cpu_data) + mtrr_bp_init(); + else + mtrr_ap_init(); #ifdef CONFIG_NUMA if (c != &boot_cpu_data) numa_add_cpu(c - cpu_data); @@ -1054,7 +1087,7 @@ void __init identify_cpu(struct cpuinfo_x86 *c) } -void __init print_cpu_info(struct cpuinfo_x86 *c) +void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) { if (c->x86_model_id[0]) printk("%s", c->x86_model_id); diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c index 678b7ac33b8..f2b9c6bc999 100644 --- a/arch/x86_64/kernel/setup64.c +++ b/arch/x86_64/kernel/setup64.c @@ -29,7 +29,7 @@ char x86_boot_params[BOOT_PARAM_SIZE] __initdata = {0,}; -cpumask_t cpu_initialized __initdata = CPU_MASK_NONE; +cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; struct x8664_pda cpu_pda[NR_CPUS] __cacheline_aligned; @@ -171,7 +171,7 @@ void syscall_init(void) wrmsrl(MSR_SYSCALL_MASK, EF_TF|EF_DF|EF_IE|0x3000); } -void __init check_efer(void) +void __cpuinit check_efer(void) { unsigned long efer; @@ -188,7 +188,7 @@ void __init check_efer(void) * 'CPU state barrier', nothing should get across. * A lot of state is already set up in PDA init. 
*/ -void __init cpu_init (void) +void __cpuinit cpu_init (void) { #ifdef CONFIG_SMP int cpu = stack_smp_processor_id(); diff --git a/arch/x86_64/kernel/signal.c b/arch/x86_64/kernel/signal.c index 98b7ba95d58..98590a989f3 100644 --- a/arch/x86_64/kernel/signal.c +++ b/arch/x86_64/kernel/signal.c @@ -425,7 +425,7 @@ int do_signal(struct pt_regs *regs, sigset_t *oldset) if (!user_mode(regs)) return 1; - if (try_to_freeze(0)) + if (try_to_freeze()) goto no_signal; if (!oldset) diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c index 1e379ed17b1..ccae392886a 100644 --- a/arch/x86_64/kernel/smp.c +++ b/arch/x86_64/kernel/smp.c @@ -283,6 +283,16 @@ struct call_data_struct { static struct call_data_struct * call_data; +void lock_ipi_call_lock(void) +{ + spin_lock_irq(&call_lock); +} + +void unlock_ipi_call_lock(void) +{ + spin_unlock_irq(&call_lock); +} + /* * this function sends a 'generic call function' IPI to all other CPUs * in the system. diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index f1ec0f34594..b969ee12872 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -34,6 +34,7 @@ * Andi Kleen : Converted to new state machine. * Various cleanups. * Probably mostly hotplug CPU ready now. + * Ashok Raj : CPU hotplug support */ @@ -58,11 +59,6 @@ #include <asm/proto.h> #include <asm/nmi.h> -/* Change for real CPU hotplug. Note other files need to be fixed - first too. */ -#define __cpuinit __init -#define __cpuinitdata __initdata - /* Number of siblings per CPU package */ int smp_num_siblings = 1; /* Package ID of each logical CPU */ @@ -103,6 +99,37 @@ EXPORT_SYMBOL(cpu_core_map); extern unsigned char trampoline_data[]; extern unsigned char trampoline_end[]; +/* State of each CPU */ +DEFINE_PER_CPU(int, cpu_state) = { 0 }; + +/* + * Store all idle threads, this can be reused instead of creating + * a new thread. Also avoids complicated thread destroy functionality + * for idle threads. 
+ */ +struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ; + +#define get_idle_for_cpu(x) (idle_thread_array[(x)]) +#define set_idle_for_cpu(x,p) (idle_thread_array[(x)] = (p)) + +/* + * cpu_possible_map should be static, it cannot change as cpu's + * are onlined, or offlined. The reason is per-cpu data-structures + * are allocated by some modules at init time, and dont expect to + * do this dynamically on cpu arrival/departure. + * cpu_present_map on the other hand can change dynamically. + * In case when cpu_hotplug is not compiled, then we resort to current + * behaviour, which is cpu_possible == cpu_present. + * If cpu-hotplug is supported, then we need to preallocate for all + * those NR_CPUS, hence cpu_possible_map represents entire NR_CPUS range. + * - Ashok Raj + */ +#ifdef CONFIG_HOTPLUG_CPU +#define fixup_cpu_possible_map(x) cpu_set((x), cpu_possible_map) +#else +#define fixup_cpu_possible_map(x) +#endif + /* * Currently trivial. Write the real->protected mode * bootstrap into the page concerned. 
The caller @@ -418,6 +445,33 @@ void __cpuinit smp_callin(void) cpu_set(cpuid, cpu_callin_map); } +static inline void set_cpu_sibling_map(int cpu) +{ + int i; + + if (smp_num_siblings > 1) { + for_each_cpu(i) { + if (cpu_core_id[cpu] == cpu_core_id[i]) { + cpu_set(i, cpu_sibling_map[cpu]); + cpu_set(cpu, cpu_sibling_map[i]); + } + } + } else { + cpu_set(cpu, cpu_sibling_map[cpu]); + } + + if (current_cpu_data.x86_num_cores > 1) { + for_each_cpu(i) { + if (phys_proc_id[cpu] == phys_proc_id[i]) { + cpu_set(i, cpu_core_map[cpu]); + cpu_set(cpu, cpu_core_map[i]); + } + } + } else { + cpu_core_map[cpu] = cpu_sibling_map[cpu]; + } +} + /* * Setup code on secondary processor (after comming out of the trampoline) */ @@ -448,9 +502,28 @@ void __cpuinit start_secondary(void) enable_APIC_timer(); /* + * The sibling maps must be set before turning the online map on for + * this cpu + */ + set_cpu_sibling_map(smp_processor_id()); + + /* + * We need to hold call_lock, so there is no inconsistency + * between the time smp_call_function() determines number of + * IPI recipients, and the time when the determination is made + * for which cpus receive the IPI in genapic_flat.c. Holding this + * lock helps us to not include this cpu in a currently in progress + * smp_call_function(). + */ + lock_ipi_call_lock(); + + /* * Allow the master to continue. */ cpu_set(smp_processor_id(), cpu_online_map); + per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; + unlock_ipi_call_lock(); + mb(); /* Wait for TSC sync to not schedule things before. @@ -628,33 +701,77 @@ static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int sta return (send_status | accept_status); } +struct create_idle { + struct task_struct *idle; + struct completion done; + int cpu; +}; + +void do_fork_idle(void *_c_idle) +{ + struct create_idle *c_idle = _c_idle; + + c_idle->idle = fork_idle(c_idle->cpu); + complete(&c_idle->done); +} + /* * Boot one CPU.
*/ static int __cpuinit do_boot_cpu(int cpu, int apicid) { - struct task_struct *idle; unsigned long boot_error; int timeout; unsigned long start_rip; + struct create_idle c_idle = { + .cpu = cpu, + .done = COMPLETION_INITIALIZER(c_idle.done), + }; + DECLARE_WORK(work, do_fork_idle, &c_idle); + + c_idle.idle = get_idle_for_cpu(cpu); + + if (c_idle.idle) { + c_idle.idle->thread.rsp = (unsigned long) (((struct pt_regs *) + (THREAD_SIZE + (unsigned long) c_idle.idle->thread_info)) - 1); + init_idle(c_idle.idle, cpu); + goto do_rest; + } + /* - * We can't use kernel_thread since we must avoid to - * reschedule the child. + * During cold boot process, keventd thread is not spun up yet. + * When we do cpu hot-add, we create idle threads on the fly, we should + * not acquire any attributes from the calling context. Hence the clean + * way to create kernel_threads() is to do that from keventd(). + * We do the current_is_keventd() due to the fact that ACPI notifier + * was also queuing to keventd() and when the caller is already running + * in context of keventd(), we would end up with locking up the keventd + * thread. 
*/ - idle = fork_idle(cpu); - if (IS_ERR(idle)) { + if (!keventd_up() || current_is_keventd()) + work.func(work.data); + else { + schedule_work(&work); + wait_for_completion(&c_idle.done); + } + + if (IS_ERR(c_idle.idle)) { printk("failed fork for CPU %d\n", cpu); - return PTR_ERR(idle); + return PTR_ERR(c_idle.idle); } - cpu_pda[cpu].pcurrent = idle; + set_idle_for_cpu(cpu, c_idle.idle); + +do_rest: + + cpu_pda[cpu].pcurrent = c_idle.idle; start_rip = setup_trampoline(); - init_rsp = idle->thread.rsp; + init_rsp = c_idle.idle->thread.rsp; per_cpu(init_tss,cpu).rsp0 = init_rsp; initial_code = start_secondary; - clear_ti_thread_flag(idle->thread_info, TIF_FORK); + clear_ti_thread_flag(c_idle.idle->thread_info, TIF_FORK); printk(KERN_INFO "Booting processor %d/%d rip %lx rsp %lx\n", cpu, apicid, start_rip, init_rsp); @@ -746,51 +863,6 @@ cycles_t cacheflush_time; unsigned long cache_decay_ticks; /* - * Construct cpu_sibling_map[], so that we can tell the sibling CPU - * on SMT systems efficiently. - */ -static __cpuinit void detect_siblings(void) -{ - int cpu; - - for (cpu = 0; cpu < NR_CPUS; cpu++) { - cpus_clear(cpu_sibling_map[cpu]); - cpus_clear(cpu_core_map[cpu]); - } - - for_each_online_cpu (cpu) { - struct cpuinfo_x86 *c = cpu_data + cpu; - int siblings = 0; - int i; - if (smp_num_siblings > 1) { - for_each_online_cpu (i) { - if (cpu_core_id[cpu] == cpu_core_id[i]) { - siblings++; - cpu_set(i, cpu_sibling_map[cpu]); - } - } - } else { - siblings++; - cpu_set(cpu, cpu_sibling_map[cpu]); - } - - if (siblings != smp_num_siblings) { - printk(KERN_WARNING - "WARNING: %d siblings found for CPU%d, should be %d\n", - siblings, cpu, smp_num_siblings); - smp_num_siblings = siblings; - } - if (c->x86_num_cores > 1) { - for_each_online_cpu(i) { - if (phys_proc_id[cpu] == phys_proc_id[i]) - cpu_set(i, cpu_core_map[cpu]); - } - } else - cpu_core_map[cpu] = cpu_sibling_map[cpu]; - } -} - -/* * Cleanup possible dangling ends... 
*/ static __cpuinit void smp_cleanup_boot(void) @@ -823,7 +895,7 @@ static __cpuinit void smp_cleanup_boot(void) * * RED-PEN audit/test this more. I bet there is more state messed up here. */ -static __cpuinit void disable_smp(void) +static __init void disable_smp(void) { cpu_present_map = cpumask_of_cpu(0); cpu_possible_map = cpumask_of_cpu(0); @@ -838,7 +910,7 @@ static __cpuinit void disable_smp(void) /* * Handle user cpus=... parameter. */ -static __cpuinit void enforce_max_cpus(unsigned max_cpus) +static __init void enforce_max_cpus(unsigned max_cpus) { int i, k; k = 0; @@ -855,7 +927,7 @@ static __cpuinit void enforce_max_cpus(unsigned max_cpus) /* * Various sanity checks. */ -static int __cpuinit smp_sanity_check(unsigned max_cpus) +static int __init smp_sanity_check(unsigned max_cpus) { if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) { printk("weird, boot CPU (#%d) not listed by the BIOS.\n", @@ -913,7 +985,7 @@ static int __cpuinit smp_sanity_check(unsigned max_cpus) * Prepare for SMP bootup. The MP table or ACPI has been read * earlier. Just do some sanity checking here and enable APIC mode. */ -void __cpuinit smp_prepare_cpus(unsigned int max_cpus) +void __init smp_prepare_cpus(unsigned int max_cpus) { int i; @@ -930,10 +1002,9 @@ void __cpuinit smp_prepare_cpus(unsigned int max_cpus) int apicid = cpu_present_to_apicid(i); if (physid_isset(apicid, phys_cpu_present_map)) { cpu_set(i, cpu_present_map); - /* possible map would be different if we supported real - CPU hotplug. */ cpu_set(i, cpu_possible_map); } + fixup_cpu_possible_map(i); } if (smp_sanity_check(max_cpus) < 0) { @@ -978,13 +1049,13 @@ void __init smp_prepare_boot_cpu(void) int me = smp_processor_id(); cpu_set(me, cpu_online_map); cpu_set(me, cpu_callout_map); + cpu_set(0, cpu_sibling_map[0]); + cpu_set(0, cpu_core_map[0]); + per_cpu(cpu_state, me) = CPU_ONLINE; } /* * Entry point to boot a CPU. 
- * - * This is all __cpuinit, not __devinit for now because we don't support - * CPU hotplug (yet). */ int __cpuinit __cpu_up(unsigned int cpu) { @@ -1001,6 +1072,15 @@ int __cpuinit __cpu_up(unsigned int cpu) return -EINVAL; } + /* + * Already booted CPU? + */ + if (cpu_isset(cpu, cpu_callin_map)) { + Dprintk("do_boot_cpu %d Already started\n", cpu); + return -ENOSYS; + } + + per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; /* Boot it! */ err = do_boot_cpu(cpu, apicid); if (err < 0) { @@ -1013,23 +1093,118 @@ int __cpuinit __cpu_up(unsigned int cpu) while (!cpu_isset(cpu, cpu_online_map)) cpu_relax(); - return 0; + err = 0; + + return err; } /* * Finish the SMP boot. */ -void __cpuinit smp_cpus_done(unsigned int max_cpus) +void __init smp_cpus_done(unsigned int max_cpus) { +#ifndef CONFIG_HOTPLUG_CPU zap_low_mappings(); +#endif smp_cleanup_boot(); #ifdef CONFIG_X86_IO_APIC setup_ioapic_dest(); #endif - detect_siblings(); time_init_gtod(); check_nmi_watchdog(); } + +#ifdef CONFIG_HOTPLUG_CPU + +static void remove_siblinginfo(int cpu) +{ + int sibling; + + for_each_cpu_mask(sibling, cpu_sibling_map[cpu]) + cpu_clear(cpu, cpu_sibling_map[sibling]); + for_each_cpu_mask(sibling, cpu_core_map[cpu]) + cpu_clear(cpu, cpu_core_map[sibling]); + cpus_clear(cpu_sibling_map[cpu]); + cpus_clear(cpu_core_map[cpu]); + phys_proc_id[cpu] = BAD_APICID; + cpu_core_id[cpu] = BAD_APICID; +} + +void remove_cpu_from_maps(void) +{ + int cpu = smp_processor_id(); + + cpu_clear(cpu, cpu_callout_map); + cpu_clear(cpu, cpu_callin_map); + clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */ +} + +int __cpu_disable(void) +{ + int cpu = smp_processor_id(); + + /* + * Perhaps use cpufreq to drop frequency, but that could go + * into generic code. + * + * We won't take down the boot processor on i386 due to some + * interrupts only being able to be serviced by the BSP. 
+ * Especially so if we're not using an IOAPIC -zwane + */ + if (cpu == 0) + return -EBUSY; + + disable_APIC_timer(); + + /* + * HACK: + * Allow any queued timer interrupts to get serviced + * This is only a temporary solution until we cleanup + * fixup_irqs as we do for IA64. + */ + local_irq_enable(); + mdelay(1); + + local_irq_disable(); + remove_siblinginfo(cpu); + + /* It's now safe to remove this processor from the online map */ + cpu_clear(cpu, cpu_online_map); + remove_cpu_from_maps(); + fixup_irqs(cpu_online_map); + return 0; +} + +void __cpu_die(unsigned int cpu) +{ + /* We don't do anything here: idle task is faking death itself. */ + unsigned int i; + + for (i = 0; i < 10; i++) { + /* They ack this in play_dead by setting CPU_DEAD */ + if (per_cpu(cpu_state, cpu) == CPU_DEAD) { + printk ("CPU %d is now offline\n", cpu); + return; + } + current->state = TASK_UNINTERRUPTIBLE; + schedule_timeout(HZ/10); + } + printk(KERN_ERR "CPU %u didn't die...\n", cpu); +} + +#else /* ... !CONFIG_HOTPLUG_CPU */ + +int __cpu_disable(void) +{ + return -ENOSYS; +} + +void __cpu_die(unsigned int cpu) +{ + /* We said "no" in __cpu_disable */ + BUG(); +} +#endif /* CONFIG_HOTPLUG_CPU */ diff --git a/arch/x86_64/kernel/suspend.c b/arch/x86_64/kernel/suspend.c index ebaa1e37d65..0612640d91b 100644 --- a/arch/x86_64/kernel/suspend.c +++ b/arch/x86_64/kernel/suspend.c @@ -44,7 +44,6 @@ void __save_processor_state(struct saved_context *ctxt) */ asm volatile ("sgdt %0" : "=m" (ctxt->gdt_limit)); asm volatile ("sidt %0" : "=m" (ctxt->idt_limit)); - asm volatile ("sldt %0" : "=m" (ctxt->ldt)); asm volatile ("str %0" : "=m" (ctxt->tr)); /* XMM0..XMM15 should be handled by kernel_fpu_begin(). 
*/ @@ -69,6 +68,7 @@ void __save_processor_state(struct saved_context *ctxt) asm volatile ("movq %%cr2, %0" : "=r" (ctxt->cr2)); asm volatile ("movq %%cr3, %0" : "=r" (ctxt->cr3)); asm volatile ("movq %%cr4, %0" : "=r" (ctxt->cr4)); + asm volatile ("movq %%cr8, %0" : "=r" (ctxt->cr8)); } void save_processor_state(void) @@ -90,12 +90,20 @@ void __restore_processor_state(struct saved_context *ctxt) /* * control registers */ + asm volatile ("movq %0, %%cr8" :: "r" (ctxt->cr8)); asm volatile ("movq %0, %%cr4" :: "r" (ctxt->cr4)); asm volatile ("movq %0, %%cr3" :: "r" (ctxt->cr3)); asm volatile ("movq %0, %%cr2" :: "r" (ctxt->cr2)); asm volatile ("movq %0, %%cr0" :: "r" (ctxt->cr0)); /* + * now restore the descriptor tables to their proper values + * ltr is done in fix_processor_context(). + */ + asm volatile ("lgdt %0" :: "m" (ctxt->gdt_limit)); + asm volatile ("lidt %0" :: "m" (ctxt->idt_limit)); + + /* * segment registers */ asm volatile ("movw %0, %%ds" :: "r" (ctxt->ds)); @@ -108,17 +116,10 @@ void __restore_processor_state(struct saved_context *ctxt) wrmsrl(MSR_GS_BASE, ctxt->gs_base); wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); - /* - * now restore the descriptor tables to their proper values - * ltr is done i fix_processor_context(). - */ - asm volatile ("lgdt %0" :: "m" (ctxt->gdt_limit)); - asm volatile ("lidt %0" :: "m" (ctxt->idt_limit)); - asm volatile ("lldt %0" :: "m" (ctxt->ldt)); - fix_processor_context(); do_fpu_end(); + mtrr_ap_init(); } void restore_processor_state(void) diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index 121646fc43f..10273663000 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -586,11 +586,17 @@ static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs) asmlinkage void default_do_nmi(struct pt_regs *regs) { unsigned char reason = 0; + int cpu; + + cpu = smp_processor_id(); /* Only the BSP gets external NMIs from the system.
*/ - if (!smp_processor_id()) + if (!cpu) reason = get_nmi_reason(); + if (!cpu_online(cpu)) + return; + if (!(reason & 0xc0)) { if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 0, SIGINT) == NOTIFY_STOP) diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S index 59ebd5beda8..2a94f9b60b2 100644 --- a/arch/x86_64/kernel/vmlinux.lds.S +++ b/arch/x86_64/kernel/vmlinux.lds.S @@ -2,7 +2,10 @@ * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>; */ +#define LOAD_OFFSET __START_KERNEL_map + #include <asm-generic/vmlinux.lds.h> +#include <asm/page.h> #include <linux/config.h> OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64") @@ -11,28 +14,30 @@ ENTRY(phys_startup_64) jiffies_64 = jiffies; SECTIONS { - . = 0xffffffff80100000; + . = __START_KERNEL; phys_startup_64 = startup_64 - LOAD_OFFSET; _text = .; /* Text and read-only data */ - .text : { + .text : AT(ADDR(.text) - LOAD_OFFSET) { *(.text) SCHED_TEXT LOCK_TEXT *(.fixup) *(.gnu.warning) } = 0x9090 - .text.lock : { *(.text.lock) } /* out-of-line lock text */ + /* out-of-line lock text */ + .text.lock : AT(ADDR(.text.lock) - LOAD_OFFSET) { *(.text.lock) } _etext = .; /* End of text section */ . = ALIGN(16); /* Exception table */ __start___ex_table = .; - __ex_table : { *(__ex_table) } + __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { *(__ex_table) } __stop___ex_table = .; RODATA - .data : { /* Data */ + /* Data */ + .data : AT(ADDR(.data) - LOAD_OFFSET) { *(.data) CONSTRUCTORS } @@ -40,62 +45,99 @@ SECTIONS _edata = .; /* End of data section */ __bss_start = .; /* BSS */ - .bss : { + .bss : AT(ADDR(.bss) - LOAD_OFFSET) { *(.bss.page_aligned) *(.bss) } __bss_end = .; + . = ALIGN(PAGE_SIZE); + . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); + .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) { + *(.data.cacheline_aligned) + } . 
= ALIGN(CONFIG_X86_L1_CACHE_BYTES); - .data.cacheline_aligned : { *(.data.cacheline_aligned) } + .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { + *(.data.read_mostly) + } + +#define VSYSCALL_ADDR (-10*1024*1024) +#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095)) +#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095)) + +#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR) +#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET) -#define AFTER(x) BINALIGN(LOADADDR(x) + SIZEOF(x), 16) -#define BINALIGN(x,y) (((x) + (y) - 1) & ~((y) - 1)) -#define CACHE_ALIGN(x) BINALIGN(x, CONFIG_X86_L1_CACHE_BYTES) +#define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR) +#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET) + + . = VSYSCALL_ADDR; + .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { *(.vsyscall_0) } + __vsyscall_0 = VSYSCALL_VIRT_ADDR; - .vsyscall_0 -10*1024*1024: AT ((LOADADDR(.data.cacheline_aligned) + SIZEOF(.data.cacheline_aligned) + 4095) & ~(4095)) { *(.vsyscall_0) } - __vsyscall_0 = LOADADDR(.vsyscall_0); . 
= ALIGN(CONFIG_X86_L1_CACHE_BYTES); - .xtime_lock : AT CACHE_ALIGN(AFTER(.vsyscall_0)) { *(.xtime_lock) } - xtime_lock = LOADADDR(.xtime_lock); - .vxtime : AT AFTER(.xtime_lock) { *(.vxtime) } - vxtime = LOADADDR(.vxtime); - .wall_jiffies : AT AFTER(.vxtime) { *(.wall_jiffies) } - wall_jiffies = LOADADDR(.wall_jiffies); - .sys_tz : AT AFTER(.wall_jiffies) { *(.sys_tz) } - sys_tz = LOADADDR(.sys_tz); - .sysctl_vsyscall : AT AFTER(.sys_tz) { *(.sysctl_vsyscall) } - sysctl_vsyscall = LOADADDR(.sysctl_vsyscall); - .xtime : AT AFTER(.sysctl_vsyscall) { *(.xtime) } - xtime = LOADADDR(.xtime); + .xtime_lock : AT(VLOAD(.xtime_lock)) { *(.xtime_lock) } + xtime_lock = VVIRT(.xtime_lock); + + .vxtime : AT(VLOAD(.vxtime)) { *(.vxtime) } + vxtime = VVIRT(.vxtime); + + .wall_jiffies : AT(VLOAD(.wall_jiffies)) { *(.wall_jiffies) } + wall_jiffies = VVIRT(.wall_jiffies); + + .sys_tz : AT(VLOAD(.sys_tz)) { *(.sys_tz) } + sys_tz = VVIRT(.sys_tz); + + .sysctl_vsyscall : AT(VLOAD(.sysctl_vsyscall)) { *(.sysctl_vsyscall) } + sysctl_vsyscall = VVIRT(.sysctl_vsyscall); + + .xtime : AT(VLOAD(.xtime)) { *(.xtime) } + xtime = VVIRT(.xtime); + . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); - .jiffies : AT CACHE_ALIGN(AFTER(.xtime)) { *(.jiffies) } - jiffies = LOADADDR(.jiffies); - .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT (LOADADDR(.vsyscall_0) + 1024) { *(.vsyscall_1) } - . = LOADADDR(.vsyscall_0) + 4096; + .jiffies : AT(VLOAD(.jiffies)) { *(.jiffies) } + jiffies = VVIRT(.jiffies); + + .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) { *(.vsyscall_1) } + .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) { *(.vsyscall_2) } + .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) { *(.vsyscall_3) } + + . = VSYSCALL_VIRT_ADDR + 4096; + +#undef VSYSCALL_ADDR +#undef VSYSCALL_PHYS_ADDR +#undef VSYSCALL_VIRT_ADDR +#undef VLOAD_OFFSET +#undef VLOAD +#undef VVIRT_OFFSET +#undef VVIRT . 
= ALIGN(8192); /* init_task */ - .data.init_task : { *(.data.init_task) } + .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { + *(.data.init_task) + } . = ALIGN(4096); - .data.page_aligned : { *(.data.page_aligned) } + .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { + *(.data.page_aligned) + } . = ALIGN(4096); /* Init code and data */ __init_begin = .; - .init.text : { + .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { _sinittext = .; *(.init.text) _einittext = .; } __initdata_begin = .; - .init.data : { *(.init.data) } + .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { *(.init.data) } __initdata_end = .; . = ALIGN(16); __setup_start = .; - .init.setup : { *(.init.setup) } + .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { *(.init.setup) } __setup_end = .; __initcall_start = .; - .initcall.init : { + .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { *(.initcall1.init) *(.initcall2.init) *(.initcall3.init) @@ -106,32 +148,38 @@ SECTIONS } __initcall_end = .; __con_initcall_start = .; - .con_initcall.init : { *(.con_initcall.init) } + .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { + *(.con_initcall.init) + } __con_initcall_end = .; SECURITY_INIT . = ALIGN(8); __alt_instructions = .; - .altinstructions : { *(.altinstructions) } + .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { + *(.altinstructions) + } __alt_instructions_end = .; - .altinstr_replacement : { *(.altinstr_replacement) } + .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { + *(.altinstr_replacement) + } /* .exit.text is discard at runtime, not link time, to deal with references from .altinstructions and .eh_frame */ - .exit.text : { *(.exit.text) } - .exit.data : { *(.exit.data) } + .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) } + .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { *(.exit.data) } . 
= ALIGN(4096); __initramfs_start = .; - .init.ramfs : { *(.init.ramfs) } + .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { *(.init.ramfs) } __initramfs_end = .; . = ALIGN(32); __per_cpu_start = .; - .data.percpu : { *(.data.percpu) } + .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { *(.data.percpu) } __per_cpu_end = .; . = ALIGN(4096); __init_end = .; . = ALIGN(4096); __nosave_begin = .; - .data_nosave : { *(.data.nosave) } + .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { *(.data.nosave) } . = ALIGN(4096); __nosave_end = .; diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c index 84cde796ecb..ac61c186eb0 100644 --- a/arch/x86_64/mm/numa.c +++ b/arch/x86_64/mm/numa.c @@ -251,7 +251,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT); } -__init void numa_add_cpu(int cpu) +__cpuinit void numa_add_cpu(int cpu) { /* BP is initialized elsewhere */ if (cpu) diff --git a/arch/x86_64/pci/mmconfig.c b/arch/x86_64/pci/mmconfig.c index b693c232fd0..657e88aa090 100644 --- a/arch/x86_64/pci/mmconfig.c +++ b/arch/x86_64/pci/mmconfig.c @@ -7,25 +7,50 @@ #include <linux/pci.h> #include <linux/init.h> +#include <linux/acpi.h> #include "pci.h" #define MMCONFIG_APER_SIZE (256*1024*1024) -/* The physical address of the MMCONFIG aperture. Set from ACPI tables. */ -u32 pci_mmcfg_base_addr; - /* Static virtual mapping of the MMCONFIG aperture */ -char *pci_mmcfg_virt; +struct mmcfg_virt { + struct acpi_table_mcfg_config *cfg; + char *virt; +}; +static struct mmcfg_virt *pci_mmcfg_virt; -static inline char *pci_dev_base(unsigned int bus, unsigned int devfn) +static char *get_virt(unsigned int seg, int bus) { - return pci_mmcfg_virt + ((bus << 20) | (devfn << 12)); + int cfg_num = -1; + struct acpi_table_mcfg_config *cfg; + + while (1) { + ++cfg_num; + if (cfg_num >= pci_mmcfg_config_num) { + /* something bad is going on, no cfg table is found. 
*/ + /* so we fall back to the old way we used to do this */ + /* and just rely on the first entry to be correct. */ + return pci_mmcfg_virt[0].virt; + } + cfg = pci_mmcfg_virt[cfg_num].cfg; + if (cfg->pci_segment_group_number != seg) + continue; + if ((cfg->start_bus_number <= bus) && + (cfg->end_bus_number >= bus)) + return pci_mmcfg_virt[cfg_num].virt; + } +} + +static inline char *pci_dev_base(unsigned int seg, unsigned int bus, unsigned int devfn) +{ + + return get_virt(seg, bus) + ((bus << 20) | (devfn << 12)); } static int pci_mmcfg_read(unsigned int seg, unsigned int bus, unsigned int devfn, int reg, int len, u32 *value) { - char *addr = pci_dev_base(bus, devfn); + char *addr = pci_dev_base(seg, bus, devfn); if (unlikely(!value || (bus > 255) || (devfn > 255) || (reg > 4095))) return -EINVAL; @@ -48,7 +73,7 @@ static int pci_mmcfg_read(unsigned int seg, unsigned int bus, static int pci_mmcfg_write(unsigned int seg, unsigned int bus, unsigned int devfn, int reg, int len, u32 value) { - char *addr = pci_dev_base(bus,devfn); + char *addr = pci_dev_base(seg, bus, devfn); if (unlikely((bus > 255) || (devfn > 255) || (reg > 4095))) return -EINVAL; @@ -75,9 +100,15 @@ static struct pci_raw_ops pci_mmcfg = { static int __init pci_mmcfg_init(void) { + int i; + if ((pci_probe & PCI_PROBE_MMCONF) == 0) return 0; - if (!pci_mmcfg_base_addr) + + acpi_table_parse(ACPI_MCFG, acpi_parse_mcfg); + if ((pci_mmcfg_config_num == 0) || + (pci_mmcfg_config == NULL) || + (pci_mmcfg_config[0].base_address == 0)) return 0; /* Kludge for now. 
Don't use mmconfig on AMD systems because @@ -88,13 +119,22 @@ static int __init pci_mmcfg_init(void) return 0; /* RED-PEN i386 doesn't do _nocache right now */ - pci_mmcfg_virt = ioremap_nocache(pci_mmcfg_base_addr, MMCONFIG_APER_SIZE); - if (!pci_mmcfg_virt) { - printk("PCI: Cannot map mmconfig aperture\n"); + pci_mmcfg_virt = kmalloc(sizeof(*pci_mmcfg_virt) * pci_mmcfg_config_num, GFP_KERNEL); + if (pci_mmcfg_virt == NULL) { + printk("PCI: Can not allocate memory for mmconfig structures\n"); return 0; - } + } + for (i = 0; i < pci_mmcfg_config_num; ++i) { + pci_mmcfg_virt[i].cfg = &pci_mmcfg_config[i]; + pci_mmcfg_virt[i].virt = ioremap_nocache(pci_mmcfg_config[i].base_address, MMCONFIG_APER_SIZE); + if (!pci_mmcfg_virt[i].virt) { + printk("PCI: Cannot map mmconfig aperture for segment %d\n", + pci_mmcfg_config[i].pci_segment_group_number); + return 0; + } + printk(KERN_INFO "PCI: Using MMCONFIG at %x\n", pci_mmcfg_config[i].base_address); + } - printk(KERN_INFO "PCI: Using MMCONFIG at %x\n", pci_mmcfg_base_addr); raw_pci_ops = &pci_mmcfg; pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF; |