Diffstat (limited to 'arch/x86_64')
63 files changed, 2634 insertions, 774 deletions
diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig index 289f448ac89..660a03a89e6 100644 --- a/arch/x86_64/Kconfig +++ b/arch/x86_64/Kconfig @@ -207,33 +207,6 @@ config SMP If you don't know what to do here, say N. -config PREEMPT - bool "Preemptible Kernel" - ---help--- - This option reduces the latency of the kernel when reacting to - real-time or interactive events by allowing a low priority process to - be preempted even if it is in kernel mode executing a system call. - This allows applications to run more reliably even when the system is - under load. On contrary it may also break your drivers and add - priority inheritance problems to your system. Don't select it if - you rely on a stable system or have slightly obscure hardware. - It's also not very well tested on x86-64 currently. - You have been warned. - - Say Y here if you are feeling brave and building a kernel for a - desktop, embedded or real-time system. Say N if you are unsure. - -config PREEMPT_BKL - bool "Preempt The Big Kernel Lock" - depends on PREEMPT - default y - help - This option reduces the latency of the kernel by making the - big kernel lock preemptible. - - Say Y here if you are building a kernel for a desktop system. - Say N if you are unsure. - config SCHED_SMT bool "SMT (Hyperthreading) scheduler support" depends on SMP @@ -244,6 +217,8 @@ config SCHED_SMT cost of slightly increased overhead in some places. If unsure say N here. +source "kernel/Kconfig.preempt" + config K8_NUMA bool "K8 NUMA support" select NUMA @@ -265,7 +240,7 @@ config NUMA_EMU into virtual nodes when booted with "numa=fake=N", where N is the number of nodes. This is only useful for debugging. -config DISCONTIGMEM +config ARCH_DISCONTIGMEM_ENABLE bool depends on NUMA default y @@ -274,6 +249,27 @@ config NUMA bool default n +config ARCH_DISCONTIGMEM_ENABLE + def_bool y + depends on NUMA + +config ARCH_DISCONTIGMEM_DEFAULT + def_bool y + depends on NUMA + +config ARCH_SPARSEMEM_ENABLE + def_bool y + depends on NUMA + +config ARCH_FLATMEM_ENABLE + def_bool y + depends on !NUMA + +source "mm/Kconfig" + +config HAVE_ARCH_EARLY_PFN_TO_NID + def_bool y + config HAVE_DEC_LOCK bool depends on SMP @@ -292,6 +288,15 @@ config NR_CPUS This is purely to save memory - each supported CPU requires memory in the static kernel configuration. +config HOTPLUG_CPU + bool "Support for hot-pluggable CPUs (EXPERIMENTAL)" + depends on SMP && HOTPLUG && EXPERIMENTAL + help + Say Y here to experiment with turning CPUs off and on. CPUs + can be controlled through /sys/devices/system/cpu/cpu#. + Say N if you want to disable CPU hotplug. + + config HPET_TIMER bool default y @@ -324,12 +329,15 @@ config HPET_EMULATE_RTC config GART_IOMMU bool "IOMMU support" + default y depends on PCI help - Support the K8 IOMMU. Needed to run systems with more than 4GB of memory + Support the IOMMU. Needed to run systems with more than 3GB of memory properly with 32-bit PCI devices that do not support DAC (Double Address Cycle). The IOMMU can be turned off at runtime with the iommu=off parameter. Normally the kernel will take the right choice by itself. + This option includes a driver for the AMD Opteron/Athlon64 IOMMU + and a software emulation used on some other systems. If unsure, say Y. # need this always enabled with GART_IOMMU for the VIA workaround @@ -364,6 +372,34 @@ config X86_MCE_INTEL Additional support for intel specific MCE features such as the thermal monitor. 
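Among the options added above, HOTPLUG_CPU exposes each processor under /sys/devices/system/cpu/cpu#. A minimal userspace sketch of that control path follows; the cpu1 path and its online attribute are assumptions about a typical SMP machine rather than something this patch adds, and the write requires root.

/* Hedged demo: take CPU 1 offline via the sysfs file the
 * HOTPLUG_CPU help text mentions; write '1' to bring it back. */
#include <stdio.h>

int main(void)
{
    FILE *f = fopen("/sys/devices/system/cpu/cpu1/online", "w");

    if (!f) {
        perror("cpu1/online");
        return 1;
    }
    fputc('0', f);  /* '0' = offline the CPU */
    fclose(f);
    return 0;
}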
+config PHYSICAL_START + hex "Physical address where the kernel is loaded" if EMBEDDED + default "0x100000" + help + This gives the physical address where the kernel is loaded. + Primarily used in the case of kexec on panic where the + fail-safe kernel needs to run at a different address than + the panicked kernel. + + Don't change this unless you know what you are doing. + +config KEXEC + bool "kexec system call (EXPERIMENTAL)" + depends on EXPERIMENTAL + help + kexec is a system call that implements the ability to shut down your + current kernel, and to start another kernel. It is like a reboot + but it is independent of the system firmware. And like a reboot + you can start any kernel with it, not just Linux. + + The name comes from the similarity to the exec system call. + + It is an ongoing process to be certain the hardware in a machine + is properly shut down, so do not be surprised if this code does not + initially work for you. It may help to enable device hotplugging + support. As of this writing the exact hardware interface is + strongly in flux, so no good recommendation can be made. + config SECCOMP bool "Enable seccomp to safely compute untrusted bytecode" depends on PROC_FS @@ -381,6 +417,8 @@ config SECCOMP If unsure, say Y. Only embedded should say N here. +source kernel/Kconfig.hz + endmenu # @@ -480,6 +518,8 @@ config UID16 endmenu +source "net/Kconfig" + source drivers/Kconfig source "drivers/firmware/Kconfig" diff --git a/arch/x86_64/Makefile b/arch/x86_64/Makefile index 6f90c246c41..4c6ed96d5f7 100644 --- a/arch/x86_64/Makefile +++ b/arch/x86_64/Makefile @@ -21,21 +21,9 @@ # # $Id: Makefile,v 1.31 2002/03/22 15:56:07 ak Exp $ -# -# early bootup linking needs 32bit. You can either use real 32bit tools -# here or 64bit tools in 32bit mode. -# -IA32_CC := $(CC) $(CPPFLAGS) -m32 -O2 -fomit-frame-pointer -IA32_LD := $(LD) -m elf_i386 -IA32_AS := $(CC) $(AFLAGS) -m32 -Wa,--32 -traditional -c -IA32_OBJCOPY := $(CROSS_COMPILE)objcopy -IA32_CPP := $(CROSS_COMPILE)gcc -m32 -E -export IA32_CC IA32_LD IA32_AS IA32_OBJCOPY IA32_CPP - - LDFLAGS := -m elf_x86_64 OBJCOPYFLAGS := -O binary -R .note -R .comment -S -LDFLAGS_vmlinux := -e stext +LDFLAGS_vmlinux := CHECKFLAGS += -D__x86_64__ -m64 @@ -65,7 +53,9 @@ CFLAGS += $(call cc-option,-mno-sse -mno-mmx -mno-sse2 -mno-3dnow,) head-y := arch/x86_64/kernel/head.o arch/x86_64/kernel/head64.o arch/x86_64/kernel/init_task.o libs-y += arch/x86_64/lib/ -core-y += arch/x86_64/kernel/ arch/x86_64/mm/ +core-y += arch/x86_64/kernel/ \ + arch/x86_64/mm/ \ + arch/x86_64/crypto/ core-$(CONFIG_IA32_EMULATION) += arch/x86_64/ia32/ drivers-$(CONFIG_PCI) += arch/x86_64/pci/ drivers-$(CONFIG_OPROFILE) += arch/x86_64/oprofile/ diff --git a/arch/x86_64/boot/compressed/head.S b/arch/x86_64/boot/compressed/head.S index 27264dbd575..6f55565e4d4 100644 --- a/arch/x86_64/boot/compressed/head.S +++ b/arch/x86_64/boot/compressed/head.S @@ -2,8 +2,6 @@ * linux/boot/head.S * * Copyright (C) 1991, 1992, 1993 Linus Torvalds - * - * $Id: head.S,v 1.3 2001/04/20 00:59:28 ak Exp $ */ /* @@ -21,13 +19,14 @@ */ /* - * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996 + * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996 */ .code32 .text #include <linux/linkage.h> #include <asm/segment.h> +#include <asm/page.h> .code32 .globl startup_32 @@ -77,7 +76,7 @@ startup_32: jnz 3f addl $8,%esp xorl %ebx,%ebx - ljmp $(__KERNEL_CS), $0x100000 + ljmp $(__KERNEL_CS), $__PHYSICAL_START /* * We come here, if we were loaded high.
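The head.S hunk above stops hard-coding the 1MB entry point and jumps to $__PHYSICAL_START instead. A rough sketch of how the new Kconfig symbol reaches the assembler, paraphrasing the era's include/asm-x86_64/page.h (treat the exact spelling as an assumption):

/* CONFIG_PHYSICAL_START comes from the Kconfig entry above
 * (default 0x100000, i.e. 1MB). */
#define __PHYSICAL_START  CONFIG_PHYSICAL_START

/* The decompressor's final jump then becomes
 *     ljmp $(__KERNEL_CS), $__PHYSICAL_START
 * so a crash kernel configured with, say, PHYSICAL_START=0x1000000
 * starts at 16MB instead of overwriting the panicked kernel at 1MB. */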
@@ -103,7 +102,7 @@ startup_32: popl %ecx # lcount popl %edx # high_buffer_start popl %eax # hcount - movl $0x100000,%edi + movl $__PHYSICAL_START,%edi cli # make sure we don't get interrupted ljmp $(__KERNEL_CS), $0x1000 # and jump to the move routine @@ -128,7 +127,7 @@ move_routine_start: movsl movl %ebx,%esi # Restore setup pointer xorl %ebx,%ebx - ljmp $(__KERNEL_CS), $0x100000 + ljmp $(__KERNEL_CS), $__PHYSICAL_START move_routine_end: diff --git a/arch/x86_64/boot/compressed/misc.c b/arch/x86_64/boot/compressed/misc.c index c8b9216f9e6..b38d5b8b5fb 100644 --- a/arch/x86_64/boot/compressed/misc.c +++ b/arch/x86_64/boot/compressed/misc.c @@ -11,6 +11,7 @@ #include "miscsetup.h" #include <asm/io.h> +#include <asm/page.h> /* * gzip declarations @@ -92,8 +93,11 @@ static unsigned long output_ptr = 0; static void *malloc(int size); static void free(void *where); +void* memset(void* s, int c, unsigned n); +void* memcpy(void* dest, const void* src, unsigned n); + static void putstr(const char *); - + extern int end; static long free_mem_ptr = (long)&end; static long free_mem_end_ptr; @@ -284,7 +288,7 @@ void setup_normal_output_buffer(void) #else if ((ALT_MEM_K > EXT_MEM_K ? ALT_MEM_K : EXT_MEM_K) < 1024) error("Less than 2MB of memory"); #endif - output_data = (char *)0x100000; /* Points to 1M */ + output_data = (char *)__PHYSICAL_START; /* Normally Points to 1M */ free_mem_end_ptr = (long)real_mode; } @@ -307,8 +311,8 @@ void setup_output_buffer_if_we_run_high(struct moveparams *mv) low_buffer_size = low_buffer_end - LOW_BUFFER_START; high_loaded = 1; free_mem_end_ptr = (long)high_buffer_start; - if ( (0x100000 + low_buffer_size) > ((ulg)high_buffer_start)) { - high_buffer_start = (uch *)(0x100000 + low_buffer_size); + if ( (__PHYSICAL_START + low_buffer_size) > ((ulg)high_buffer_start)) { + high_buffer_start = (uch *)(__PHYSICAL_START + low_buffer_size); mv->hcount = 0; /* say: we need not to move high_buffer */ } else mv->hcount = -1; diff --git a/arch/x86_64/boot/install.sh b/arch/x86_64/boot/install.sh index 90f2452b3b9..198af15a775 100644 --- a/arch/x86_64/boot/install.sh +++ b/arch/x86_64/boot/install.sh @@ -1,6 +1,6 @@ #!/bin/sh # -# arch/i386/boot/install.sh +# arch/x86_64/boot/install.sh # # This file is subject to the terms and conditions of the GNU General Public # License. See the file "COPYING" in the main directory of this archive @@ -21,8 +21,8 @@ # User may have a custom install script -if [ -x ~/bin/installkernel ]; then exec ~/bin/installkernel "$@"; fi -if [ -x /sbin/installkernel ]; then exec /sbin/installkernel "$@"; fi +if [ -x ~/bin/${CROSS_COMPILE}installkernel ]; then exec ~/bin/${CROSS_COMPILE}installkernel "$@"; fi +if [ -x /sbin/${CROSS_COMPILE}installkernel ]; then exec /sbin/${CROSS_COMPILE}installkernel "$@"; fi # Default install - same as make zlilo diff --git a/arch/x86_64/boot/setup.S b/arch/x86_64/boot/setup.S index 75d4d2ad93b..ff58b2832b7 100644 --- a/arch/x86_64/boot/setup.S +++ b/arch/x86_64/boot/setup.S @@ -33,7 +33,7 @@ * Transcribed from Intel (as86) -> AT&T (gas) by Chris Noe, May 1999. * <stiker@northlink.com> * - * Fix to work around buggy BIOSes which dont use carry bit correctly + * Fix to work around buggy BIOSes which don't use carry bit correctly * and/or report extended memory in CX/DX for e801h memory size detection * call. As a result the kernel got wrong figures. 
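The e801h BIOS workaround described in the comment above (and continued just below) reduces to a simple C model. The register roles are standard: AX/CX count KB between 1MB and 16MB and BX/DX count 64KB blocks above 16MB; the CX/DX preference mirrors the fixup in meme801, but the function itself is illustrative, not kernel code.

/* Model of the int 15h/AX=e801h return contract plus the fixup:
 * buggy BIOSes report the sizes only in CX/DX, so prefer those
 * when they are non-zero. */
static unsigned long e801_total_kb(unsigned ax, unsigned bx,
                                   unsigned cx, unsigned dx)
{
    if (cx || dx) {     /* BIOS used the alternate registers */
        ax = cx;
        bx = dx;
    }
    return ax + 64ul * bx;  /* KB of memory above 1MB */
}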
The int15/e801h docs * from Ralf Brown interrupt list seem to indicate AX/BX should be used @@ -383,7 +383,7 @@ sse_ok: # a whole bunch of different types, and allows memory holes and # everything. We scan through this memory map and build a list # of the first 32 memory areas, which we return at [E820MAP]. -# This is documented at http://www.teleport.com/~acpi/acpihtml/topic245.htm +# This is documented at http://www.acpi.info/, in the ACPI 2.0 specification. #define SMAP 0x534d4150 @@ -436,7 +436,7 @@ bail820: meme801: stc # fix to work around buggy - xorw %cx,%cx # BIOSes which dont clear/set + xorw %cx,%cx # BIOSes which don't clear/set xorw %dx,%dx # carry on pass/error of # e801h memory size call # or merely pass cx,dx though @@ -733,7 +733,7 @@ flush_instr: # # but we yet haven't reloaded the CS register, so the default size # of the target offset still is 16 bit. -# However, using an operant prefix (0x66), the CPU will properly +# However, using an operand prefix (0x66), the CPU will properly # take our 48 bit far pointer. (INTeL 80386 Programmer's Reference # Manual, Mixing 16-bit and 32-bit code, page 16-6) diff --git a/arch/x86_64/boot/tools/build.c b/arch/x86_64/boot/tools/build.c index c2fa6631317..18b5bac1c42 100644 --- a/arch/x86_64/boot/tools/build.c +++ b/arch/x86_64/boot/tools/build.c @@ -1,6 +1,4 @@ /* - * $Id: build.c,v 1.3 2001/06/26 15:14:50 pavel Exp $ - * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 1997 Martin Mares */ @@ -8,7 +6,8 @@ /* * This file builds a disk-image from three different files: * - * - bootsect: exactly 512 bytes of 8086 machine code, loads the rest + * - bootsect: compatibility mbr which prints an error message if + * someone tries to boot the kernel directly. * - setup: 8086 machine code, sets up system parm * - system: 80386 code for actual system * diff --git a/arch/x86_64/crypto/Makefile b/arch/x86_64/crypto/Makefile new file mode 100644 index 00000000000..426d20f4b72 --- /dev/null +++ b/arch/x86_64/crypto/Makefile @@ -0,0 +1,9 @@ +# +# x86_64/crypto/Makefile +# +# Arch-specific CryptoAPI modules. +# + +obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o + +aes-x86_64-y := aes-x86_64-asm.o aes.o diff --git a/arch/x86_64/crypto/aes-x86_64-asm.S b/arch/x86_64/crypto/aes-x86_64-asm.S new file mode 100644 index 00000000000..483cbb23ab8 --- /dev/null +++ b/arch/x86_64/crypto/aes-x86_64-asm.S @@ -0,0 +1,186 @@ +/* AES (Rijndael) implementation (FIPS PUB 197) for x86_64 + * + * Copyright (C) 2005 Andreas Steinmetz, <ast@domdv.de> + * + * License: + * This code can be distributed under the terms of the GNU General Public + * License (GPL) Version 2 provided that the above header down to and + * including this sentence is retained in full. 
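For orientation before the macro-heavy assembler that follows: its prologue macro reads the context's key length and branches with cmpl $24 / jb / je to the 128-, 192- and 256-bit entry points. An illustrative C restatement (key_length is in bytes, as aes_set_key() stores it):

/* AES-128/192/256 use 10/12/14 rounds; this mirrors the
 * prologue's dispatch, nothing more. */
static int aes_rounds(unsigned int key_length_bytes)
{
    if (key_length_bytes < 24)   /* jb B128 */
        return 10;
    if (key_length_bytes == 24)  /* je B192 */
        return 12;
    return 14;
}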
+ */ + +.extern aes_ft_tab +.extern aes_it_tab +.extern aes_fl_tab +.extern aes_il_tab + +.text + +#define R1 %rax +#define R1E %eax +#define R1X %ax +#define R1H %ah +#define R1L %al +#define R2 %rbx +#define R2E %ebx +#define R2X %bx +#define R2H %bh +#define R2L %bl +#define R3 %rcx +#define R3E %ecx +#define R3X %cx +#define R3H %ch +#define R3L %cl +#define R4 %rdx +#define R4E %edx +#define R4X %dx +#define R4H %dh +#define R4L %dl +#define R5 %rsi +#define R5E %esi +#define R6 %rdi +#define R6E %edi +#define R7 %rbp +#define R7E %ebp +#define R8 %r8 +#define R9 %r9 +#define R10 %r10 +#define R11 %r11 + +#define prologue(FUNC,BASE,B128,B192,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11) \ + .global FUNC; \ + .type FUNC,@function; \ + .align 8; \ +FUNC: movq r1,r2; \ + movq r3,r4; \ + leaq BASE+52(r8),r9; \ + movq r10,r11; \ + movl (r7),r5 ## E; \ + movl 4(r7),r1 ## E; \ + movl 8(r7),r6 ## E; \ + movl 12(r7),r7 ## E; \ + movl (r8),r10 ## E; \ + xorl -48(r9),r5 ## E; \ + xorl -44(r9),r1 ## E; \ + xorl -40(r9),r6 ## E; \ + xorl -36(r9),r7 ## E; \ + cmpl $24,r10 ## E; \ + jb B128; \ + leaq 32(r9),r9; \ + je B192; \ + leaq 32(r9),r9; + +#define epilogue(r1,r2,r3,r4,r5,r6,r7,r8,r9) \ + movq r1,r2; \ + movq r3,r4; \ + movl r5 ## E,(r9); \ + movl r6 ## E,4(r9); \ + movl r7 ## E,8(r9); \ + movl r8 ## E,12(r9); \ + ret; + +#define round(TAB,OFFSET,r1,r2,r3,r4,r5,r6,r7,r8,ra,rb,rc,rd) \ + movzbl r2 ## H,r5 ## E; \ + movzbl r2 ## L,r6 ## E; \ + movl TAB+1024(,r5,4),r5 ## E;\ + movw r4 ## X,r2 ## X; \ + movl TAB(,r6,4),r6 ## E; \ + roll $16,r2 ## E; \ + shrl $16,r4 ## E; \ + movzbl r4 ## H,r7 ## E; \ + movzbl r4 ## L,r4 ## E; \ + xorl OFFSET(r8),ra ## E; \ + xorl OFFSET+4(r8),rb ## E; \ + xorl TAB+3072(,r7,4),r5 ## E;\ + xorl TAB+2048(,r4,4),r6 ## E;\ + movzbl r1 ## L,r7 ## E; \ + movzbl r1 ## H,r4 ## E; \ + movl TAB+1024(,r4,4),r4 ## E;\ + movw r3 ## X,r1 ## X; \ + roll $16,r1 ## E; \ + shrl $16,r3 ## E; \ + xorl TAB(,r7,4),r5 ## E; \ + movzbl r3 ## H,r7 ## E; \ + movzbl r3 ## L,r3 ## E; \ + xorl TAB+3072(,r7,4),r4 ## E;\ + xorl TAB+2048(,r3,4),r5 ## E;\ + movzbl r1 ## H,r7 ## E; \ + movzbl r1 ## L,r3 ## E; \ + shrl $16,r1 ## E; \ + xorl TAB+3072(,r7,4),r6 ## E;\ + movl TAB+2048(,r3,4),r3 ## E;\ + movzbl r1 ## H,r7 ## E; \ + movzbl r1 ## L,r1 ## E; \ + xorl TAB+1024(,r7,4),r6 ## E;\ + xorl TAB(,r1,4),r3 ## E; \ + movzbl r2 ## H,r1 ## E; \ + movzbl r2 ## L,r7 ## E; \ + shrl $16,r2 ## E; \ + xorl TAB+3072(,r1,4),r3 ## E;\ + xorl TAB+2048(,r7,4),r4 ## E;\ + movzbl r2 ## H,r1 ## E; \ + movzbl r2 ## L,r2 ## E; \ + xorl OFFSET+8(r8),rc ## E; \ + xorl OFFSET+12(r8),rd ## E; \ + xorl TAB+1024(,r1,4),r3 ## E;\ + xorl TAB(,r2,4),r4 ## E; + +#define move_regs(r1,r2,r3,r4) \ + movl r3 ## E,r1 ## E; \ + movl r4 ## E,r2 ## E; + +#define entry(FUNC,BASE,B128,B192) \ + prologue(FUNC,BASE,B128,B192,R2,R8,R7,R9,R1,R3,R4,R6,R10,R5,R11) + +#define return epilogue(R8,R2,R9,R7,R5,R6,R3,R4,R11) + +#define encrypt_round(TAB,OFFSET) \ + round(TAB,OFFSET,R1,R2,R3,R4,R5,R6,R7,R10,R5,R6,R3,R4) \ + move_regs(R1,R2,R5,R6) + +#define encrypt_final(TAB,OFFSET) \ + round(TAB,OFFSET,R1,R2,R3,R4,R5,R6,R7,R10,R5,R6,R3,R4) + +#define decrypt_round(TAB,OFFSET) \ + round(TAB,OFFSET,R2,R1,R4,R3,R6,R5,R7,R10,R5,R6,R3,R4) \ + move_regs(R1,R2,R5,R6) + +#define decrypt_final(TAB,OFFSET) \ + round(TAB,OFFSET,R2,R1,R4,R3,R6,R5,R7,R10,R5,R6,R3,R4) + +/* void aes_encrypt(void *ctx, u8 *out, const u8 *in) */ + + entry(aes_encrypt,0,enc128,enc192) + encrypt_round(aes_ft_tab,-96) + encrypt_round(aes_ft_tab,-80) +enc192: encrypt_round(aes_ft_tab,-64) + 
encrypt_round(aes_ft_tab,-48) +enc128: encrypt_round(aes_ft_tab,-32) + encrypt_round(aes_ft_tab,-16) + encrypt_round(aes_ft_tab, 0) + encrypt_round(aes_ft_tab, 16) + encrypt_round(aes_ft_tab, 32) + encrypt_round(aes_ft_tab, 48) + encrypt_round(aes_ft_tab, 64) + encrypt_round(aes_ft_tab, 80) + encrypt_round(aes_ft_tab, 96) + encrypt_final(aes_fl_tab,112) + return + +/* void aes_decrypt(void *ctx, u8 *out, const u8 *in) */ + + entry(aes_decrypt,240,dec128,dec192) + decrypt_round(aes_it_tab,-96) + decrypt_round(aes_it_tab,-80) +dec192: decrypt_round(aes_it_tab,-64) + decrypt_round(aes_it_tab,-48) +dec128: decrypt_round(aes_it_tab,-32) + decrypt_round(aes_it_tab,-16) + decrypt_round(aes_it_tab, 0) + decrypt_round(aes_it_tab, 16) + decrypt_round(aes_it_tab, 32) + decrypt_round(aes_it_tab, 48) + decrypt_round(aes_it_tab, 64) + decrypt_round(aes_it_tab, 80) + decrypt_round(aes_it_tab, 96) + decrypt_final(aes_il_tab,112) + return diff --git a/arch/x86_64/crypto/aes.c b/arch/x86_64/crypto/aes.c new file mode 100644 index 00000000000..acfdaa28791 --- /dev/null +++ b/arch/x86_64/crypto/aes.c @@ -0,0 +1,325 @@ +/* + * Cryptographic API. + * + * AES Cipher Algorithm. + * + * Based on Brian Gladman's code. + * + * Linux developers: + * Alexander Kjeldaas <astor@fast.no> + * Herbert Valerio Riedel <hvr@hvrlab.org> + * Kyle McMartin <kyle@debian.org> + * Adam J. Richter <adam@yggdrasil.com> (conversion to 2.5 API). + * Andreas Steinmetz <ast@domdv.de> (adapted to x86_64 assembler) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * --------------------------------------------------------------------------- + * Copyright (c) 2002, Dr Brian Gladman <brg@gladman.me.uk>, Worcester, UK. + * All rights reserved. + * + * LICENSE TERMS + * + * The free distribution and use of this software in both source and binary + * form is allowed (with or without changes) provided that: + * + * 1. distributions of this source code include the above copyright + * notice, this list of conditions and the following disclaimer; + * + * 2. distributions in binary form include the above copyright + * notice, this list of conditions and the following disclaimer + * in the documentation and/or other associated materials; + * + * 3. the copyright holder's name is not used to endorse products + * built using this software without specific written permission. + * + * ALTERNATIVELY, provided that this notice is retained in full, this product + * may be distributed under the terms of the GNU General Public License (GPL), + * in which case the provisions of the GPL apply INSTEAD OF those given above. + * + * DISCLAIMER + * + * This software is provided 'as is' with no explicit or implied warranties + * in respect of its properties, including, but not limited to, correctness + * and/or fitness for purpose. 
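The C half of the module, next, builds log/antilog tables over GF(2^8) with the AES polynomial 0x11b so that f_mult() can multiply by table lookup. For reference, the same product computed directly, as a standalone sketch rather than the kernel's code path:

/* Shift-and-add multiplication in GF(2^8) modulo
 * x^8 + x^4 + x^3 + x + 1 (0x11b). pow_tab/log_tab in gen_tabs()
 * precompute exactly this operation. */
static unsigned char gf_mul(unsigned char a, unsigned char b)
{
    unsigned char r = 0;

    while (b) {
        if (b & 1)
            r ^= a;
        /* multiply a by x, reducing when the high bit falls out */
        a = (unsigned char)((a << 1) ^ ((a & 0x80) ? 0x1b : 0));
        b >>= 1;
    }
    return r;
}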
+ * --------------------------------------------------------------------------- + */ + +/* Some changes from the Gladman version: + s/RIJNDAEL(e_key)/E_KEY/g + s/RIJNDAEL(d_key)/D_KEY/g +*/ + +#include <asm/byteorder.h> +#include <linux/bitops.h> +#include <linux/crypto.h> +#include <linux/errno.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/types.h> + +#define AES_MIN_KEY_SIZE 16 +#define AES_MAX_KEY_SIZE 32 + +#define AES_BLOCK_SIZE 16 + +/* + * #define byte(x, nr) ((unsigned char)((x) >> (nr*8))) + */ +static inline u8 byte(const u32 x, const unsigned n) +{ + return x >> (n << 3); +} + +#define u32_in(x) le32_to_cpu(*(const __le32 *)(x)) + +struct aes_ctx +{ + u32 key_length; + u32 E[60]; + u32 D[60]; +}; + +#define E_KEY ctx->E +#define D_KEY ctx->D + +static u8 pow_tab[256] __initdata; +static u8 log_tab[256] __initdata; +static u8 sbx_tab[256] __initdata; +static u8 isb_tab[256] __initdata; +static u32 rco_tab[10]; +u32 aes_ft_tab[4][256]; +u32 aes_it_tab[4][256]; + +u32 aes_fl_tab[4][256]; +u32 aes_il_tab[4][256]; + +static inline u8 f_mult(u8 a, u8 b) +{ + u8 aa = log_tab[a], cc = aa + log_tab[b]; + + return pow_tab[cc + (cc < aa ? 1 : 0)]; +} + +#define ff_mult(a, b) (a && b ? f_mult(a, b) : 0) + +#define ls_box(x) \ + (aes_fl_tab[0][byte(x, 0)] ^ \ + aes_fl_tab[1][byte(x, 1)] ^ \ + aes_fl_tab[2][byte(x, 2)] ^ \ + aes_fl_tab[3][byte(x, 3)]) + +static void __init gen_tabs(void) +{ + u32 i, t; + u8 p, q; + + /* log and power tables for GF(2**8) finite field with + 0x011b as modular polynomial - the simplest primitive + root is 0x03, used here to generate the tables */ + + for (i = 0, p = 1; i < 256; ++i) { + pow_tab[i] = (u8)p; + log_tab[p] = (u8)i; + + p ^= (p << 1) ^ (p & 0x80 ? 0x01b : 0); + } + + log_tab[1] = 0; + + for (i = 0, p = 1; i < 10; ++i) { + rco_tab[i] = p; + + p = (p << 1) ^ (p & 0x80 ? 0x01b : 0); + } + + for (i = 0; i < 256; ++i) { + p = (i ? 
pow_tab[255 - log_tab[i]] : 0); + q = ((p >> 7) | (p << 1)) ^ ((p >> 6) | (p << 2)); + p ^= 0x63 ^ q ^ ((q >> 6) | (q << 2)); + sbx_tab[i] = p; + isb_tab[p] = (u8)i; + } + + for (i = 0; i < 256; ++i) { + p = sbx_tab[i]; + + t = p; + aes_fl_tab[0][i] = t; + aes_fl_tab[1][i] = rol32(t, 8); + aes_fl_tab[2][i] = rol32(t, 16); + aes_fl_tab[3][i] = rol32(t, 24); + + t = ((u32)ff_mult(2, p)) | + ((u32)p << 8) | + ((u32)p << 16) | ((u32)ff_mult(3, p) << 24); + + aes_ft_tab[0][i] = t; + aes_ft_tab[1][i] = rol32(t, 8); + aes_ft_tab[2][i] = rol32(t, 16); + aes_ft_tab[3][i] = rol32(t, 24); + + p = isb_tab[i]; + + t = p; + aes_il_tab[0][i] = t; + aes_il_tab[1][i] = rol32(t, 8); + aes_il_tab[2][i] = rol32(t, 16); + aes_il_tab[3][i] = rol32(t, 24); + + t = ((u32)ff_mult(14, p)) | + ((u32)ff_mult(9, p) << 8) | + ((u32)ff_mult(13, p) << 16) | + ((u32)ff_mult(11, p) << 24); + + aes_it_tab[0][i] = t; + aes_it_tab[1][i] = rol32(t, 8); + aes_it_tab[2][i] = rol32(t, 16); + aes_it_tab[3][i] = rol32(t, 24); + } +} + +#define star_x(x) (((x) & 0x7f7f7f7f) << 1) ^ ((((x) & 0x80808080) >> 7) * 0x1b) + +#define imix_col(y, x) \ + u = star_x(x); \ + v = star_x(u); \ + w = star_x(v); \ + t = w ^ (x); \ + (y) = u ^ v ^ w; \ + (y) ^= ror32(u ^ t, 8) ^ \ + ror32(v ^ t, 16) ^ \ + ror32(t, 24) + +/* initialise the key schedule from the user supplied key */ + +#define loop4(i) \ +{ \ + t = ror32(t, 8); t = ls_box(t) ^ rco_tab[i]; \ + t ^= E_KEY[4 * i]; E_KEY[4 * i + 4] = t; \ + t ^= E_KEY[4 * i + 1]; E_KEY[4 * i + 5] = t; \ + t ^= E_KEY[4 * i + 2]; E_KEY[4 * i + 6] = t; \ + t ^= E_KEY[4 * i + 3]; E_KEY[4 * i + 7] = t; \ +} + +#define loop6(i) \ +{ \ + t = ror32(t, 8); t = ls_box(t) ^ rco_tab[i]; \ + t ^= E_KEY[6 * i]; E_KEY[6 * i + 6] = t; \ + t ^= E_KEY[6 * i + 1]; E_KEY[6 * i + 7] = t; \ + t ^= E_KEY[6 * i + 2]; E_KEY[6 * i + 8] = t; \ + t ^= E_KEY[6 * i + 3]; E_KEY[6 * i + 9] = t; \ + t ^= E_KEY[6 * i + 4]; E_KEY[6 * i + 10] = t; \ + t ^= E_KEY[6 * i + 5]; E_KEY[6 * i + 11] = t; \ +} + +#define loop8(i) \ +{ \ + t = ror32(t, 8); ; t = ls_box(t) ^ rco_tab[i]; \ + t ^= E_KEY[8 * i]; E_KEY[8 * i + 8] = t; \ + t ^= E_KEY[8 * i + 1]; E_KEY[8 * i + 9] = t; \ + t ^= E_KEY[8 * i + 2]; E_KEY[8 * i + 10] = t; \ + t ^= E_KEY[8 * i + 3]; E_KEY[8 * i + 11] = t; \ + t = E_KEY[8 * i + 4] ^ ls_box(t); \ + E_KEY[8 * i + 12] = t; \ + t ^= E_KEY[8 * i + 5]; E_KEY[8 * i + 13] = t; \ + t ^= E_KEY[8 * i + 6]; E_KEY[8 * i + 14] = t; \ + t ^= E_KEY[8 * i + 7]; E_KEY[8 * i + 15] = t; \ +} + +static int aes_set_key(void *ctx_arg, const u8 *in_key, unsigned int key_len, + u32 *flags) +{ + struct aes_ctx *ctx = ctx_arg; + u32 i, j, t, u, v, w; + + if (key_len != 16 && key_len != 24 && key_len != 32) { + *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; + return -EINVAL; + } + + ctx->key_length = key_len; + + D_KEY[key_len + 24] = E_KEY[0] = u32_in(in_key); + D_KEY[key_len + 25] = E_KEY[1] = u32_in(in_key + 4); + D_KEY[key_len + 26] = E_KEY[2] = u32_in(in_key + 8); + D_KEY[key_len + 27] = E_KEY[3] = u32_in(in_key + 12); + + switch (key_len) { + case 16: + t = E_KEY[3]; + for (i = 0; i < 10; ++i) + loop4(i); + break; + + case 24: + E_KEY[4] = u32_in(in_key + 16); + t = E_KEY[5] = u32_in(in_key + 20); + for (i = 0; i < 8; ++i) + loop6 (i); + break; + + case 32: + E_KEY[4] = u32_in(in_key + 16); + E_KEY[5] = u32_in(in_key + 20); + E_KEY[6] = u32_in(in_key + 24); + t = E_KEY[7] = u32_in(in_key + 28); + for (i = 0; i < 7; ++i) + loop8(i); + break; + } + + D_KEY[0] = E_KEY[key_len + 24]; + D_KEY[1] = E_KEY[key_len + 25]; + D_KEY[2] = E_KEY[key_len + 26]; + D_KEY[3] = 
E_KEY[key_len + 27]; + + for (i = 4; i < key_len + 24; ++i) { + j = key_len + 24 - (i & ~3) + (i & 3); + imix_col(D_KEY[j], E_KEY[i]); + } + + return 0; +} + +extern void aes_encrypt(void *ctx_arg, u8 *out, const u8 *in); +extern void aes_decrypt(void *ctx_arg, u8 *out, const u8 *in); + +static struct crypto_alg aes_alg = { + .cra_name = "aes", + .cra_flags = CRYPTO_ALG_TYPE_CIPHER, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct aes_ctx), + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(aes_alg.cra_list), + .cra_u = { + .cipher = { + .cia_min_keysize = AES_MIN_KEY_SIZE, + .cia_max_keysize = AES_MAX_KEY_SIZE, + .cia_setkey = aes_set_key, + .cia_encrypt = aes_encrypt, + .cia_decrypt = aes_decrypt + } + } +}; + +static int __init aes_init(void) +{ + gen_tabs(); + return crypto_register_alg(&aes_alg); +} + +static void __exit aes_fini(void) +{ + crypto_unregister_alg(&aes_alg); +} + +module_init(aes_init); +module_exit(aes_fini); + +MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("aes"); diff --git a/arch/x86_64/defconfig b/arch/x86_64/defconfig index 569595b74c7..776f3c866b7 100644 --- a/arch/x86_64/defconfig +++ b/arch/x86_64/defconfig @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.12-rc4 -# Fri May 13 06:39:11 2005 +# Linux kernel version: 2.6.13-rc3 +# Fri Jul 22 16:47:31 2005 # CONFIG_X86_64=y CONFIG_64BIT=y @@ -84,14 +84,27 @@ CONFIG_X86_IO_APIC=y CONFIG_X86_LOCAL_APIC=y CONFIG_MTRR=y CONFIG_SMP=y -# CONFIG_PREEMPT is not set CONFIG_SCHED_SMT=y +CONFIG_PREEMPT_NONE=y +# CONFIG_PREEMPT_VOLUNTARY is not set +# CONFIG_PREEMPT is not set +CONFIG_PREEMPT_BKL=y CONFIG_K8_NUMA=y # CONFIG_NUMA_EMU is not set -CONFIG_DISCONTIGMEM=y +CONFIG_ARCH_DISCONTIGMEM_ENABLE=y CONFIG_NUMA=y +CONFIG_ARCH_DISCONTIGMEM_DEFAULT=y +CONFIG_ARCH_SPARSEMEM_ENABLE=y +CONFIG_SELECT_MEMORY_MODEL=y +# CONFIG_FLATMEM_MANUAL is not set +CONFIG_DISCONTIGMEM_MANUAL=y +# CONFIG_SPARSEMEM_MANUAL is not set +CONFIG_DISCONTIGMEM=y +CONFIG_FLAT_NODE_MEM_MAP=y +CONFIG_NEED_MULTIPLE_NODES=y +CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID=y CONFIG_HAVE_DEC_LOCK=y -CONFIG_NR_CPUS=8 +CONFIG_NR_CPUS=32 CONFIG_HPET_TIMER=y CONFIG_X86_PM_TIMER=y CONFIG_HPET_EMULATE_RTC=y @@ -99,7 +112,13 @@ CONFIG_GART_IOMMU=y CONFIG_SWIOTLB=y CONFIG_X86_MCE=y CONFIG_X86_MCE_INTEL=y +CONFIG_PHYSICAL_START=0x100000 +# CONFIG_KEXEC is not set CONFIG_SECCOMP=y +# CONFIG_HZ_100 is not set +CONFIG_HZ_250=y +# CONFIG_HZ_1000 is not set +CONFIG_HZ=250 CONFIG_GENERIC_HARDIRQS=y CONFIG_GENERIC_IRQ_PROBE=y CONFIG_ISA_DMA_API=y @@ -118,12 +137,11 @@ CONFIG_PM_STD_PARTITION="" CONFIG_ACPI=y CONFIG_ACPI_BOOT=y CONFIG_ACPI_INTERPRETER=y -CONFIG_ACPI_SLEEP=y -CONFIG_ACPI_SLEEP_PROC_FS=y CONFIG_ACPI_AC=y CONFIG_ACPI_BATTERY=y CONFIG_ACPI_BUTTON=y # CONFIG_ACPI_VIDEO is not set +CONFIG_ACPI_HOTKEY=m CONFIG_ACPI_FAN=y CONFIG_ACPI_PROCESSOR=y CONFIG_ACPI_THERMAL=y @@ -154,6 +172,7 @@ CONFIG_CPU_FREQ_GOV_PERFORMANCE=y # CONFIG_CPU_FREQ_GOV_POWERSAVE is not set CONFIG_CPU_FREQ_GOV_USERSPACE=y CONFIG_CPU_FREQ_GOV_ONDEMAND=y +# CONFIG_CPU_FREQ_GOV_CONSERVATIVE is not set # # CPUFreq processor drivers @@ -204,6 +223,76 @@ CONFIG_SYSVIPC_COMPAT=y CONFIG_UID16=y # +# Networking +# +CONFIG_NET=y + +# +# Networking options +# +CONFIG_PACKET=y +# CONFIG_PACKET_MMAP is not set +CONFIG_UNIX=y +# CONFIG_NET_KEY is not set +CONFIG_INET=y +CONFIG_IP_MULTICAST=y +# CONFIG_IP_ADVANCED_ROUTER is not set +CONFIG_IP_FIB_HASH=y +# CONFIG_IP_PNP is not set +# CONFIG_NET_IPIP is not 
set +# CONFIG_NET_IPGRE is not set +# CONFIG_IP_MROUTE is not set +# CONFIG_ARPD is not set +# CONFIG_SYN_COOKIES is not set +# CONFIG_INET_AH is not set +# CONFIG_INET_ESP is not set +# CONFIG_INET_IPCOMP is not set +# CONFIG_INET_TUNNEL is not set +CONFIG_IP_TCPDIAG=y +CONFIG_IP_TCPDIAG_IPV6=y +# CONFIG_TCP_CONG_ADVANCED is not set +CONFIG_TCP_CONG_BIC=y +CONFIG_IPV6=y +# CONFIG_IPV6_PRIVACY is not set +# CONFIG_INET6_AH is not set +# CONFIG_INET6_ESP is not set +# CONFIG_INET6_IPCOMP is not set +# CONFIG_INET6_TUNNEL is not set +# CONFIG_IPV6_TUNNEL is not set +# CONFIG_NETFILTER is not set + +# +# SCTP Configuration (EXPERIMENTAL) +# +# CONFIG_IP_SCTP is not set +# CONFIG_ATM is not set +# CONFIG_BRIDGE is not set +# CONFIG_VLAN_8021Q is not set +# CONFIG_DECNET is not set +# CONFIG_LLC2 is not set +# CONFIG_IPX is not set +# CONFIG_ATALK is not set +# CONFIG_X25 is not set +# CONFIG_LAPB is not set +# CONFIG_NET_DIVERT is not set +# CONFIG_ECONET is not set +# CONFIG_WAN_ROUTER is not set +# CONFIG_NET_SCHED is not set +# CONFIG_NET_CLS_ROUTE is not set + +# +# Network testing +# +# CONFIG_NET_PKTGEN is not set +CONFIG_NETPOLL=y +# CONFIG_NETPOLL_RX is not set +# CONFIG_NETPOLL_TRAP is not set +CONFIG_NET_POLL_CONTROLLER=y +# CONFIG_HAMRADIO is not set +# CONFIG_IRDA is not set +# CONFIG_BT is not set + +# # Device Drivers # @@ -308,6 +397,7 @@ CONFIG_BLK_DEV_AMD74XX=y # CONFIG_BLK_DEV_HPT366 is not set # CONFIG_BLK_DEV_SC1200 is not set CONFIG_BLK_DEV_PIIX=y +# CONFIG_BLK_DEV_IT821X is not set # CONFIG_BLK_DEV_NS87415 is not set # CONFIG_BLK_DEV_PDC202XX_OLD is not set CONFIG_BLK_DEV_PDC202XX_NEW=y @@ -338,6 +428,7 @@ CONFIG_BLK_DEV_SD=y # CONFIG_CHR_DEV_OSST is not set # CONFIG_BLK_DEV_SR is not set # CONFIG_CHR_DEV_SG is not set +# CONFIG_CHR_DEV_SCH is not set # # Some SCSI devices (e.g. 
CD jukebox) support multiple LUNs @@ -372,7 +463,6 @@ CONFIG_AIC79XX_DEBUG_MASK=0 # CONFIG_MEGARAID_NEWGEN is not set # CONFIG_MEGARAID_LEGACY is not set CONFIG_SCSI_SATA=y -# CONFIG_SCSI_SATA_AHCI is not set # CONFIG_SCSI_SATA_SVW is not set CONFIG_SCSI_ATA_PIIX=y # CONFIG_SCSI_SATA_NV is not set @@ -410,14 +500,21 @@ CONFIG_SCSI_QLA2XXX=y # # Multi-device support (RAID and LVM) # -# CONFIG_MD is not set +CONFIG_MD=y +# CONFIG_BLK_DEV_MD is not set +CONFIG_BLK_DEV_DM=y +# CONFIG_DM_CRYPT is not set +# CONFIG_DM_SNAPSHOT is not set +# CONFIG_DM_MIRROR is not set +# CONFIG_DM_ZERO is not set +# CONFIG_DM_MULTIPATH is not set # # Fusion MPT device support # -CONFIG_FUSION=y -CONFIG_FUSION_MAX_SGE=40 -# CONFIG_FUSION_CTL is not set +# CONFIG_FUSION is not set +# CONFIG_FUSION_SPI is not set +# CONFIG_FUSION_FC is not set # # IEEE 1394 (FireWire) support @@ -430,75 +527,8 @@ CONFIG_FUSION_MAX_SGE=40 # CONFIG_I2O is not set # -# Networking support -# -CONFIG_NET=y - -# -# Networking options -# -CONFIG_PACKET=y -# CONFIG_PACKET_MMAP is not set -CONFIG_UNIX=y -# CONFIG_NET_KEY is not set -CONFIG_INET=y -CONFIG_IP_MULTICAST=y -# CONFIG_IP_ADVANCED_ROUTER is not set -# CONFIG_IP_PNP is not set -# CONFIG_NET_IPIP is not set -# CONFIG_NET_IPGRE is not set -# CONFIG_IP_MROUTE is not set -# CONFIG_ARPD is not set -# CONFIG_SYN_COOKIES is not set -# CONFIG_INET_AH is not set -# CONFIG_INET_ESP is not set -# CONFIG_INET_IPCOMP is not set -# CONFIG_INET_TUNNEL is not set -CONFIG_IP_TCPDIAG=y -CONFIG_IP_TCPDIAG_IPV6=y -CONFIG_IPV6=y -# CONFIG_IPV6_PRIVACY is not set -# CONFIG_INET6_AH is not set -# CONFIG_INET6_ESP is not set -# CONFIG_INET6_IPCOMP is not set -# CONFIG_INET6_TUNNEL is not set -# CONFIG_IPV6_TUNNEL is not set -# CONFIG_NETFILTER is not set - -# -# SCTP Configuration (EXPERIMENTAL) +# Network device support # -# CONFIG_IP_SCTP is not set -# CONFIG_ATM is not set -# CONFIG_BRIDGE is not set -# CONFIG_VLAN_8021Q is not set -# CONFIG_DECNET is not set -# CONFIG_LLC2 is not set -# CONFIG_IPX is not set -# CONFIG_ATALK is not set -# CONFIG_X25 is not set -# CONFIG_LAPB is not set -# CONFIG_NET_DIVERT is not set -# CONFIG_ECONET is not set -# CONFIG_WAN_ROUTER is not set - -# -# QoS and/or fair queueing -# -# CONFIG_NET_SCHED is not set -# CONFIG_NET_CLS_ROUTE is not set - -# -# Network testing -# -# CONFIG_NET_PKTGEN is not set -CONFIG_NETPOLL=y -# CONFIG_NETPOLL_RX is not set -# CONFIG_NETPOLL_TRAP is not set -CONFIG_NET_POLL_CONTROLLER=y -# CONFIG_HAMRADIO is not set -# CONFIG_IRDA is not set -# CONFIG_BT is not set CONFIG_NETDEVICES=y # CONFIG_DUMMY is not set # CONFIG_BONDING is not set @@ -517,7 +547,9 @@ CONFIG_NET_ETHERNET=y CONFIG_MII=y # CONFIG_HAPPYMEAL is not set # CONFIG_SUNGEM is not set -# CONFIG_NET_VENDOR_3COM is not set +CONFIG_NET_VENDOR_3COM=y +CONFIG_VORTEX=y +# CONFIG_TYPHOON is not set # # Tulip family network device support @@ -532,7 +564,7 @@ CONFIG_NET_PCI=y CONFIG_FORCEDETH=y # CONFIG_DGRS is not set # CONFIG_EEPRO100 is not set -# CONFIG_E100 is not set +CONFIG_E100=y # CONFIG_FEALNX is not set # CONFIG_NATSEMI is not set # CONFIG_NE2K_PCI is not set @@ -553,14 +585,15 @@ CONFIG_8139TOO=y # CONFIG_ACENIC is not set # CONFIG_DL2K is not set CONFIG_E1000=y -# CONFIG_E1000_NAPI is not set # CONFIG_NS83820 is not set # CONFIG_HAMACHI is not set # CONFIG_YELLOWFIN is not set # CONFIG_R8169 is not set +# CONFIG_SKGE is not set # CONFIG_SK98LIN is not set # CONFIG_VIA_VELOCITY is not set CONFIG_TIGON3=y +# CONFIG_BNX2 is not set # # Ethernet (10000 Mbit) @@ -647,7 +680,6 @@ 
CONFIG_SERIO_I8042=y CONFIG_SERIO_LIBPS2=y # CONFIG_SERIO_RAW is not set # CONFIG_GAMEPORT is not set -CONFIG_SOUND_GAMEPORT=y # # Character devices @@ -716,6 +748,7 @@ CONFIG_MAX_RAW_DEVS=256 # I2C support # # CONFIG_I2C is not set +# CONFIG_I2C_SENSOR is not set # # Dallas's 1-wire bus @@ -723,6 +756,12 @@ CONFIG_MAX_RAW_DEVS=256 # CONFIG_W1 is not set # +# Hardware Monitoring support +# +CONFIG_HWMON=y +# CONFIG_HWMON_DEBUG_CHIP is not set + +# # Misc devices # # CONFIG_IBM_ASM is not set @@ -808,6 +847,7 @@ CONFIG_USB_DEVICEFS=y CONFIG_USB_EHCI_HCD=y # CONFIG_USB_EHCI_SPLIT_ISO is not set # CONFIG_USB_EHCI_ROOT_HUB_TT is not set +# CONFIG_USB_ISP116X_HCD is not set CONFIG_USB_OHCI_HCD=y # CONFIG_USB_OHCI_BIG_ENDIAN is not set CONFIG_USB_OHCI_LITTLE_ENDIAN=y @@ -846,12 +886,15 @@ CONFIG_USB_HIDINPUT=y # CONFIG_USB_HIDDEV is not set # CONFIG_USB_AIPTEK is not set # CONFIG_USB_WACOM is not set +# CONFIG_USB_ACECAD is not set # CONFIG_USB_KBTAB is not set # CONFIG_USB_POWERMATE is not set # CONFIG_USB_MTOUCH is not set +# CONFIG_USB_ITMTOUCH is not set # CONFIG_USB_EGALAX is not set # CONFIG_USB_XPAD is not set # CONFIG_USB_ATI_REMOTE is not set +# CONFIG_USB_KEYSPAN_REMOTE is not set # # USB Imaging devices @@ -902,10 +945,11 @@ CONFIG_USB_MON=y # CONFIG_USB_PHIDGETSERVO is not set # CONFIG_USB_IDMOUSE is not set # CONFIG_USB_SISUSBVGA is not set +# CONFIG_USB_LD is not set # CONFIG_USB_TEST is not set # -# USB ATM/DSL drivers +# USB DSL modem support # # @@ -924,6 +968,10 @@ CONFIG_USB_MON=y # CONFIG_INFINIBAND is not set # +# SN Devices +# + +# # Firmware Drivers # # CONFIG_EDD is not set @@ -935,6 +983,7 @@ CONFIG_EXT2_FS=y CONFIG_EXT2_FS_XATTR=y CONFIG_EXT2_FS_POSIX_ACL=y # CONFIG_EXT2_FS_SECURITY is not set +# CONFIG_EXT2_FS_XIP is not set CONFIG_EXT3_FS=y CONFIG_EXT3_FS_XATTR=y CONFIG_EXT3_FS_POSIX_ACL=y @@ -957,6 +1006,7 @@ CONFIG_FS_POSIX_ACL=y # CONFIG_XFS_FS is not set # CONFIG_MINIX_FS is not set # CONFIG_ROMFS_FS is not set +CONFIG_INOTIFY=y # CONFIG_QUOTA is not set CONFIG_DNOTIFY=y CONFIG_AUTOFS_FS=y @@ -986,7 +1036,6 @@ CONFIG_FAT_DEFAULT_IOCHARSET="iso8859-1" CONFIG_PROC_FS=y CONFIG_PROC_KCORE=y CONFIG_SYSFS=y -# CONFIG_DEVFS_FS is not set # CONFIG_DEVPTS_FS_XATTR is not set CONFIG_TMPFS=y # CONFIG_TMPFS_XATTR is not set @@ -1016,15 +1065,18 @@ CONFIG_RAMFS=y # CONFIG_NFS_FS=y CONFIG_NFS_V3=y +# CONFIG_NFS_V3_ACL is not set # CONFIG_NFS_V4 is not set # CONFIG_NFS_DIRECTIO is not set CONFIG_NFSD=y CONFIG_NFSD_V3=y +# CONFIG_NFSD_V3_ACL is not set # CONFIG_NFSD_V4 is not set CONFIG_NFSD_TCP=y CONFIG_LOCKD=y CONFIG_LOCKD_V4=y CONFIG_EXPORTFS=y +CONFIG_NFS_COMMON=y CONFIG_SUNRPC=y # CONFIG_RPCSEC_GSS_KRB5 is not set # CONFIG_RPCSEC_GSS_SPKM3 is not set diff --git a/arch/x86_64/ia32/Makefile b/arch/x86_64/ia32/Makefile index a12b19da4b5..f76217d8f57 100644 --- a/arch/x86_64/ia32/Makefile +++ b/arch/x86_64/ia32/Makefile @@ -4,14 +4,14 @@ obj-$(CONFIG_IA32_EMULATION) := ia32entry.o sys_ia32.o ia32_ioctl.o \ ia32_signal.o tls32.o \ - ia32_binfmt.o fpu32.o ptrace32.o syscall32.o + ia32_binfmt.o fpu32.o ptrace32.o syscall32.o syscall32_syscall.o sysv-$(CONFIG_SYSVIPC) := ipc32.o obj-$(CONFIG_IA32_EMULATION) += $(sysv-y) obj-$(CONFIG_IA32_AOUT) += ia32_aout.o -$(obj)/syscall32.o: $(src)/syscall32.c \ +$(obj)/syscall32_syscall.o: \ $(foreach F,sysenter syscall,$(obj)/vsyscall-$F.so) # Teach kbuild about targets diff --git a/arch/x86_64/ia32/ia32_aout.c b/arch/x86_64/ia32/ia32_aout.c index c12edf5d97f..3e6780fa018 100644 --- a/arch/x86_64/ia32/ia32_aout.c +++ 
b/arch/x86_64/ia32/ia32_aout.c @@ -42,7 +42,7 @@ extern int ia32_setup_arg_pages(struct linux_binprm *bprm, static int load_aout_binary(struct linux_binprm *, struct pt_regs * regs); static int load_aout_library(struct file*); -#if CORE_DUMP +#ifdef CORE_DUMP static int aout_core_dump(long signr, struct pt_regs * regs, struct file *file); /* @@ -103,7 +103,7 @@ static struct linux_binfmt aout_format = { .module = THIS_MODULE, .load_binary = load_aout_binary, .load_shlib = load_aout_library, -#if CORE_DUMP +#ifdef CORE_DUMP .core_dump = aout_core_dump, #endif .min_coredump = PAGE_SIZE @@ -120,7 +120,7 @@ static void set_brk(unsigned long start, unsigned long end) up_write(¤t->mm->mmap_sem); } -#if CORE_DUMP +#ifdef CORE_DUMP /* * These are the only things you should do on a core-file: use only these * macros to write out all the necessary info. diff --git a/arch/x86_64/ia32/ia32_signal.c b/arch/x86_64/ia32/ia32_signal.c index fbd09b5126c..66e2821533d 100644 --- a/arch/x86_64/ia32/ia32_signal.c +++ b/arch/x86_64/ia32/ia32_signal.c @@ -428,8 +428,8 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs * regs, size_t frame_size) return (void __user *)((rsp - frame_size) & -8UL); } -void ia32_setup_frame(int sig, struct k_sigaction *ka, - compat_sigset_t *set, struct pt_regs * regs) +int ia32_setup_frame(int sig, struct k_sigaction *ka, + compat_sigset_t *set, struct pt_regs * regs) { struct sigframe __user *frame; int err = 0; @@ -514,14 +514,15 @@ void ia32_setup_frame(int sig, struct k_sigaction *ka, current->comm, current->pid, frame, regs->rip, frame->pretcode); #endif - return; + return 1; give_sigsegv: force_sigsegv(sig, current); + return 0; } -void ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, - compat_sigset_t *set, struct pt_regs * regs) +int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, + compat_sigset_t *set, struct pt_regs * regs) { struct rt_sigframe __user *frame; int err = 0; @@ -613,9 +614,9 @@ void ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, current->comm, current->pid, frame, regs->rip, frame->pretcode); #endif - return; + return 1; give_sigsegv: force_sigsegv(sig, current); + return 0; } - diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S index f3ca0db85b5..c45d6a05b98 100644 --- a/arch/x86_64/ia32/ia32entry.S +++ b/arch/x86_64/ia32/ia32entry.S @@ -589,13 +589,17 @@ ia32_sys_call_table: .quad compat_sys_mq_timedreceive /* 280 */ .quad compat_sys_mq_notify .quad compat_sys_mq_getsetattr - .quad quiet_ni_syscall /* reserved for kexec */ + .quad compat_sys_kexec_load /* reserved for kexec */ .quad compat_sys_waitid - .quad quiet_ni_syscall /* sys_altroot */ + .quad quiet_ni_syscall /* 285: sys_altroot */ .quad sys_add_key .quad sys_request_key .quad sys_keyctl - /* don't forget to change IA32_NR_syscalls */ + .quad sys_ioprio_set + .quad sys_ioprio_get /* 290 */ + .quad sys_inotify_init + .quad sys_inotify_add_watch + .quad sys_inotify_rm_watch ia32_syscall_end: .rept IA32_NR_syscalls-(ia32_syscall_end-ia32_sys_call_table)/8 .quad ni_syscall diff --git a/arch/x86_64/ia32/ptrace32.c b/arch/x86_64/ia32/ptrace32.c index b98b6d2462f..2a925e2af39 100644 --- a/arch/x86_64/ia32/ptrace32.c +++ b/arch/x86_64/ia32/ptrace32.c @@ -43,11 +43,11 @@ static int putreg32(struct task_struct *child, unsigned regno, u32 val) switch (regno) { case offsetof(struct user32, regs.fs): if (val && (val & 3) != 3) return -EIO; - child->thread.fs = val & 0xffff; + child->thread.fsindex = val & 0xffff; break; 
case offsetof(struct user32, regs.gs): if (val && (val & 3) != 3) return -EIO; - child->thread.gs = val & 0xffff; + child->thread.gsindex = val & 0xffff; break; case offsetof(struct user32, regs.ds): if (val && (val & 3) != 3) return -EIO; @@ -138,10 +138,10 @@ static int getreg32(struct task_struct *child, unsigned regno, u32 *val) switch (regno) { case offsetof(struct user32, regs.fs): - *val = child->thread.fs; + *val = child->thread.fsindex; break; case offsetof(struct user32, regs.gs): - *val = child->thread.gs; + *val = child->thread.gsindex; break; case offsetof(struct user32, regs.ds): *val = child->thread.ds; diff --git a/arch/x86_64/ia32/sys_ia32.c b/arch/x86_64/ia32/sys_ia32.c index 68a9ab06ee7..be996d1b691 100644 --- a/arch/x86_64/ia32/sys_ia32.c +++ b/arch/x86_64/ia32/sys_ia32.c @@ -61,6 +61,7 @@ #include <linux/ptrace.h> #include <linux/highuid.h> #include <linux/vmalloc.h> +#include <linux/fsnotify.h> #include <asm/mman.h> #include <asm/types.h> #include <asm/uaccess.h> @@ -984,8 +985,10 @@ asmlinkage long sys32_open(const char __user * filename, int flags, int mode) if (IS_ERR(f)) { put_unused_fd(fd); fd = error; - } else + } else { + fsnotify_open(f->f_dentry); fd_install(fd, f); + } } putname(tmp); } diff --git a/arch/x86_64/ia32/syscall32.c b/arch/x86_64/ia32/syscall32.c index 01d8db1a1c0..adbc5f8089e 100644 --- a/arch/x86_64/ia32/syscall32.c +++ b/arch/x86_64/ia32/syscall32.c @@ -14,16 +14,6 @@ #include <asm/tlbflush.h> #include <asm/ia32_unistd.h> -/* 32bit VDSOs mapped into user space. */ -asm(".section \".init.data\",\"aw\"\n" - "syscall32_syscall:\n" - ".incbin \"arch/x86_64/ia32/vsyscall-syscall.so\"\n" - "syscall32_syscall_end:\n" - "syscall32_sysenter:\n" - ".incbin \"arch/x86_64/ia32/vsyscall-sysenter.so\"\n" - "syscall32_sysenter_end:\n" - ".previous"); - extern unsigned char syscall32_syscall[], syscall32_syscall_end[]; extern unsigned char syscall32_sysenter[], syscall32_sysenter_end[]; extern int sysctl_vsyscall32; @@ -57,6 +47,7 @@ int syscall32_setup_pages(struct linux_binprm *bprm, int exstack) int npages = (VSYSCALL32_END - VSYSCALL32_BASE) >> PAGE_SHIFT; struct vm_area_struct *vma; struct mm_struct *mm = current->mm; + int ret; vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); if (!vma) @@ -78,7 +69,11 @@ int syscall32_setup_pages(struct linux_binprm *bprm, int exstack) vma->vm_mm = mm; down_write(&mm->mmap_sem); - insert_vm_struct(mm, vma); + if ((ret = insert_vm_struct(mm, vma))) { + up_write(&mm->mmap_sem); + kmem_cache_free(vm_area_cachep, vma); + return ret; + } mm->total_vm += npages; up_write(&mm->mmap_sem); return 0; diff --git a/arch/x86_64/ia32/syscall32_syscall.S b/arch/x86_64/ia32/syscall32_syscall.S new file mode 100644 index 00000000000..8f8271bdf13 --- /dev/null +++ b/arch/x86_64/ia32/syscall32_syscall.S @@ -0,0 +1,17 @@ +/* 32bit VDSOs mapped into user space. 
*/ + + .section ".init.data","aw" + + .globl syscall32_syscall + .globl syscall32_syscall_end + +syscall32_syscall: + .incbin "arch/x86_64/ia32/vsyscall-syscall.so" +syscall32_syscall_end: + + .globl syscall32_sysenter + .globl syscall32_sysenter_end + +syscall32_sysenter: + .incbin "arch/x86_64/ia32/vsyscall-sysenter.so" +syscall32_sysenter_end: diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile index 5ca4a4598fd..48f9e2c19cd 100644 --- a/arch/x86_64/kernel/Makefile +++ b/arch/x86_64/kernel/Makefile @@ -20,6 +20,7 @@ obj-$(CONFIG_SMP) += smp.o smpboot.o trampoline.o obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o mpparse.o \ genapic.o genapic_cluster.o genapic_flat.o +obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o crash.o obj-$(CONFIG_PM) += suspend.o obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend_asm.o obj-$(CONFIG_CPU_FREQ) += cpufreq/ diff --git a/arch/x86_64/kernel/acpi/wakeup.S b/arch/x86_64/kernel/acpi/wakeup.S index a4c630034cd..185faa911db 100644 --- a/arch/x86_64/kernel/acpi/wakeup.S +++ b/arch/x86_64/kernel/acpi/wakeup.S @@ -67,7 +67,7 @@ wakeup_code: shll $4, %eax addl $(gdta - wakeup_code), %eax movl %eax, gdt_48a +2 - wakeup_code - lgdt %ds:gdt_48a - wakeup_code # load gdt with whatever is + lgdtl %ds:gdt_48a - wakeup_code # load gdt with whatever is # appropriate movl $1, %eax # protected mode (PE) bit diff --git a/arch/x86_64/kernel/aperture.c b/arch/x86_64/kernel/aperture.c index 504e6347499..c9a6b812e92 100644 --- a/arch/x86_64/kernel/aperture.c +++ b/arch/x86_64/kernel/aperture.c @@ -40,11 +40,7 @@ int fix_aperture __initdata = 1; static u32 __init allocate_aperture(void) { -#ifdef CONFIG_DISCONTIGMEM pg_data_t *nd0 = NODE_DATA(0); -#else - pg_data_t *nd0 = &contig_page_data; -#endif u32 aper_size; void *p; diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c index f8e6cc4fecd..375d369570c 100644 --- a/arch/x86_64/kernel/apic.c +++ b/arch/x86_64/kernel/apic.c @@ -133,7 +133,7 @@ void __init connect_bsp_APIC(void) } } -void disconnect_bsp_APIC(void) +void disconnect_bsp_APIC(int virt_wire_setup) { if (pic_mode) { /* @@ -146,6 +146,42 @@ void disconnect_bsp_APIC(void) outb(0x70, 0x22); outb(0x00, 0x23); } + else { + /* Go back to Virtual Wire compatibility mode */ + unsigned long value; + + /* For the spurious interrupt use vector F, and enable it */ + value = apic_read(APIC_SPIV); + value &= ~APIC_VECTOR_MASK; + value |= APIC_SPIV_APIC_ENABLED; + value |= 0xf; + apic_write_around(APIC_SPIV, value); + + if (!virt_wire_setup) { + /* For LVT0 make it edge triggered, active high, external and enabled */ + value = apic_read(APIC_LVT0); + value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | + APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | + APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED ); + value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; + value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); + apic_write_around(APIC_LVT0, value); + } + else { + /* Disable LVT0 */ + apic_write_around(APIC_LVT0, APIC_LVT_MASKED); + } + + /* For LVT1 make it edge triggered, active high, nmi and enabled */ + value = apic_read(APIC_LVT1); + value &= ~( + APIC_MODE_MASK | APIC_SEND_PENDING | + APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | + APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); + value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; + value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); + apic_write_around(APIC_LVT1, value); + } } void disable_local_APIC(void) @@ -285,7 +321,7 @@ void __init init_bsp_APIC(void) 
apic_write_around(APIC_LVT1, value); } -void __init setup_local_APIC (void) +void __cpuinit setup_local_APIC (void) { unsigned int value, ver, maxlvt; @@ -534,7 +570,7 @@ static struct sys_device device_lapic = { .cls = &lapic_sysclass, }; -static void __init apic_pm_activate(void) +static void __cpuinit apic_pm_activate(void) { apic_pm_state.active = 1; } @@ -774,14 +810,14 @@ void __init setup_boot_APIC_clock (void) local_irq_enable(); } -void __init setup_secondary_APIC_clock(void) +void __cpuinit setup_secondary_APIC_clock(void) { local_irq_disable(); /* FIXME: Do we need this? --RR */ setup_APIC_timer(calibration_result); local_irq_enable(); } -void __init disable_APIC_timer(void) +void __cpuinit disable_APIC_timer(void) { if (using_apic_timer) { unsigned long v; diff --git a/arch/x86_64/kernel/crash.c b/arch/x86_64/kernel/crash.c new file mode 100644 index 00000000000..d7fa4248501 --- /dev/null +++ b/arch/x86_64/kernel/crash.c @@ -0,0 +1,35 @@ +/* + * Architecture specific (x86_64) functions for kexec based crash dumps. + * + * Created by: Hariprasad Nellitheertha (hari@in.ibm.com) + * + * Copyright (C) IBM Corporation, 2004. All rights reserved. + * + */ + +#include <linux/init.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/smp.h> +#include <linux/irq.h> +#include <linux/reboot.h> +#include <linux/kexec.h> + +#include <asm/processor.h> +#include <asm/hardirq.h> +#include <asm/nmi.h> +#include <asm/hw_irq.h> + +note_buf_t crash_notes[NR_CPUS]; + +void machine_crash_shutdown(struct pt_regs *regs) +{ + /* This function is only called after the system + * has panicked or is otherwise in a critical state. + * The minimum amount of code to allow a kexec'd kernel + * to run successfully needs to happen here. + * + * In practice this means shooting down the other cpus in + * an SMP system.
+ */ +} diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c index 7c154dfff64..6ded3a50dfe 100644 --- a/arch/x86_64/kernel/e820.c +++ b/arch/x86_64/kernel/e820.c @@ -16,6 +16,7 @@ #include <linux/bootmem.h> #include <linux/ioport.h> #include <linux/string.h> +#include <linux/kexec.h> #include <asm/page.h> #include <asm/e820.h> #include <asm/proto.h> @@ -191,8 +192,6 @@ void __init e820_reserve_resources(void) int i; for (i = 0; i < e820.nr_map; i++) { struct resource *res; - if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL) - continue; res = alloc_bootmem_low(sizeof(struct resource)); switch (e820.map[i].type) { case E820_RAM: res->name = "System RAM"; break; @@ -212,6 +211,9 @@ void __init e820_reserve_resources(void) */ request_resource(res, &code_resource); request_resource(res, &data_resource); +#ifdef CONFIG_KEXEC + request_resource(res, &crashk_res); +#endif } } } diff --git a/arch/x86_64/kernel/early_printk.c b/arch/x86_64/kernel/early_printk.c index e3a19e8ebbf..9631c747c5e 100644 --- a/arch/x86_64/kernel/early_printk.c +++ b/arch/x86_64/kernel/early_printk.c @@ -2,20 +2,24 @@ #include <linux/kernel.h> #include <linux/init.h> #include <linux/string.h> +#include <linux/tty.h> #include <asm/io.h> #include <asm/processor.h> /* Simple VGA output */ #ifdef __i386__ +#include <asm/setup.h> #define VGABASE (__ISA_IO_base + 0xb8000) #else +#include <asm/bootsetup.h> #define VGABASE ((void __iomem *)0xffffffff800b8000UL) #endif -#define MAX_YPOS 25 -#define MAX_XPOS 80 +#define MAX_YPOS max_ypos +#define MAX_XPOS max_xpos +static int max_ypos = 25, max_xpos = 80; static int current_ypos = 1, current_xpos = 0; static void early_vga_write(struct console *con, const char *str, unsigned n) @@ -196,7 +200,10 @@ int __init setup_early_printk(char *opt) } else if (!strncmp(buf, "ttyS", 4)) { early_serial_init(buf); early_console = &early_serial_console; - } else if (!strncmp(buf, "vga", 3)) { + } else if (!strncmp(buf, "vga", 3) + && SCREEN_INFO.orig_video_isVGA == 1) { + max_xpos = SCREEN_INFO.orig_video_cols; + max_ypos = SCREEN_INFO.orig_video_lines; early_console = &early_vga_console; } early_console_initialized = 1; diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S index 28817490fdc..096d470e280 100644 --- a/arch/x86_64/kernel/entry.S +++ b/arch/x86_64/kernel/entry.S @@ -76,7 +76,7 @@ .macro FAKE_STACK_FRAME child_rip /* push in order ss, rsp, eflags, cs, rip */ - xorq %rax, %rax + xorl %eax, %eax pushq %rax /* ss */ CFI_ADJUST_CFA_OFFSET 8 pushq %rax /* rsp */ @@ -423,7 +423,7 @@ ENTRY(stub_rt_sigreturn) testl $3,CS(%rdi) je 1f swapgs -1: addl $1,%gs:pda_irqcount # RED-PEN should check preempt count +1: incl %gs:pda_irqcount # RED-PEN should check preempt count movq %gs:pda_irqstackptr,%rax cmoveq %rax,%rsp pushq %rdi # save old stack @@ -436,7 +436,7 @@ ENTRY(common_interrupt) ret_from_intr: popq %rdi cli - subl $1,%gs:pda_irqcount + decl %gs:pda_irqcount #ifdef CONFIG_DEBUG_INFO movq RBP(%rdi),%rbp #endif @@ -494,7 +494,7 @@ retint_signal: sti SAVE_REST movq $-1,ORIG_RAX(%rsp) - xorq %rsi,%rsi # oldset + xorl %esi,%esi # oldset movq %rsp,%rdi # &pt_regs call do_notify_resume RESTORE_REST @@ -752,7 +752,7 @@ child_rip: movq %rsi, %rdi call *%rax # exit - xorq %rdi, %rdi + xorl %edi, %edi call do_exit /* @@ -918,3 +918,15 @@ ENTRY(machine_check) ENTRY(call_debug) zeroentry do_call_debug +ENTRY(call_softirq) + movq %gs:pda_irqstackptr,%rax + pushq %r15 + movq %rsp,%r15 + incl %gs:pda_irqcount + cmove %rax,%rsp + call __do_softirq + movq %r15,%rsp + 
decl %gs:pda_irqcount + popq %r15 + ret + diff --git a/arch/x86_64/kernel/genapic.c b/arch/x86_64/kernel/genapic.c index 69b9c25a8fc..30c843a5efd 100644 --- a/arch/x86_64/kernel/genapic.c +++ b/arch/x86_64/kernel/genapic.c @@ -31,6 +31,7 @@ u8 x86_cpu_to_log_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; extern struct genapic apic_cluster; extern struct genapic apic_flat; +extern struct genapic apic_physflat; struct genapic *genapic = &apic_flat; @@ -44,12 +45,7 @@ void __init clustered_apic_check(void) u8 clusters, max_cluster; u8 id; u8 cluster_cnt[NUM_APIC_CLUSTERS]; - - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { - /* AMD always uses flat mode right now */ - genapic = &apic_flat; - goto print; - } + int num_cpus = 0; #if defined(CONFIG_ACPI_BUS) /* @@ -64,15 +60,34 @@ void __init clustered_apic_check(void) #endif memset(cluster_cnt, 0, sizeof(cluster_cnt)); - for (i = 0; i < NR_CPUS; i++) { id = bios_cpu_apicid[i]; - if (id != BAD_APICID) - cluster_cnt[APIC_CLUSTERID(id)]++; + if (id == BAD_APICID) + continue; + num_cpus++; + cluster_cnt[APIC_CLUSTERID(id)]++; } + /* Don't use clustered mode on AMD platforms. */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { + genapic = &apic_physflat; +#ifndef CONFIG_CPU_HOTPLUG + /* In the CPU hotplug case we cannot use broadcast mode + because that opens a race when a CPU is removed. + Stay at physflat mode in this case. + It is bad to do this unconditionally though. Once + we have ACPI platform support for CPU hotplug + we should detect hotplug capability from ACPI tables and + only do this when really needed. -AK */ + if (num_cpus <= 8) + genapic = &apic_flat; +#endif + goto print; + } + clusters = 0; max_cluster = 0; + for (i = 0; i < NUM_APIC_CLUSTERS; i++) { if (cluster_cnt[i] > 0) { ++clusters; diff --git a/arch/x86_64/kernel/genapic_flat.c b/arch/x86_64/kernel/genapic_flat.c index b4cbbad0422..adc96282a9e 100644 --- a/arch/x86_64/kernel/genapic_flat.c +++ b/arch/x86_64/kernel/genapic_flat.c @@ -2,7 +2,7 @@ * Copyright 2004 James Cleverdon, IBM. * Subject to the GNU Public License, v.2 * - * Flat APIC subarch code. Maximum 8 CPUs, logical delivery. + * Flat APIC subarch code. * * Hacked for x86-64 by James Cleverdon from i386 architecture code by * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and @@ -18,7 +18,6 @@ #include <asm/smp.h> #include <asm/ipi.h> - static cpumask_t flat_target_cpus(void) { return cpu_online_map; @@ -45,22 +44,6 @@ static void flat_init_apic_ldr(void) apic_write_around(APIC_LDR, val); } -static void flat_send_IPI_allbutself(int vector) -{ - /* - * if there are no other CPUs in the system then - * we get an APIC send error if we try to broadcast. - * thus we have to avoid sending IPIs in this case.
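The rewritten AMD branch above keeps logical flat mode only when it is safe. The constraint behind the magic number: flat mode gives each CPU one bit of the 8-bit logical destination field (APIC_LDR), so masks overflow past eight CPUs. A minimal model of the new selection, with hypothetical names:

/* Sketch of clustered_apic_check()'s new AMD policy; the enum and
 * helper are illustrative, not kernel identifiers. */
enum apic_mode { MODE_FLAT, MODE_PHYSFLAT };

static enum apic_mode amd_pick_mode(int num_cpus, int cpu_hotplug)
{
    /* Broadcast (flat) mode races with CPU removal, and its
     * 8-bit logical mask cannot address cpu >= 8. */
    if (!cpu_hotplug && num_cpus <= 8)
        return MODE_FLAT;
    return MODE_PHYSFLAT;
}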
- */ - if (num_online_cpus() > 1) - __send_IPI_shortcut(APIC_DEST_ALLBUT, vector, APIC_DEST_LOGICAL); -} - -static void flat_send_IPI_all(int vector) -{ - __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL); -} - static void flat_send_IPI_mask(cpumask_t cpumask, int vector) { unsigned long mask = cpus_addr(cpumask)[0]; @@ -93,6 +76,17 @@ static void flat_send_IPI_mask(cpumask_t cpumask, int vector) local_irq_restore(flags); } +static void flat_send_IPI_allbutself(int vector) +{ + if (((num_online_cpus()) - 1) >= 1) + __send_IPI_shortcut(APIC_DEST_ALLBUT, vector,APIC_DEST_LOGICAL); +} + +static void flat_send_IPI_all(int vector) +{ + __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL); +} + static int flat_apic_id_registered(void) { return physid_isset(GET_APIC_ID(apic_read(APIC_ID)), phys_cpu_present_map); @@ -125,3 +119,63 @@ struct genapic apic_flat = { .cpu_mask_to_apicid = flat_cpu_mask_to_apicid, .phys_pkg_id = phys_pkg_id, }; + +/* + * Physflat mode is used when there are more than 8 CPUs on an AMD system. + * We cannot use logical delivery in this case because the mask + * overflows, so use physical mode. + */ + +static cpumask_t physflat_target_cpus(void) +{ + return cpumask_of_cpu(0); +} + +static void physflat_send_IPI_mask(cpumask_t cpumask, int vector) +{ + send_IPI_mask_sequence(cpumask, vector); +} + +static void physflat_send_IPI_allbutself(int vector) +{ + cpumask_t allbutme = cpu_online_map; + int me = get_cpu(); + cpu_clear(me, allbutme); + physflat_send_IPI_mask(allbutme, vector); + put_cpu(); +} + +static void physflat_send_IPI_all(int vector) +{ + physflat_send_IPI_mask(cpu_online_map, vector); +} + +static unsigned int physflat_cpu_mask_to_apicid(cpumask_t cpumask) +{ + int cpu; + + /* + * We're using fixed IRQ delivery, can only return one phys APIC ID. + * May as well be the first.
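physflat_send_IPI_mask() above defers to send_IPI_mask_sequence(): with physical delivery there is no multi-CPU destination mask, so each target gets its own unicast IPI. A standalone model of that loop; send_one_ipi() and the APIC-ID lookup are hypothetical stand-ins:

/* One IPI per set bit; in-tree, the APIC ID would come from
 * x86_cpu_to_apicid[cpu]. */
static void send_one_ipi(unsigned int apicid, int vector) { /* stub */ }

static void send_ipi_mask_model(unsigned long mask, int vector)
{
    unsigned int cpu;

    for (cpu = 0; cpu < 8 * sizeof(mask); cpu++)
        if (mask & (1UL << cpu))
            send_one_ipi(cpu, vector);  /* cpu stands in for its APIC ID */
}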
+ */ + cpu = first_cpu(cpumask); + if ((unsigned)cpu < NR_CPUS) + return x86_cpu_to_apicid[cpu]; + else + return BAD_APICID; +} + +struct genapic apic_physflat = { + .name = "physical flat", + .int_delivery_mode = dest_LowestPrio, + .int_dest_mode = (APIC_DEST_PHYSICAL != 0), + .int_delivery_dest = APIC_DEST_PHYSICAL | APIC_DM_LOWEST, + .target_cpus = physflat_target_cpus, + .apic_id_registered = flat_apic_id_registered, + .init_apic_ldr = flat_init_apic_ldr,/*not needed, but shouldn't hurt*/ + .send_IPI_all = physflat_send_IPI_all, + .send_IPI_allbutself = physflat_send_IPI_allbutself, + .send_IPI_mask = physflat_send_IPI_mask, + .cpu_mask_to_apicid = physflat_cpu_mask_to_apicid, + .phys_pkg_id = phys_pkg_id, +}; diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S index 9bd2e7a4b81..98ff5eb32b9 100644 --- a/arch/x86_64/kernel/head.S +++ b/arch/x86_64/kernel/head.S @@ -137,14 +137,14 @@ startup_64: wrmsr /* Setup cr0 */ - xorq %rax, %rax - btsq $31, %rax /* Enable paging */ - btsq $0, %rax /* Enable protected mode */ - btsq $1, %rax /* Enable MP */ - btsq $4, %rax /* Enable ET */ - btsq $5, %rax /* Enable NE */ - btsq $16, %rax /* Enable WP */ - btsq $18, %rax /* Enable AM */ +#define CR0_PM 1 /* protected mode */ +#define CR0_MP (1<<1) +#define CR0_ET (1<<4) +#define CR0_NE (1<<5) +#define CR0_WP (1<<16) +#define CR0_AM (1<<18) +#define CR0_PAGING (1<<31) + movl $CR0_PM|CR0_MP|CR0_ET|CR0_NE|CR0_WP|CR0_AM|CR0_PAGING,%eax /* Make changes effective */ movq %rax, %cr0 @@ -248,23 +248,23 @@ ENTRY(_stext) */ .org 0x1000 ENTRY(init_level4_pgt) - .quad 0x0000000000102007 /* -> level3_ident_pgt */ + .quad 0x0000000000002007 + __PHYSICAL_START /* -> level3_ident_pgt */ .fill 255,8,0 - .quad 0x000000000010a007 + .quad 0x000000000000a007 + __PHYSICAL_START .fill 254,8,0 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ - .quad 0x0000000000103007 /* -> level3_kernel_pgt */ + .quad 0x0000000000003007 + __PHYSICAL_START /* -> level3_kernel_pgt */ .org 0x2000 ENTRY(level3_ident_pgt) - .quad 0x0000000000104007 + .quad 0x0000000000004007 + __PHYSICAL_START .fill 511,8,0 .org 0x3000 ENTRY(level3_kernel_pgt) .fill 510,8,0 /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */ - .quad 0x0000000000105007 /* -> level2_kernel_pgt */ + .quad 0x0000000000005007 + __PHYSICAL_START /* -> level2_kernel_pgt */ .fill 1,8,0 .org 0x4000 @@ -337,17 +337,17 @@ ENTRY(empty_bad_pmd_table) .org 0xa000 ENTRY(level3_physmem_pgt) - .quad 0x0000000000105007 /* -> level2_kernel_pgt (so that __va works even before pagetable_init) */ + .quad 0x0000000000005007 + __PHYSICAL_START /* -> level2_kernel_pgt (so that __va works even before pagetable_init) */ .org 0xb000 #ifdef CONFIG_ACPI_SLEEP ENTRY(wakeup_level4_pgt) - .quad 0x0000000000102007 /* -> level3_ident_pgt */ + .quad 0x0000000000002007 + __PHYSICAL_START /* -> level3_ident_pgt */ .fill 255,8,0 - .quad 0x000000000010a007 + .quad 0x000000000000a007 + __PHYSICAL_START .fill 254,8,0 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ - .quad 0x0000000000103007 /* -> level3_kernel_pgt */ + .quad 0x0000000000003007 + __PHYSICAL_START /* -> level3_kernel_pgt */ #endif .data diff --git a/arch/x86_64/kernel/head64.c b/arch/x86_64/kernel/head64.c index 0f8c78dcd38..cf6ab147a2a 100644 --- a/arch/x86_64/kernel/head64.c +++ b/arch/x86_64/kernel/head64.c @@ -94,7 +94,7 @@ void __init x86_64_start_kernel(char * real_mode_data) s = strstr(saved_command_line, "earlyprintk="); if (s != NULL) setup_early_printk(s); -#ifdef CONFIG_DISCONTIGMEM +#ifdef CONFIG_NUMA s = 
strstr(saved_command_line, "numa="); if (s != NULL) numa_setup(s+5); diff --git a/arch/x86_64/kernel/i387.c b/arch/x86_64/kernel/i387.c index ba139cac57c..d9b22b633e3 100644 --- a/arch/x86_64/kernel/i387.c +++ b/arch/x86_64/kernel/i387.c @@ -42,7 +42,7 @@ void mxcsr_feature_mask_init(void) * Called at bootup to set up the initial FPU state that is later cloned * into all processes. */ -void __init fpu_init(void) +void __cpuinit fpu_init(void) { unsigned long oldcr0 = read_cr0(); extern void __bad_fxsave_alignment(void); diff --git a/arch/x86_64/kernel/i8259.c b/arch/x86_64/kernel/i8259.c index 7873d9ba881..a8916909512 100644 --- a/arch/x86_64/kernel/i8259.c +++ b/arch/x86_64/kernel/i8259.c @@ -157,14 +157,13 @@ static unsigned int startup_8259A_irq(unsigned int irq) } static struct hw_interrupt_type i8259A_irq_type = { - "XT-PIC", - startup_8259A_irq, - shutdown_8259A_irq, - enable_8259A_irq, - disable_8259A_irq, - mask_and_ack_8259A, - end_8259A_irq, - NULL + .typename = "XT-PIC", + .startup = startup_8259A_irq, + .shutdown = shutdown_8259A_irq, + .enable = enable_8259A_irq, + .disable = disable_8259A_irq, + .ack = mask_and_ack_8259A, + .end = end_8259A_irq, }; /* @@ -415,10 +414,22 @@ static int i8259A_suspend(struct sys_device *dev, pm_message_t state) return 0; } +static int i8259A_shutdown(struct sys_device *dev) +{ + /* Put the i8259A into a quiescent state that + * the kernel initialization code can get it + * out of. + */ + outb(0xff, 0x21); /* mask all of 8259A-1 */ + outb(0xff, 0xA1); /* mask all of 8259A-2 */ + return 0; +} + static struct sysdev_class i8259_sysdev_class = { set_kset_name("i8259"), .suspend = i8259A_suspend, .resume = i8259A_resume, + .shutdown = i8259A_shutdown, }; static struct sys_device device_i8259A = { diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c index afd87e64d0a..d206d7e49cf 100644 --- a/arch/x86_64/kernel/io_apic.c +++ b/arch/x86_64/kernel/io_apic.c @@ -330,7 +330,7 @@ static int find_irq_entry(int apic, int pin, int type) /* * Find the pin to which IRQ[irq] (ISA) is connected */ -static int __init find_isa_irq_pin(int irq, int type) +static int find_isa_irq_pin(int irq, int type) { int i; @@ -1064,7 +1064,6 @@ void print_all_local_APICs (void) void __apicdebuginit print_PIC(void) { - extern spinlock_t i8259A_lock; unsigned int v; unsigned long flags; @@ -1132,12 +1131,44 @@ static void __init enable_IO_APIC(void) */ void disable_IO_APIC(void) { + int pin; /* * Clear the IO-APIC before rebooting: */ clear_IO_APIC(); - disconnect_bsp_APIC(); + /* + * If the i8259 is routed through an IOAPIC, + * put that IOAPIC in virtual wire mode + * so legacy interrupts can be delivered.
+ */ + pin = find_isa_irq_pin(0, mp_ExtINT); + if (pin != -1) { + struct IO_APIC_route_entry entry; + unsigned long flags; + + memset(&entry, 0, sizeof(entry)); + entry.mask = 0; /* Enabled */ + entry.trigger = 0; /* Edge */ + entry.irr = 0; + entry.polarity = 0; /* High */ + entry.delivery_status = 0; + entry.dest_mode = 0; /* Physical */ + entry.delivery_mode = 7; /* ExtInt */ + entry.vector = 0; + entry.dest.physical.physical_dest = 0; + + + /* + * Add it to the IO-APIC irq-routing table: + */ + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(0, 0x11+2*pin, *(((int *)&entry)+1)); + io_apic_write(0, 0x10+2*pin, *(((int *)&entry)+0)); + spin_unlock_irqrestore(&ioapic_lock, flags); + } + + disconnect_bsp_APIC(pin != -1); } /* diff --git a/arch/x86_64/kernel/irq.c b/arch/x86_64/kernel/irq.c index 62b112e4deb..849a20aec7c 100644 --- a/arch/x86_64/kernel/irq.c +++ b/arch/x86_64/kernel/irq.c @@ -14,6 +14,7 @@ #include <linux/interrupt.h> #include <linux/seq_file.h> #include <linux/module.h> +#include <linux/delay.h> #include <asm/uaccess.h> #include <asm/io_apic.h> @@ -106,3 +107,50 @@ asmlinkage unsigned int do_IRQ(struct pt_regs *regs) return 1; } +#ifdef CONFIG_HOTPLUG_CPU +void fixup_irqs(cpumask_t map) +{ + unsigned int irq; + static int warned; + + for (irq = 0; irq < NR_IRQS; irq++) { + cpumask_t mask; + if (irq == 2) + continue; + + cpus_and(mask, irq_affinity[irq], map); + if (any_online_cpu(mask) == NR_CPUS) { + printk("Breaking affinity for irq %i\n", irq); + mask = map; + } + if (irq_desc[irq].handler->set_affinity) + irq_desc[irq].handler->set_affinity(irq, mask); + else if (irq_desc[irq].action && !(warned++)) + printk("Cannot set affinity for irq %i\n", irq); + } + + /* That doesn't seem sufficient. Give it 1ms. */ + local_irq_enable(); + mdelay(1); + local_irq_disable(); +} +#endif + +extern void call_softirq(void); + +asmlinkage void do_softirq(void) +{ + __u32 pending; + unsigned long flags; + + if (in_interrupt()) + return; + + local_irq_save(flags); + pending = local_softirq_pending(); + /* Switch to interrupt stack */ + if (pending) + call_softirq(); + local_irq_restore(flags); +} +EXPORT_SYMBOL(do_softirq); diff --git a/arch/x86_64/kernel/kprobes.c b/arch/x86_64/kernel/kprobes.c index f77f8a0ff18..5c6dc705148 100644 --- a/arch/x86_64/kernel/kprobes.c +++ b/arch/x86_64/kernel/kprobes.c @@ -27,6 +27,8 @@ * <prasanna@in.ibm.com> adapted for x86_64 * 2005-Mar Roland McGrath <roland@redhat.com> * Fixed to handle %rip-relative addressing mode correctly. 
+ * 2005-May Rusty Lynch <rusty.lynch@intel.com> + * Added function return probes functionality */ #include <linux/config.h> @@ -36,23 +38,19 @@ #include <linux/string.h> #include <linux/slab.h> #include <linux/preempt.h> -#include <linux/moduleloader.h> +#include <asm/cacheflush.h> #include <asm/pgtable.h> #include <asm/kdebug.h> static DECLARE_MUTEX(kprobe_mutex); -/* kprobe_status settings */ -#define KPROBE_HIT_ACTIVE 0x00000001 -#define KPROBE_HIT_SS 0x00000002 - static struct kprobe *current_kprobe; static unsigned long kprobe_status, kprobe_old_rflags, kprobe_saved_rflags; +static struct kprobe *kprobe_prev; +static unsigned long kprobe_status_prev, kprobe_old_rflags_prev, kprobe_saved_rflags_prev; static struct pt_regs jprobe_saved_regs; static long *jprobe_saved_rsp; -static kprobe_opcode_t *get_insn_slot(void); -static void free_insn_slot(kprobe_opcode_t *slot); void jprobe_return_end(void); /* copy of the kernel stack at the probe fire time */ @@ -214,6 +212,21 @@ void arch_copy_kprobe(struct kprobe *p) BUG_ON((s64) (s32) disp != disp); /* Sanity check. */ *ripdisp = disp; } + p->opcode = *p->addr; +} + +void arch_arm_kprobe(struct kprobe *p) +{ + *p->addr = BREAKPOINT_INSTRUCTION; + flush_icache_range((unsigned long) p->addr, + (unsigned long) p->addr + sizeof(kprobe_opcode_t)); +} + +void arch_disarm_kprobe(struct kprobe *p) +{ + *p->addr = p->opcode; + flush_icache_range((unsigned long) p->addr, + (unsigned long) p->addr + sizeof(kprobe_opcode_t)); } void arch_remove_kprobe(struct kprobe *p) @@ -223,10 +236,29 @@ void arch_remove_kprobe(struct kprobe *p) down(&kprobe_mutex); } -static inline void disarm_kprobe(struct kprobe *p, struct pt_regs *regs) +static inline void save_previous_kprobe(void) { - *p->addr = p->opcode; - regs->rip = (unsigned long)p->addr; + kprobe_prev = current_kprobe; + kprobe_status_prev = kprobe_status; + kprobe_old_rflags_prev = kprobe_old_rflags; + kprobe_saved_rflags_prev = kprobe_saved_rflags; +} + +static inline void restore_previous_kprobe(void) +{ + current_kprobe = kprobe_prev; + kprobe_status = kprobe_status_prev; + kprobe_old_rflags = kprobe_old_rflags_prev; + kprobe_saved_rflags = kprobe_saved_rflags_prev; +} + +static inline void set_current_kprobe(struct kprobe *p, struct pt_regs *regs) +{ + current_kprobe = p; + kprobe_saved_rflags = kprobe_old_rflags + = (regs->eflags & (TF_MASK | IF_MASK)); + if (is_IF_modifier(p->ainsn.insn)) + kprobe_saved_rflags &= ~IF_MASK; } static void prepare_singlestep(struct kprobe *p, struct pt_regs *regs) @@ -240,6 +272,25 @@ static void prepare_singlestep(struct kprobe *p, struct pt_regs *regs) regs->rip = (unsigned long)p->ainsn.insn; } +void arch_prepare_kretprobe(struct kretprobe *rp, struct pt_regs *regs) +{ + unsigned long *sara = (unsigned long *)regs->rsp; + struct kretprobe_instance *ri; + + if ((ri = get_free_rp_inst(rp)) != NULL) { + ri->rp = rp; + ri->task = current; + ri->ret_addr = (kprobe_opcode_t *) *sara; + + /* Replace the return addr with trampoline addr */ + *sara = (unsigned long) &kretprobe_trampoline; + + add_rp_inst(ri); + } else { + rp->nmissed++; + } +} + /* * Interrupts are disabled on entry as trap3 is an interrupt gate and they * remain disabled throughout this function.
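The return-probe plumbing above is easiest to see end to end: arch_prepare_kretprobe() swaps the probed function's on-stack return address for kretprobe_trampoline, so the function "returns" into the trampoline and the handler fires before the real caller resumes. A minimal consumer-side sketch, assuming the kretprobe API of this era; my_traced_fn and the module names are illustrative, not part of the patch:

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/kprobes.h>

/* Hypothetical probe target; any kernel function address would do. */
extern int my_traced_fn(int arg);

static int my_ret_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
{
	/* ri->ret_addr is the real return address that
	 * arch_prepare_kretprobe() saved before redirecting it. */
	printk("my_traced_fn returning to %p\n", ri->ret_addr);
	return 0;
}

static struct kretprobe my_rp = {
	.kp.addr   = (kprobe_opcode_t *)my_traced_fn,
	.handler   = my_ret_handler,
	.maxactive = 4,		/* instances for concurrent callers */
};

static int __init rp_init(void)
{
	return register_kretprobe(&my_rp);
}

static void __exit rp_exit(void)
{
	unregister_kretprobe(&my_rp);
}

module_init(rp_init);
module_exit(rp_exit);
MODULE_LICENSE("GPL");

On each return, trampoline_probe_handler() below walks the per-task instance list, runs the registered handler, and restores regs->rip to the saved address.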
@@ -264,9 +315,30 @@ int kprobe_handler(struct pt_regs *regs) regs->eflags |= kprobe_saved_rflags; unlock_kprobes(); goto no_kprobe; + } else if (kprobe_status == KPROBE_HIT_SSDONE) { + /* TODO: Provide re-entrancy from + * post_kprobes_handler() and avoid exception + * stack corruption while single-stepping on + * the instruction of the new probe. + */ + arch_disarm_kprobe(p); + regs->rip = (unsigned long)p->addr; + ret = 1; + } else { + /* We have reentered the kprobe_handler(), since + * another probe was hit while within the + * handler. We here save the original kprobe + * variables and just single step on instruction + * of the new probe without calling any user + * handlers. + */ + save_previous_kprobe(); + set_current_kprobe(p, regs); + p->nmissed++; + prepare_singlestep(p, regs); + kprobe_status = KPROBE_REENTER; + return 1; } - disarm_kprobe(p, regs); - ret = 1; } else { p = current_kprobe; if (p->break_handler && p->break_handler(p, regs)) { @@ -296,11 +368,7 @@ int kprobe_handler(struct pt_regs *regs) } kprobe_status = KPROBE_HIT_ACTIVE; - current_kprobe = p; - kprobe_saved_rflags = kprobe_old_rflags - = (regs->eflags & (TF_MASK | IF_MASK)); - if (is_IF_modifier(p->ainsn.insn)) - kprobe_saved_rflags &= ~IF_MASK; + set_current_kprobe(p, regs); if (p->pre_handler && p->pre_handler(p, regs)) /* handler has already set things up, so skip ss setup */ @@ -317,6 +385,78 @@ no_kprobe: } /* + * For function-return probes, init_kprobes() establishes a probepoint + * here. When a retprobed function returns, this probe is hit and + * trampoline_probe_handler() runs, calling the kretprobe's handler. + */ + void kretprobe_trampoline_holder(void) + { + asm volatile ( ".global kretprobe_trampoline\n" + "kretprobe_trampoline: \n" + "nop\n"); + } + +/* + * Called when we hit the probe point at kretprobe_trampoline + */ +int trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) +{ + struct kretprobe_instance *ri = NULL; + struct hlist_head *head; + struct hlist_node *node, *tmp; + unsigned long orig_ret_address = 0; + unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline; + + head = kretprobe_inst_table_head(current); + + /* + * It is possible to have multiple instances associated with a given + * task either because multiple functions in the call path + * have a return probe installed on them, and/or more than one + * return probe was registered for a target function. + * + * We can handle this because: + * - instances are always inserted at the head of the list + * - when multiple return probes are registered for the same + * function, the first instance's ret_addr will point to the + * real return address, and all the rest will point to + * kretprobe_trampoline + */ + hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { + if (ri->task != current) + /* another task is sharing our hash bucket */ + continue; + + if (ri->rp && ri->rp->handler) + ri->rp->handler(ri, regs); + + orig_ret_address = (unsigned long)ri->ret_addr; + recycle_rp_inst(ri); + + if (orig_ret_address != trampoline_address) + /* + * This is the real return address. Any other + * instances associated with this task are for + * other calls deeper on the call stack + */ + break; + } + + BUG_ON(!orig_ret_address || (orig_ret_address == trampoline_address)); + regs->rip = orig_ret_address; + + unlock_kprobes(); + preempt_enable_no_resched(); + + /* + * By returning a non-zero value, we are telling + * kprobe_handler() that we have handled unlocking + * and re-enabling preemption.
+ */ + return 1; +} + +/* * Called after single-stepping. p->addr is the address of the * instruction whose first byte has been replaced by the "int 3" * instruction. To avoid the SMP problems that can occur when we @@ -401,13 +541,22 @@ int post_kprobe_handler(struct pt_regs *regs) if (!kprobe_running()) return 0; - if (current_kprobe->post_handler) + if ((kprobe_status != KPROBE_REENTER) && current_kprobe->post_handler) { + kprobe_status = KPROBE_HIT_SSDONE; current_kprobe->post_handler(current_kprobe, regs, 0); + } resume_execution(current_kprobe, regs); regs->eflags |= kprobe_saved_rflags; - unlock_kprobes(); + /* Restore the original saved kprobes variables and continue. */ + if (kprobe_status == KPROBE_REENTER) { + restore_previous_kprobe(); + goto out; + } else { + unlock_kprobes(); + } +out: preempt_enable_no_resched(); /* @@ -528,111 +677,12 @@ int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) return 0; } -/* - * kprobe->ainsn.insn points to the copy of the instruction to be single-stepped. - * By default on x86_64, pages we get from kmalloc or vmalloc are not - * executable. Single-stepping an instruction on such a page yields an - * oops. So instead of storing the instruction copies in their respective - * kprobe objects, we allocate a page, map it executable, and store all the - * instruction copies there. (We can allocate additional pages if somebody - * inserts a huge number of probes.) Each page can hold up to INSNS_PER_PAGE - * instruction slots, each of which is MAX_INSN_SIZE*sizeof(kprobe_opcode_t) - * bytes. - */ -#define INSNS_PER_PAGE (PAGE_SIZE/(MAX_INSN_SIZE*sizeof(kprobe_opcode_t))) -struct kprobe_insn_page { - struct hlist_node hlist; - kprobe_opcode_t *insns; /* page of instruction slots */ - char slot_used[INSNS_PER_PAGE]; - int nused; +static struct kprobe trampoline_p = { + .addr = (kprobe_opcode_t *) &kretprobe_trampoline, + .pre_handler = trampoline_probe_handler }; -static struct hlist_head kprobe_insn_pages; - -/** - * get_insn_slot() - Find a slot on an executable page for an instruction. - * We allocate an executable page if there's no room on existing ones. - */ -static kprobe_opcode_t *get_insn_slot(void) -{ - struct kprobe_insn_page *kip; - struct hlist_node *pos; - - hlist_for_each(pos, &kprobe_insn_pages) { - kip = hlist_entry(pos, struct kprobe_insn_page, hlist); - if (kip->nused < INSNS_PER_PAGE) { - int i; - for (i = 0; i < INSNS_PER_PAGE; i++) { - if (!kip->slot_used[i]) { - kip->slot_used[i] = 1; - kip->nused++; - return kip->insns + (i*MAX_INSN_SIZE); - } - } - /* Surprise! No unused slots. Fix kip->nused. */ - kip->nused = INSNS_PER_PAGE; - } - } - - /* All out of space. Need to allocate a new page. Use slot 0.*/ - kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL); - if (!kip) { - return NULL; - } - - /* - * For the %rip-relative displacement fixups to be doable, we - * need our instruction copy to be within +/- 2GB of any data it - * might access via %rip. That is, within 2GB of where the - * kernel image and loaded module images reside. So we allocate - * a page in the module loading area. - */ - kip->insns = module_alloc(PAGE_SIZE); - if (!kip->insns) { - kfree(kip); - return NULL; - } - INIT_HLIST_NODE(&kip->hlist); - hlist_add_head(&kip->hlist, &kprobe_insn_pages); - memset(kip->slot_used, 0, INSNS_PER_PAGE); - kip->slot_used[0] = 1; - kip->nused = 1; - return kip->insns; -} - -/** - * free_insn_slot() - Free instruction slot obtained from get_insn_slot(). 
- */ -static void free_insn_slot(kprobe_opcode_t *slot) +int __init arch_init_kprobes(void) { - struct kprobe_insn_page *kip; - struct hlist_node *pos; - - hlist_for_each(pos, &kprobe_insn_pages) { - kip = hlist_entry(pos, struct kprobe_insn_page, hlist); - if (kip->insns <= slot - && slot < kip->insns+(INSNS_PER_PAGE*MAX_INSN_SIZE)) { - int i = (slot - kip->insns) / MAX_INSN_SIZE; - kip->slot_used[i] = 0; - kip->nused--; - if (kip->nused == 0) { - /* - * Page is no longer in use. Free it unless - * it's the last one. We keep the last one - * so as not to have to set it up again the - * next time somebody inserts a probe. - */ - hlist_del(&kip->hlist); - if (hlist_empty(&kprobe_insn_pages)) { - INIT_HLIST_NODE(&kip->hlist); - hlist_add_head(&kip->hlist, - &kprobe_insn_pages); - } else { - module_free(NULL, kip->insns); - kfree(kip); - } - } - return; - } - } + return register_kprobe(&trampoline_p); } diff --git a/arch/x86_64/kernel/machine_kexec.c b/arch/x86_64/kernel/machine_kexec.c new file mode 100644 index 00000000000..89fab51e20f --- /dev/null +++ b/arch/x86_64/kernel/machine_kexec.c @@ -0,0 +1,231 @@ +/* + * machine_kexec.c - handle transition of Linux booting another kernel + * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com> + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + +#include <linux/mm.h> +#include <linux/kexec.h> +#include <linux/string.h> +#include <linux/reboot.h> +#include <asm/pgtable.h> +#include <asm/tlbflush.h> +#include <asm/mmu_context.h> +#include <asm/io.h> + +static void init_level2_page(pmd_t *level2p, unsigned long addr) +{ + unsigned long end_addr; + + addr &= PAGE_MASK; + end_addr = addr + PUD_SIZE; + while (addr < end_addr) { + set_pmd(level2p++, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC)); + addr += PMD_SIZE; + } +} + +static int init_level3_page(struct kimage *image, pud_t *level3p, + unsigned long addr, unsigned long last_addr) +{ + unsigned long end_addr; + int result; + + result = 0; + addr &= PAGE_MASK; + end_addr = addr + PGDIR_SIZE; + while ((addr < last_addr) && (addr < end_addr)) { + struct page *page; + pmd_t *level2p; + + page = kimage_alloc_control_pages(image, 0); + if (!page) { + result = -ENOMEM; + goto out; + } + level2p = (pmd_t *)page_address(page); + init_level2_page(level2p, addr); + set_pud(level3p++, __pud(__pa(level2p) | _KERNPG_TABLE)); + addr += PUD_SIZE; + } + /* clear the unused entries */ + while (addr < end_addr) { + pud_clear(level3p++); + addr += PUD_SIZE; + } +out: + return result; +} + + +static int init_level4_page(struct kimage *image, pgd_t *level4p, + unsigned long addr, unsigned long last_addr) +{ + unsigned long end_addr; + int result; + + result = 0; + addr &= PAGE_MASK; + end_addr = addr + (PTRS_PER_PGD * PGDIR_SIZE); + while ((addr < last_addr) && (addr < end_addr)) { + struct page *page; + pud_t *level3p; + + page = kimage_alloc_control_pages(image, 0); + if (!page) { + result = -ENOMEM; + goto out; + } + level3p = (pud_t *)page_address(page); + result = init_level3_page(image, level3p, addr, last_addr); + if (result) { + goto out; + } + set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE)); + addr += PGDIR_SIZE; + } + /* clear the unused entries */ + while (addr < end_addr) { + pgd_clear(level4p++); + addr += PGDIR_SIZE; + } +out: + return result; +} + + +static int init_pgtable(struct kimage *image, unsigned long start_pgtable) +{ + pgd_t *level4p; + level4p = (pgd_t *)__va(start_pgtable); + return 
init_level4_page(image, level4p, 0, end_pfn << PAGE_SHIFT); +} + +static void set_idt(void *newidt, u16 limit) +{ + struct desc_ptr curidt; + + /* x86-64 supports unaligned loads & stores */ + curidt.size = limit; + curidt.address = (unsigned long)newidt; + + __asm__ __volatile__ ( + "lidtq %0\n" + : : "m" (curidt) + ); +}; + + +static void set_gdt(void *newgdt, u16 limit) +{ + struct desc_ptr curgdt; + + /* x86-64 supports unaligned loads & stores */ + curgdt.size = limit; + curgdt.address = (unsigned long)newgdt; + + __asm__ __volatile__ ( + "lgdtq %0\n" + : : "m" (curgdt) + ); +}; + +static void load_segments(void) +{ + __asm__ __volatile__ ( + "\tmovl %0,%%ds\n" + "\tmovl %0,%%es\n" + "\tmovl %0,%%ss\n" + "\tmovl %0,%%fs\n" + "\tmovl %0,%%gs\n" + : : "a" (__KERNEL_DS) + ); +} + +typedef NORET_TYPE void (*relocate_new_kernel_t)(unsigned long indirection_page, + unsigned long control_code_buffer, + unsigned long start_address, + unsigned long pgtable) ATTRIB_NORET; + +const extern unsigned char relocate_new_kernel[]; +const extern unsigned long relocate_new_kernel_size; + +int machine_kexec_prepare(struct kimage *image) +{ + unsigned long start_pgtable, control_code_buffer; + int result; + + /* Calculate the offsets */ + start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT; + control_code_buffer = start_pgtable + PAGE_SIZE; + + /* Setup the identity mapped 64bit page table */ + result = init_pgtable(image, start_pgtable); + if (result) + return result; + + /* Place the code in the reboot code buffer */ + memcpy(__va(control_code_buffer), relocate_new_kernel, + relocate_new_kernel_size); + + return 0; +} + +void machine_kexec_cleanup(struct kimage *image) +{ + return; +} + +/* + * Do not allocate memory (or fail in any way) in machine_kexec(). + * We are past the point of no return, committed to rebooting now. + */ +NORET_TYPE void machine_kexec(struct kimage *image) +{ + unsigned long page_list; + unsigned long control_code_buffer; + unsigned long start_pgtable; + relocate_new_kernel_t rnk; + + /* Interrupts aren't acceptable while we reboot */ + local_irq_disable(); + + /* Calculate the offsets */ + page_list = image->head; + start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT; + control_code_buffer = start_pgtable + PAGE_SIZE; + + /* Set the low half of the page table to my identity mapped + * page table for kexec. Leave the high half pointing at the + * kernel pages. Don't bother to flush the global pages + * as that will happen when I fully switch to my identity mapped + * page table anyway. + */ + memcpy(__va(read_cr3()), __va(start_pgtable), PAGE_SIZE/2); + __flush_tlb(); + + + /* The segment registers are funny things; they are + * automatically loaded from a table in memory when you + * set them to a specific selector, but this table is never + * accessed again unless you set the segment to a different selector. + * + * The more common model is a cache, where the behind-the-scenes + * work is done but is also dropped at arbitrary + * times. + * + * I take advantage of this here by force loading the + * segments, before I zap the gdt with an invalid value. + */ + load_segments(); + /* The gdt & idt are now invalid. + * If you want to load them you must set up your own idt & gdt.
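+ * (If a usable table were ever needed at this point, the recipe is the
+ * same one set_gdt()/set_idt() above use: build descriptors in memory
+ * you still own and point a struct desc_ptr at them before lgdt/lidt.
+ * A sketch of the option only; this path does not need it.)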
+ */ + set_gdt(phys_to_virt(0),0); + set_idt(phys_to_virt(0),0); + /* now call it */ + rnk = (relocate_new_kernel_t) control_code_buffer; + (*rnk)(page_list, control_code_buffer, image->start, start_pgtable); +} diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c index 3a89d735a4f..8aa56736cde 100644 --- a/arch/x86_64/kernel/mce.c +++ b/arch/x86_64/kernel/mce.c @@ -15,6 +15,8 @@ #include <linux/sysdev.h> #include <linux/miscdevice.h> #include <linux/fs.h> +#include <linux/cpu.h> +#include <linux/percpu.h> #include <asm/processor.h> #include <asm/msr.h> #include <asm/mce.h> @@ -34,6 +36,7 @@ static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL }; static unsigned long console_logged; static int notify_user; static int rip_msr; +static int mce_bootlog; /* * Lockless MCE logging infrastructure. @@ -195,10 +198,11 @@ void do_machine_check(struct pt_regs * regs, long error_code) rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); mce_get_rip(&m, regs); - if (error_code != -1) + if (error_code >= 0) rdtscll(m.tsc); wrmsrl(MSR_IA32_MC0_STATUS + i*4, 0); - mce_log(&m); + if (error_code != -2) + mce_log(&m); /* Did this bank cause the exception? */ /* Assume that the bank with uncorrectable errors did it, @@ -313,7 +317,7 @@ static void mce_init(void *dummy) /* Log the machine checks left over from the previous reset. This also clears all registers */ - do_machine_check(NULL, -1); + do_machine_check(NULL, mce_bootlog ? -1 : -2); set_in_cr4(X86_CR4_MCE); @@ -327,7 +331,7 @@ static void mce_init(void *dummy) } /* Add per CPU specific workarounds here */ -static void __init mce_cpu_quirks(struct cpuinfo_x86 *c) +static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c) { /* This should be disabled by the BIOS, but isn't always */ if (c->x86_vendor == X86_VENDOR_AMD && c->x86 == 15) { @@ -337,7 +341,7 @@ static void __init mce_cpu_quirks(struct cpuinfo_x86 *c) } } -static void __init mce_cpu_features(struct cpuinfo_x86 *c) +static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c) { switch (c->x86_vendor) { case X86_VENDOR_INTEL: @@ -352,7 +356,7 @@ static void __init mce_cpu_features(struct cpuinfo_x86 *c) * Called for each booted CPU to set up machine checks. * Must be called with preempt off. */ -void __init mcheck_init(struct cpuinfo_x86 *c) +void __cpuinit mcheck_init(struct cpuinfo_x86 *c) { static cpumask_t mce_cpus __initdata = CPU_MASK_NONE; @@ -411,7 +415,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff memset(mcelog.entry, 0, next * sizeof(struct mce)); mcelog.next = 0; - synchronize_kernel(); + synchronize_sched(); /* Collect entries that were still getting written before the synchronize. */ @@ -474,11 +478,17 @@ static int __init mcheck_disable(char *str) } /* mce=off disables machine check. Note you can reenable it later - using sysfs */ + using sysfs. + mce=bootlog Log MCEs from before booting. Disabled by default to work + around buggy BIOS that leave bogus MCEs. */ static int __init mcheck_enable(char *str) { + if (*str == '=') + str++; if (!strcmp(str, "off")) mce_dont_init = 1; + else if (!strcmp(str, "bootlog")) + mce_bootlog = 1; else printk("mce= argument %s ignored. Please use /sys", str); return 0; @@ -514,10 +524,7 @@ static struct sysdev_class mce_sysclass = { set_kset_name("machinecheck"), }; -static struct sys_device device_mce = { - .id = 0, - .cls = &mce_sysclass, -}; +static DEFINE_PER_CPU(struct sys_device, device_mce); /* Why are there no generic functions for this? 
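 * (The macro body is elided by the diff context; judging from its uses,
 * ACCESSOR(tolerant, tolerant,) expands to a show/set handler pair plus
 * a sysdev attribute named attr_tolerant -- the attr_* objects that
 * mce_create_device() below registers for each CPU.)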
*/ #define ACCESSOR(name, var, start) \ @@ -542,27 +549,83 @@ ACCESSOR(bank4ctl,bank[4],mce_restart()) ACCESSOR(tolerant,tolerant,) ACCESSOR(check_interval,check_interval,mce_restart()) +/* Per cpu sysdev init. All of the cpus still share the same ctl bank */ +static __cpuinit int mce_create_device(unsigned int cpu) +{ + int err; + if (!mce_available(&cpu_data[cpu])) + return -EIO; + + per_cpu(device_mce,cpu).id = cpu; + per_cpu(device_mce,cpu).cls = &mce_sysclass; + + err = sysdev_register(&per_cpu(device_mce,cpu)); + + if (!err) { + sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank0ctl); + sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank1ctl); + sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank2ctl); + sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank3ctl); + sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank4ctl); + sysdev_create_file(&per_cpu(device_mce,cpu), &attr_tolerant); + sysdev_create_file(&per_cpu(device_mce,cpu), &attr_check_interval); + } + return err; +} + +#ifdef CONFIG_HOTPLUG_CPU +static __cpuinit void mce_remove_device(unsigned int cpu) +{ + sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank0ctl); + sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank1ctl); + sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank2ctl); + sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank3ctl); + sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank4ctl); + sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_tolerant); + sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_check_interval); + sysdev_unregister(&per_cpu(device_mce,cpu)); +} +#endif + +/* Get notified when a cpu comes on/off. Be hotplug friendly. */ +static __cpuinit int +mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + + switch (action) { + case CPU_ONLINE: + mce_create_device(cpu); + break; +#ifdef CONFIG_HOTPLUG_CPU + case CPU_DEAD: + mce_remove_device(cpu); + break; +#endif + } + return NOTIFY_OK; +} + +static struct notifier_block mce_cpu_notifier = { + .notifier_call = mce_cpu_callback, +}; + static __init int mce_init_device(void) { int err; + int i = 0; + if (!mce_available(&boot_cpu_data)) return -EIO; err = sysdev_class_register(&mce_sysclass); - if (!err) - err = sysdev_register(&device_mce); - if (!err) { - /* could create per CPU objects, but it is not worth it. 
*/ - sysdev_create_file(&device_mce, &attr_bank0ctl); - sysdev_create_file(&device_mce, &attr_bank1ctl); - sysdev_create_file(&device_mce, &attr_bank2ctl); - sysdev_create_file(&device_mce, &attr_bank3ctl); - sysdev_create_file(&device_mce, &attr_bank4ctl); - sysdev_create_file(&device_mce, &attr_tolerant); - sysdev_create_file(&device_mce, &attr_check_interval); - } - + + for_each_online_cpu(i) { + mce_create_device(i); + } + + register_cpu_notifier(&mce_cpu_notifier); misc_register(&mce_log_device); return err; - } + device_initcall(mce_init_device); diff --git a/arch/x86_64/kernel/mce_intel.c b/arch/x86_64/kernel/mce_intel.c index 4db9a640069..0be0a795981 100644 --- a/arch/x86_64/kernel/mce_intel.c +++ b/arch/x86_64/kernel/mce_intel.c @@ -42,7 +42,7 @@ done: irq_exit(); } -static void __init intel_init_thermal(struct cpuinfo_x86 *c) +static void __cpuinit intel_init_thermal(struct cpuinfo_x86 *c) { u32 l, h; int tm2 = 0; @@ -93,7 +93,7 @@ static void __init intel_init_thermal(struct cpuinfo_x86 *c) return; } -void __init mce_intel_feature_init(struct cpuinfo_x86 *c) +void __cpuinit mce_intel_feature_init(struct cpuinfo_x86 *c) { intel_init_thermal(c); } diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c index 61a63be6b29..79c362d03e2 100644 --- a/arch/x86_64/kernel/mpparse.c +++ b/arch/x86_64/kernel/mpparse.c @@ -23,6 +23,7 @@ #include <linux/kernel_stat.h> #include <linux/mc146818rtc.h> #include <linux/acpi.h> +#include <linux/module.h> #include <asm/smp.h> #include <asm/mtrr.h> @@ -45,7 +46,8 @@ int acpi_found_madt; int apic_version [MAX_APICS]; unsigned char mp_bus_id_to_type [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 }; int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 }; -cpumask_t pci_bus_to_cpumask [256] = { [0 ... 255] = CPU_MASK_ALL }; +unsigned char pci_bus_to_node [256]; +EXPORT_SYMBOL(pci_bus_to_node); static int mp_current_pci_id = 0; /* I/O APIC entries */ @@ -107,7 +109,7 @@ static int __init mpf_checksum(unsigned char *mp, int len) static void __init MP_processor_info (struct mpc_config_processor *m) { - int ver; + int ver, cpu; static int found_bsp=0; if (!(m->mpc_cpuflag & CPU_ENABLED)) @@ -129,7 +131,7 @@ static void __init MP_processor_info (struct mpc_config_processor *m) return; } - num_processors++; + cpu = num_processors++; if (m->mpc_apicid > MAX_APICS) { printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n", @@ -153,13 +155,18 @@ static void __init MP_processor_info (struct mpc_config_processor *m) * in same order as logical cpu numbers. Hence the first * entry is BSP, and so on. */ + cpu = 0; + bios_cpu_apicid[0] = m->mpc_apicid; x86_cpu_to_apicid[0] = m->mpc_apicid; found_bsp = 1; - } else { - bios_cpu_apicid[num_processors - found_bsp] = m->mpc_apicid; - x86_cpu_to_apicid[num_processors - found_bsp] = m->mpc_apicid; - } + } else + cpu = num_processors - found_bsp; + bios_cpu_apicid[cpu] = m->mpc_apicid; + x86_cpu_to_apicid[cpu] = m->mpc_apicid; + + cpu_set(cpu, cpu_possible_map); + cpu_set(cpu, cpu_present_map); } static void __init MP_bus_info (struct mpc_config_bus *m) @@ -904,11 +911,20 @@ void __init mp_config_acpi_legacy_irqs (void) return; } +#define MAX_GSI_NUM 4096 + int mp_register_gsi(u32 gsi, int edge_level, int active_high_low) { int ioapic = -1; int ioapic_pin = 0; int idx, bit = 0; + static int pci_irq = 16; + /* + * Mapping between Global System Interrupts, which + * represent all possible interrupts, to the IRQs + * assigned to actual devices. 
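+ * (A worked example of the scheme implemented below: with the 16
+ * legacy IRQs in place, the first edge-triggered PCI interrupt above
+ * GSI 15 gets IRQ 16, the next IRQ 17, and so on, skipping the ACPI
+ * SCI; gsi_to_irq[] records each assignment so an already-programmed
+ * pin reports the same IRQ next time.)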
+ */ + static int gsi_to_irq[MAX_GSI_NUM]; if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) return gsi; @@ -943,11 +959,34 @@ int mp_register_gsi(u32 gsi, int edge_level, int active_high_low) if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) { Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n", mp_ioapic_routing[ioapic].apic_id, ioapic_pin); - return gsi; + return gsi_to_irq[gsi]; } mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit); + if (edge_level) { + /* + * For PCI devices assign IRQs in order, avoiding gaps + * due to unused I/O APIC pins. + */ + int irq = gsi; + if (gsi < MAX_GSI_NUM) { + if (gsi > 15) + gsi = pci_irq++; +#ifdef CONFIG_ACPI_BUS + /* + * Don't assign IRQ used by ACPI SCI + */ + if (gsi == acpi_fadt.sci_int) + gsi = pci_irq++; +#endif + gsi_to_irq[irq] = gsi; + } else { + printk(KERN_ERR "GSI %u is too high\n", gsi); + return gsi; + } + } + io_apic_set_pci_routing(ioapic, ioapic_pin, gsi, edge_level == ACPI_EDGE_SENSITIVE ? 0 : 1, active_high_low == ACPI_ACTIVE_HIGH ? 0 : 1); diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c index 31c0f2e6ac9..4e44d6e6b7e 100644 --- a/arch/x86_64/kernel/nmi.c +++ b/arch/x86_64/kernel/nmi.c @@ -98,7 +98,7 @@ static unsigned int nmi_p4_cccr_val; (P4_CCCR_OVF_PMI0|P4_CCCR_THRESHOLD(15)|P4_CCCR_COMPLEMENT| \ P4_CCCR_COMPARE|P4_CCCR_REQUIRED|P4_CCCR_ESCR_SELECT(4)|P4_CCCR_ENABLE) -static __init inline int nmi_known_cpu(void) +static __cpuinit inline int nmi_known_cpu(void) { switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_AMD: @@ -110,7 +110,7 @@ static __init inline int nmi_known_cpu(void) } /* Run after command line and cpu_init init, but before all other checks */ -void __init nmi_watchdog_default(void) +void __cpuinit nmi_watchdog_default(void) { if (nmi_watchdog != NMI_DEFAULT) return; diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c index dce8bab4306..7577f9d7a75 100644 --- a/arch/x86_64/kernel/process.c +++ b/arch/x86_64/kernel/process.c @@ -8,7 +8,8 @@ * * X86-64 port * Andi Kleen. - * + * + * CPU hotplug support - ashok.raj@intel.com * $Id: process.c,v 1.38 2002/01/15 10:08:03 ak Exp $ */ @@ -18,6 +19,7 @@ #include <stdarg.h> +#include <linux/cpu.h> #include <linux/errno.h> #include <linux/sched.h> #include <linux/kernel.h> @@ -34,6 +36,7 @@ #include <linux/ptrace.h> #include <linux/utsname.h> #include <linux/random.h> +#include <linux/kprobes.h> #include <asm/uaccess.h> #include <asm/pgtable.h> @@ -153,6 +156,29 @@ void cpu_idle_wait(void) } EXPORT_SYMBOL_GPL(cpu_idle_wait); +#ifdef CONFIG_HOTPLUG_CPU +DECLARE_PER_CPU(int, cpu_state); + +#include <asm/nmi.h> +/* We don't actually take CPU down, just spin without interrupts. */ +static inline void play_dead(void) +{ + idle_task_exit(); + wbinvd(); + mb(); + /* Ack it */ + __get_cpu_var(cpu_state) = CPU_DEAD; + + while (1) + safe_halt(); +} +#else +static inline void play_dead(void) +{ + BUG(); +} +#endif /* CONFIG_HOTPLUG_CPU */ + /* * The idle thread. 
There's no useful work to be * done, so just try to conserve power and have a @@ -173,6 +199,8 @@ void cpu_idle (void) idle = pm_idle; if (!idle) idle = default_idle; + if (cpu_is_offline(smp_processor_id())) + play_dead(); idle(); } @@ -203,7 +231,7 @@ static void mwait_idle(void) } } -void __init select_idle_routine(const struct cpuinfo_x86 *c) +void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) { static int printed; if (cpu_has(c, X86_FEATURE_MWAIT)) { @@ -293,6 +321,14 @@ void exit_thread(void) { struct task_struct *me = current; struct thread_struct *t = &me->thread; + + /* + * Remove function-return probe instances associated with this task + * and put them back on the free list. Do not insert an exit probe for + * this function, it will be disabled by kprobe_flush_task if you do. + */ + kprobe_flush_task(me); + if (me->thread.io_bitmap_ptr) { struct tss_struct *tss = &per_cpu(init_tss, get_cpu()); @@ -312,6 +348,13 @@ void flush_thread(void) struct task_struct *tsk = current; struct thread_info *t = current_thread_info(); + /* + * Remove function-return probe instances associated with this task + * and put them back on the free list. Do not insert an exit probe for + * this function, it will be disabled by kprobe_flush_task if you do. + */ + kprobe_flush_task(tsk); + if (t->flags & _TIF_ABI_PENDING) t->flags ^= (_TIF_ABI_PENDING | _TIF_IA32); @@ -439,6 +482,33 @@ out: } /* + * This function selects if the context switch from prev to next + * has to tweak the TSC disable bit in the cr4. + */ +static inline void disable_tsc(struct task_struct *prev_p, + struct task_struct *next_p) +{ + struct thread_info *prev, *next; + + /* + * gcc should eliminate the ->thread_info dereference if + * has_secure_computing returns 0 at compile time (SECCOMP=n). + */ + prev = prev_p->thread_info; + next = next_p->thread_info; + + if (has_secure_computing(prev) || has_secure_computing(next)) { + /* slow path here */ + if (has_secure_computing(prev) && + !has_secure_computing(next)) { + write_cr4(read_cr4() & ~X86_CR4_TSD); + } else if (!has_secure_computing(prev) && + has_secure_computing(next)) + write_cr4(read_cr4() | X86_CR4_TSD); + } +} + +/* * This special macro can be used to load a debugging register */ #define loaddebug(thread,r) set_debug(thread->debugreg ## r, r) @@ -556,6 +626,8 @@ struct task_struct *__switch_to(struct task_struct *prev_p, struct task_struct * } } + disable_tsc(prev_p, next_p); + return prev_p; } diff --git a/arch/x86_64/kernel/reboot.c b/arch/x86_64/kernel/reboot.c index be4b36f762c..47f95687905 100644 --- a/arch/x86_64/kernel/reboot.c +++ b/arch/x86_64/kernel/reboot.c @@ -66,34 +66,6 @@ static int __init reboot_setup(char *str) __setup("reboot=", reboot_setup); -#ifdef CONFIG_SMP -static void smp_halt(void) -{ - int cpuid = safe_smp_processor_id(); - static int first_entry = 1; - - if (reboot_force) - return; - - if (first_entry) { - first_entry = 0; - smp_call_function((void *)machine_restart, NULL, 1, 0); - } - - smp_stop_cpu(); - - /* AP calling this. 
Just halt */ - if (cpuid != boot_cpu_id) { - for (;;) - asm("hlt"); - } - - /* Wait for all other CPUs to have run smp_stop_cpu */ - while (!cpus_empty(cpu_online_map)) - rep_nop(); -} -#endif - static inline void kb_wait(void) { int i; @@ -103,25 +75,44 @@ static inline void kb_wait(void) break; } -void machine_restart(char * __unused) +void machine_shutdown(void) { - int i; + /* Stop the cpus and apics */ +#ifdef CONFIG_SMP + int reboot_cpu_id; - printk("machine restart\n"); + /* The boot cpu is always logical cpu 0 */ + reboot_cpu_id = 0; -#ifdef CONFIG_SMP - smp_halt(); + /* Make certain the cpu I'm about to reboot on is online */ + if (!cpu_isset(reboot_cpu_id, cpu_online_map)) { + reboot_cpu_id = smp_processor_id(); + } + + /* Make certain I only run on the appropriate processor */ + set_cpus_allowed(current, cpumask_of_cpu(reboot_cpu_id)); + + /* O.K. Now that I'm on the appropriate processor, + * stop all of the others. + */ + smp_send_stop(); #endif - if (!reboot_force) { - local_irq_disable(); + local_irq_disable(); + #ifndef CONFIG_SMP - disable_local_APIC(); + disable_local_APIC(); #endif - disable_IO_APIC(); - local_irq_enable(); - } - + + disable_IO_APIC(); + + local_irq_enable(); +} + +void machine_emergency_restart(void) +{ + int i; + /* Tell the BIOS if we want cold or warm reboot */ *((unsigned short *)__va(0x472)) = reboot_mode; @@ -146,18 +137,26 @@ } -EXPORT_SYMBOL(machine_restart); +void machine_restart(char * __unused) +{ + printk("machine restart\n"); + + if (!reboot_force) { + machine_shutdown(); + } + machine_emergency_restart(); +} void machine_halt(void) { } -EXPORT_SYMBOL(machine_halt); - void machine_power_off(void) { + if (!reboot_force) { + machine_shutdown(); + } if (pm_power_off) pm_power_off(); } -EXPORT_SYMBOL(machine_power_off); diff --git a/arch/x86_64/kernel/relocate_kernel.S b/arch/x86_64/kernel/relocate_kernel.S new file mode 100644 index 00000000000..d24fa9b72a2 --- /dev/null +++ b/arch/x86_64/kernel/relocate_kernel.S @@ -0,0 +1,143 @@ +/* + * relocate_kernel.S - put the kernel image in place to boot + * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com> + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + +#include <linux/linkage.h> + + /* + * Must be relocatable PIC code callable as a C function, that once + * it starts cannot use the previous process's stack. + */ + .globl relocate_new_kernel + .code64 +relocate_new_kernel: + /* %rdi page_list + * %rsi reboot_code_buffer + * %rdx start address + * %rcx page_table + * %r8 arg5 + * %r9 arg6 + */ + + /* zero out flags, and disable interrupts */ + pushq $0 + popfq + + /* set a new stack at the bottom of our page... */ + lea 4096(%rsi), %rsp + + /* store the parameters back on the stack */ + pushq %rdx /* store the start address */ + + /* Set cr0 to a known state: + * 31 1 == Paging enabled + * 18 0 == Alignment check disabled + * 16 0 == Write protect disabled + * 3 0 == No task switch + * 2 0 == Don't do FP software emulation.
+ * 0 1 == Protected mode enabled + */ + movq %cr0, %rax + andq $~((1<<18)|(1<<16)|(1<<3)|(1<<2)), %rax + orl $((1<<31)|(1<<0)), %eax + movq %rax, %cr0 + + /* Set cr4 to a known state: + * 10 0 == xmm exceptions disabled + * 9 0 == xmm registers instructions disabled + * 8 0 == performance monitoring counter disabled + * 7 0 == page global disabled + * 6 0 == machine check exceptions disabled + * 5 1 == physical address extension enabled + * 4 0 == page size extensions disabled + * 3 0 == Debug extensions disabled + * 2 0 == Time stamp disable (disabled) + * 1 0 == Protected mode virtual interrupts disabled + * 0 0 == VME disabled + */ + + movq $((1<<5)), %rax + movq %rax, %cr4 + + jmp 1f +1: + + /* Switch to the identity mapped page tables, + * and flush the TLB. + */ + movq %rcx, %cr3 + + /* Do the copies */ + movq %rdi, %rcx /* Put the page_list in %rcx */ + xorq %rdi, %rdi + xorq %rsi, %rsi + jmp 1f + +0: /* top, read another word for the indirection page */ + + movq (%rbx), %rcx + addq $8, %rbx +1: + testq $0x1, %rcx /* is it a destination page? */ + jz 2f + movq %rcx, %rdi + andq $0xfffffffffffff000, %rdi + jmp 0b +2: + testq $0x2, %rcx /* is it an indirection page? */ + jz 2f + movq %rcx, %rbx + andq $0xfffffffffffff000, %rbx + jmp 0b +2: + testq $0x4, %rcx /* is it the done indicator? */ + jz 2f + jmp 3f +2: + testq $0x8, %rcx /* is it the source indicator? */ + jz 0b /* Ignore it otherwise */ + movq %rcx, %rsi /* For every source page do a copy */ + andq $0xfffffffffffff000, %rsi + + movq $512, %rcx + rep ; movsq + jmp 0b +3: + + /* To be certain of avoiding problems with self-modifying code + * I need to execute a serializing instruction here. + * So I flush the TLB by reloading %cr3 here, it's handy, + * and not processor dependent. + */ + movq %cr3, %rax + movq %rax, %cr3 + + /* set all of the registers to known values */ + /* leave %rsp alone */ + + xorq %rax, %rax + xorq %rbx, %rbx + xorq %rcx, %rcx + xorq %rdx, %rdx + xorq %rsi, %rsi + xorq %rdi, %rdi + xorq %rbp, %rbp + xorq %r8, %r8 + xorq %r9, %r9 + xorq %r10, %r10 + xorq %r11, %r11 + xorq %r12, %r12 + xorq %r13, %r13 + xorq %r14, %r14 + xorq %r15, %r15 + + ret +relocate_new_kernel_end: + + .globl relocate_new_kernel_size +relocate_new_kernel_size: + .quad relocate_new_kernel_end - relocate_new_kernel diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 99f038ede23..116a491e296 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -40,6 +40,9 @@ #include <linux/acpi.h> #include <linux/kallsyms.h> #include <linux/edd.h> +#include <linux/mmzone.h> +#include <linux/kexec.h> + #include <asm/mtrr.h> #include <asm/uaccess.h> #include <asm/system.h> @@ -348,7 +351,7 @@ static __init void parse_cmdline_early (char ** cmdline_p) if (!memcmp(from, "mem=", 4)) parse_memopt(from+4, &from); -#ifdef CONFIG_DISCONTIGMEM +#ifdef CONFIG_NUMA if (!memcmp(from, "numa=", 5)) numa_setup(from+5); #endif @@ -365,6 +368,27 @@ static __init void parse_cmdline_early (char ** cmdline_p) if (!memcmp(from, "noexec=", 7)) nonx_setup(from + 7); +#ifdef CONFIG_KEXEC + /* crashkernel=size@addr specifies the location to reserve for + * a crash kernel. By reserving this memory we guarantee + * that linux never sets it up as a DMA target. + * Useful for holding code to do something appropriate + * after a kernel panic.
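+ * For example, booting with crashkernel=64M@16M asks for 64MB at
+ * physical address 16MB; setup_arch() later calls reserve_bootmem()
+ * on that range so nothing else can allocate it.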
+ */ + else if (!memcmp(from, "crashkernel=", 12)) { + unsigned long size, base; + size = memparse(from+12, &from); + if (*from == '@') { + base = memparse(from+1, &from); + /* FIXME: Do I want a sanity check + * to validate the memory range? + */ + crashk_res.start = base; + crashk_res.end = base + size - 1; + } + } +#endif + next_char: c = *(from++); if (!c) @@ -377,17 +401,20 @@ static __init void parse_cmdline_early (char ** cmdline_p) *cmdline_p = command_line; } -#ifndef CONFIG_DISCONTIGMEM -static void __init contig_initmem_init(void) +#ifndef CONFIG_NUMA +static void __init +contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn) { - unsigned long bootmap_size, bootmap; - bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT; - bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size); - if (bootmap == -1L) - panic("Cannot find bootmem map of size %ld\n",bootmap_size); - bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn); - e820_bootmem_free(&contig_page_data, 0, end_pfn << PAGE_SHIFT); - reserve_bootmem(bootmap, bootmap_size); + unsigned long bootmap_size, bootmap; + + memory_present(0, start_pfn, end_pfn); + bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT; + bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size); + if (bootmap == -1L) + panic("Cannot find bootmem map of size %ld\n",bootmap_size); + bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn); + e820_bootmem_free(NODE_DATA(0), 0, end_pfn << PAGE_SHIFT); + reserve_bootmem(bootmap, bootmap_size); } #endif @@ -554,10 +581,10 @@ void __init setup_arch(char **cmdline_p) acpi_numa_init(); #endif -#ifdef CONFIG_DISCONTIGMEM +#ifdef CONFIG_NUMA numa_initmem_init(0, end_pfn); #else - contig_initmem_init(); + contig_initmem_init(0, end_pfn); #endif /* Reserve direct mapping */ @@ -618,6 +645,15 @@ void __init setup_arch(char **cmdline_p) } } #endif +#ifdef CONFIG_KEXEC + if (crashk_res.start != crashk_res.end) { + reserve_bootmem(crashk_res.start, + crashk_res.end - crashk_res.start + 1); + } +#endif + + sparse_init(); + paging_init(); check_ioapic(); @@ -669,7 +705,7 @@ void __init setup_arch(char **cmdline_p) #endif } -static int __init get_model_name(struct cpuinfo_x86 *c) +static int __cpuinit get_model_name(struct cpuinfo_x86 *c) { unsigned int *v; @@ -685,7 +721,7 @@ static int __init get_model_name(struct cpuinfo_x86 *c) } -static void __init display_cacheinfo(struct cpuinfo_x86 *c) +static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) { unsigned int n, dummy, eax, ebx, ecx, edx; @@ -729,8 +765,6 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c) int cpu = smp_processor_id(); int node = 0; unsigned bits; - if (c->x86_num_cores == 1) - return; bits = 0; while ((1 << bits) < c->x86_num_cores) @@ -796,7 +830,7 @@ static int __init init_amd(struct cpuinfo_x86 *c) return r; } -static void __init detect_ht(struct cpuinfo_x86 *c) +static void __cpuinit detect_ht(struct cpuinfo_x86 *c) { #ifdef CONFIG_SMP u32 eax, ebx, ecx, edx; @@ -857,7 +891,7 @@ static void __init detect_ht(struct cpuinfo_x86 *c) /* * find out the number of processor cores on the die */ -static int __init intel_num_cpu_cores(struct cpuinfo_x86 *c) +static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c) { unsigned int eax; @@ -875,7 +909,7 @@ static int __init intel_num_cpu_cores(struct cpuinfo_x86 *c) return 1; } -static void __init init_intel(struct cpuinfo_x86 *c) +static void __cpuinit init_intel(struct cpuinfo_x86 *c) { /* Cache sizes */ unsigned n; @@ -895,7 +929,7 @@ static 
void __init init_intel(struct cpuinfo_x86 *c) c->x86_num_cores = intel_num_cpu_cores(c); } -void __init get_cpu_vendor(struct cpuinfo_x86 *c) +void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) { char *v = c->x86_vendor_id; @@ -916,7 +950,7 @@ struct cpu_model_info { /* Do some early cpuid on the boot CPU to get some parameter that are needed before check_bugs. Everything advanced is in identify_cpu below. */ -void __init early_identify_cpu(struct cpuinfo_x86 *c) +void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) { u32 tfms; @@ -970,7 +1004,7 @@ void __init early_identify_cpu(struct cpuinfo_x86 *c) /* * This does the hard work of actually picking apart the CPU stuff... */ -void __init identify_cpu(struct cpuinfo_x86 *c) +void __cpuinit identify_cpu(struct cpuinfo_x86 *c) { int i; u32 xlvl; @@ -1040,14 +1074,17 @@ void __init identify_cpu(struct cpuinfo_x86 *c) #ifdef CONFIG_X86_MCE mcheck_init(c); #endif + if (c == &boot_cpu_data) + mtrr_bp_init(); + else + mtrr_ap_init(); #ifdef CONFIG_NUMA - if (c != &boot_cpu_data) - numa_add_cpu(c - cpu_data); + numa_add_cpu(smp_processor_id()); #endif } -void __init print_cpu_info(struct cpuinfo_x86 *c) +void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) { if (c->x86_model_id[0]) printk("%s", c->x86_model_id); diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c index 678b7ac33b8..34082c1cc41 100644 --- a/arch/x86_64/kernel/setup64.c +++ b/arch/x86_64/kernel/setup64.c @@ -12,6 +12,7 @@ #include <linux/string.h> #include <linux/bootmem.h> #include <linux/bitops.h> +#include <linux/module.h> #include <asm/bootsetup.h> #include <asm/pda.h> #include <asm/pgtable.h> @@ -22,22 +23,15 @@ #include <asm/smp.h> #include <asm/i387.h> #include <asm/percpu.h> -#include <asm/mtrr.h> #include <asm/proto.h> -#include <asm/mman.h> -#include <asm/numa.h> +#include <asm/sections.h> char x86_boot_params[BOOT_PARAM_SIZE] __initdata = {0,}; -cpumask_t cpu_initialized __initdata = CPU_MASK_NONE; +cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; struct x8664_pda cpu_pda[NR_CPUS] __cacheline_aligned; -extern struct task_struct init_task; - -extern unsigned char __per_cpu_start[], __per_cpu_end[]; - -extern struct desc_ptr cpu_gdt_descr[]; struct desc_ptr idt_descr = { 256 * 16, (unsigned long) idt_table }; char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned"))); @@ -101,7 +95,7 @@ void __init setup_per_cpu_areas(void) #endif for (i = 0; i < NR_CPUS; i++) { - unsigned char *ptr; + char *ptr; if (!NODE_DATA(cpu_to_node(i))) { printk("cpu with no node %d, num_online_nodes %d\n", @@ -171,7 +165,7 @@ void syscall_init(void) wrmsrl(MSR_SYSCALL_MASK, EF_TF|EF_DF|EF_IE|0x3000); } -void __init check_efer(void) +void __cpuinit check_efer(void) { unsigned long efer; @@ -188,13 +182,9 @@ void __init check_efer(void) * 'CPU state barrier', nothing should get across. * A lot of state is already set up in PDA init. 
*/ -void __init cpu_init (void) +void __cpuinit cpu_init (void) { -#ifdef CONFIG_SMP int cpu = stack_smp_processor_id(); -#else - int cpu = smp_processor_id(); -#endif struct tss_struct *t = &per_cpu(init_tss, cpu); unsigned long v; char *estacks = NULL; @@ -214,7 +204,7 @@ void __init cpu_init (void) printk("Initializing CPU#%d\n", cpu); - clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); + clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); /* * Initialize the per-CPU GDT with the boot GDT, diff --git a/arch/x86_64/kernel/signal.c b/arch/x86_64/kernel/signal.c index 429c0269dc4..98590a989f3 100644 --- a/arch/x86_64/kernel/signal.c +++ b/arch/x86_64/kernel/signal.c @@ -34,9 +34,9 @@ #define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) -void ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, +int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, sigset_t *set, struct pt_regs * regs); -void ia32_setup_frame(int sig, struct k_sigaction *ka, +int ia32_setup_frame(int sig, struct k_sigaction *ka, sigset_t *set, struct pt_regs * regs); asmlinkage long @@ -238,7 +238,7 @@ get_stack(struct k_sigaction *ka, struct pt_regs *regs, unsigned long size) return (void __user *)round_down(rsp - size, 16); } -static void setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, +static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, sigset_t *set, struct pt_regs * regs) { struct rt_sigframe __user *frame; @@ -327,20 +327,23 @@ static void setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, current->comm, current->pid, frame, regs->rip, frame->pretcode); #endif - return; + return 1; give_sigsegv: force_sigsegv(sig, current); + return 0; } /* * OK, we're invoking a handler */ -static void +static int handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, sigset_t *oldset, struct pt_regs *regs) { + int ret; + #ifdef DEBUG_SIG printk("handle_signal pid:%d sig:%lu rip:%lx rsp:%lx regs=%p\n", current->pid, sig, @@ -384,20 +387,22 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, #ifdef CONFIG_IA32_EMULATION if (test_thread_flag(TIF_IA32)) { if (ka->sa.sa_flags & SA_SIGINFO) - ia32_setup_rt_frame(sig, ka, info, oldset, regs); + ret = ia32_setup_rt_frame(sig, ka, info, oldset, regs); else - ia32_setup_frame(sig, ka, oldset, regs); + ret = ia32_setup_frame(sig, ka, oldset, regs); } else #endif - setup_rt_frame(sig, ka, info, oldset, regs); + ret = setup_rt_frame(sig, ka, info, oldset, regs); - if (!(ka->sa.sa_flags & SA_NODEFER)) { + if (ret && !(ka->sa.sa_flags & SA_NODEFER)) { spin_lock_irq(¤t->sighand->siglock); sigorsets(¤t->blocked,¤t->blocked,&ka->sa.sa_mask); sigaddset(¤t->blocked,sig); recalc_sigpending(); spin_unlock_irq(¤t->sighand->siglock); } + + return ret; } /* @@ -417,10 +422,10 @@ int do_signal(struct pt_regs *regs, sigset_t *oldset) * kernel mode. Just return without doing anything * if so. */ - if ((regs->cs & 3) != 3) + if (!user_mode(regs)) return 1; - if (try_to_freeze(0)) + if (try_to_freeze()) goto no_signal; if (!oldset) @@ -434,11 +439,10 @@ int do_signal(struct pt_regs *regs, sigset_t *oldset) * inside the kernel. */ if (current->thread.debugreg7) - asm volatile("movq %0,%%db7" : : "r" (current->thread.debugreg7)); + set_debugreg(current->thread.debugreg7, 7); /* Whee! Actually deliver the signal. 
 */ - handle_signal(signr, &info, &ka, oldset, regs); - return 1; + return handle_signal(signr, &info, &ka, oldset, regs); } no_signal: diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c index 1e379ed17b1..e5958220d6b 100644 --- a/arch/x86_64/kernel/smp.c +++ b/arch/x86_64/kernel/smp.c @@ -129,10 +129,9 @@ asmlinkage void smp_invalidate_interrupt (void) } else leave_mm(cpu); } +out: ack_APIC_irq(); cpu_clear(cpu, flush_cpumask); - -out: put_cpu_no_resched(); } @@ -283,6 +282,79 @@ struct call_data_struct { static struct call_data_struct * call_data; +void lock_ipi_call_lock(void) +{ + spin_lock_irq(&call_lock); +} + +void unlock_ipi_call_lock(void) +{ + spin_unlock_irq(&call_lock); +} + +/* + * this function sends a 'generic call function' IPI to one other CPU + * in the system. + */ +static void __smp_call_function_single (int cpu, void (*func) (void *info), void *info, + int nonatomic, int wait) +{ + struct call_data_struct data; + int cpus = 1; + + data.func = func; + data.info = info; + atomic_set(&data.started, 0); + data.wait = wait; + if (wait) + atomic_set(&data.finished, 0); + + call_data = &data; + wmb(); + /* Send a message to all other CPUs and wait for them to respond */ + send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_VECTOR); + + /* Wait for response */ + while (atomic_read(&data.started) != cpus) + cpu_relax(); + + if (!wait) + return; + + while (atomic_read(&data.finished) != cpus) + cpu_relax(); +} + +/* + * smp_call_function_single - Run a function on another CPU + * @func: The function to run. This must be fast and non-blocking. + * @info: An arbitrary pointer to pass to the function. + * @nonatomic: Currently unused. + * @wait: If true, wait until function has completed on other CPUs. + * + * Returns 0 on success, else a negative status code. + * + * Does not return until the remote CPU is nearly ready to execute <func> + * or has already executed it. + */ + +int smp_call_function_single (int cpu, void (*func) (void *info), void *info, + int nonatomic, int wait) +{ + /* prevent preemption and reschedule on another processor */ + int me = get_cpu(); + if (cpu == me) { + WARN_ON(1); + put_cpu(); + return -EBUSY; + } + spin_lock_bh(&call_lock); + __smp_call_function_single(cpu, func, info, nonatomic, wait); + spin_unlock_bh(&call_lock); + put_cpu(); + return 0; +} + /* * this function sends a 'generic call function' IPI to all other CPUs * in the system.
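The smp_call_function_single() helper added above runs func on exactly one other CPU, serialized against concurrent cross-calls by call_lock. As a minimal sketch only (not part of this patch; the target CPU number and the function names are made up), a caller would look roughly like this:

static void remote_who(void *info)
{
	/* Runs on the target CPU in IPI context: keep it short and
	 * non-blocking, exactly as the comment above demands. */
	*(int *)info = smp_processor_id();
}

static int ask_cpu1(void)
{
	int who = -1;

	/* Run remote_who() on CPU 1 and wait for it to finish.
	 * Calling this for the current CPU returns -EBUSY (with a
	 * WARN_ON), and spin_lock_bh() means: no IRQ context here. */
	if (smp_call_function_single(1, remote_who, &who, 0, 1) < 0)
		return -1;
	return who;		/* 1 on success */
}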
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index f1ec0f34594..6e4807d64d4 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -34,6 +34,7 @@ * Andi Kleen : Converted to new state machine. * Various cleanups. * Probably mostly hotplug CPU ready now. + * Ashok Raj : CPU hotplug support */ @@ -58,11 +59,6 @@ #include <asm/proto.h> #include <asm/nmi.h> -/* Change for real CPU hotplug. Note other files need to be fixed - first too. */ -#define __cpuinit __init -#define __cpuinitdata __initdata - /* Number of siblings per CPU package */ int smp_num_siblings = 1; /* Package ID of each logical CPU */ @@ -103,6 +99,19 @@ EXPORT_SYMBOL(cpu_core_map); extern unsigned char trampoline_data[]; extern unsigned char trampoline_end[]; +/* State of each CPU */ +DEFINE_PER_CPU(int, cpu_state) = { 0 }; + +/* + * Store all idle threads; these can be reused instead of creating + * a new thread. This also avoids complicated thread-destroy functionality + * for idle threads. + */ +struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata; + +#define get_idle_for_cpu(x) (idle_thread_array[(x)]) +#define set_idle_for_cpu(x,p) (idle_thread_array[(x)] = (p)) + /* * Currently trivial. Write the real->protected mode * bootstrap into the page concerned. The caller * @@ -202,9 +211,6 @@ static __cpuinit void sync_master(void *arg) { unsigned long flags, i; - if (smp_processor_id() != boot_cpu_id) - return; - go[MASTER] = 0; local_irq_save(flags); @@ -253,12 +259,12 @@ get_delta(long *rt, long *master) return tcenter - best_tm; } -static __cpuinit void sync_tsc(void) +static __cpuinit void sync_tsc(unsigned int master) { int i, done = 0; long delta, adj, adjust_latency = 0; unsigned long flags, rt, master_time_stamp, bound; -#if DEBUG_TSC_SYNC +#ifdef DEBUG_TSC_SYNC static struct syncdebug { long rt; /* roundtrip time */ long master; /* master's timestamp */ @@ -267,9 +273,17 @@ static __cpuinit void sync_tsc(void) } t[NUM_ROUNDS] __cpuinitdata; #endif + printk(KERN_INFO "CPU %d: Syncing TSC to CPU %u.\n", + smp_processor_id(), master); + go[MASTER] = 1; - smp_call_function(sync_master, NULL, 1, 0); + /* It is dangerous to broadcast an IPI while cpus are coming up, + * as they may not be ready to accept it. Since we only need to + * reach the boot cpu here, direct the message to it and avoid + * the race. + */ + smp_call_function_single(master, sync_master, NULL, 1, 0); while (go[MASTER]) /* wait for master to be ready */ no_cpu_relax(); @@ -294,7 +308,7 @@ static __cpuinit void sync_tsc(void) rdtscll(t); wrmsrl(MSR_IA32_TSC, t + adj); } -#if DEBUG_TSC_SYNC +#ifdef DEBUG_TSC_SYNC t[i].rt = rt; t[i].master = master_time_stamp; t[i].diff = delta; @@ -304,7 +318,7 @@ } spin_unlock_irqrestore(&tsc_sync_lock, flags); -#if DEBUG_TSC_SYNC +#ifdef DEBUG_TSC_SYNC for (i = 0; i < NUM_ROUNDS; ++i) printk("rt=%5ld master=%5ld diff=%5ld adjlat=%5ld\n", t[i].rt, t[i].master, t[i].diff, t[i].lat); @@ -313,16 +327,14 @@ printk(KERN_INFO "CPU %d: synchronized TSC with CPU %u (last diff %ld cycles, " "maxerr %lu cycles)\n", - smp_processor_id(), boot_cpu_id, delta, rt); + smp_processor_id(), master, delta, rt); } static void __cpuinit tsc_sync_wait(void) { if (notscsync || !cpu_has_tsc) return; - printk(KERN_INFO "CPU %d: Syncing TSC to CPU %u.\n", smp_processor_id(), - boot_cpu_id); - sync_tsc(); + sync_tsc(boot_cpu_id); } static __init int notscsync_setup(char *s) @@ -418,6 +430,33 @@ void __cpuinit smp_callin(void) cpu_set(cpuid, cpu_callin_map); } +static inline void set_cpu_sibling_map(int cpu) +{ + int i; + + if (smp_num_siblings > 1) { + for_each_cpu(i) { + if (cpu_core_id[cpu] == cpu_core_id[i]) { + cpu_set(i, cpu_sibling_map[cpu]); + cpu_set(cpu, cpu_sibling_map[i]); + } + } + } else { + cpu_set(cpu, cpu_sibling_map[cpu]); + } + + if (current_cpu_data.x86_num_cores > 1) { + for_each_cpu(i) { + if (phys_proc_id[cpu] == phys_proc_id[i]) { + cpu_set(i, cpu_core_map[cpu]); + cpu_set(cpu, cpu_core_map[i]); + } + } + } else { + cpu_core_map[cpu] = cpu_sibling_map[cpu]; + } +} +
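set_cpu_sibling_map() builds two relations that the scheduler and /proc code consume: cpu_sibling_map[] groups the hyperthread siblings of one core, cpu_core_map[] groups the cores of one physical package. A hypothetical consumer, shown only to make the semantics concrete (cpu_sibling_map itself is filled by the code above):

static int threads_per_core(int cpu)
{
	int i, n = 0;

	/* Walk the sibling mask the same way remove_siblinginfo()
	 * does later in this patch. */
	for_each_cpu_mask(i, cpu_sibling_map[cpu])
		n++;
	return n;	/* 2 on an HT pair, 1 otherwise */
}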
 /* * Setup code on secondary processor (after coming out of the trampoline) */ @@ -448,9 +487,28 @@ void __cpuinit start_secondary(void) enable_APIC_timer(); /* + * The sibling maps must be set before turning the online map on for + * this cpu + */ + set_cpu_sibling_map(smp_processor_id()); + + /* + * We need to hold call_lock, so there is no inconsistency + * between the time smp_call_function() determines the number of + * IPI recipients, and the time when the determination is made + * for which cpus receive the IPI in genapic_flat.c. Holding this + * lock helps us to not include this cpu in a currently in progress + * smp_call_function(). + */ + lock_ipi_call_lock(); + + /* * Allow the master to continue. */ cpu_set(smp_processor_id(), cpu_online_map); + per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; + unlock_ipi_call_lock(); + mb(); /* Wait for TSC sync to not schedule things before. @@ -464,7 +522,7 @@ void __cpuinit start_secondary(void) extern volatile unsigned long init_rsp; extern void (*initial_code)(void); -#if APIC_DEBUG +#ifdef APIC_DEBUG static void inquire_remote_apic(int apicid) { unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 }; @@ -628,36 +686,81 @@ static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int sta return (send_status | accept_status); } +struct create_idle { + struct task_struct *idle; + struct completion done; + int cpu; +}; + +void do_fork_idle(void *_c_idle) +{ + struct create_idle *c_idle = _c_idle; + + c_idle->idle = fork_idle(c_idle->cpu); + complete(&c_idle->done); +} + /* * Boot one CPU. */ static int __cpuinit do_boot_cpu(int cpu, int apicid) { - struct task_struct *idle; unsigned long boot_error; int timeout; unsigned long start_rip; + struct create_idle c_idle = { + .cpu = cpu, + .done = COMPLETION_INITIALIZER(c_idle.done), + }; + DECLARE_WORK(work, do_fork_idle, &c_idle); + + c_idle.idle = get_idle_for_cpu(cpu); + + if (c_idle.idle) { + c_idle.idle->thread.rsp = (unsigned long) (((struct pt_regs *) + (THREAD_SIZE + (unsigned long) c_idle.idle->thread_info)) - 1); + init_idle(c_idle.idle, cpu); + goto do_rest; + } + /* - * We can't use kernel_thread since we must avoid to - * reschedule the child. + * During the cold boot process, the keventd thread is not spun up yet. + * When we do cpu hot-add, we create idle threads on the fly; we should + * not acquire any attributes from the calling context. Hence the clean + * way to create kernel threads is to do that from keventd(). + * We do the current_is_keventd() check because the ACPI notifier was + * also queuing to keventd(), and when the caller is already running + * in the context of keventd(), we would end up locking up the keventd + * thread.
 */ - idle = fork_idle(cpu); - if (IS_ERR(idle)) { + if (!keventd_up() || current_is_keventd()) + work.func(work.data); + else { + schedule_work(&work); + wait_for_completion(&c_idle.done); + } + + if (IS_ERR(c_idle.idle)) { printk("failed fork for CPU %d\n", cpu); - return PTR_ERR(idle); + return PTR_ERR(c_idle.idle); } - cpu_pda[cpu].pcurrent = idle; + set_idle_for_cpu(cpu, c_idle.idle); + +do_rest: + + cpu_pda[cpu].pcurrent = c_idle.idle; start_rip = setup_trampoline(); - init_rsp = idle->thread.rsp; + init_rsp = c_idle.idle->thread.rsp; per_cpu(init_tss,cpu).rsp0 = init_rsp; initial_code = start_secondary; - clear_ti_thread_flag(idle->thread_info, TIF_FORK); + clear_ti_thread_flag(c_idle.idle->thread_info, TIF_FORK); - printk(KERN_INFO "Booting processor %d/%d rip %lx rsp %lx\n", cpu, apicid, - start_rip, init_rsp); + printk(KERN_INFO "Booting processor %d/%d APIC 0x%x\n", cpu, + cpus_weight(cpu_present_map), + apicid); /* * This grunge runs the startup process for @@ -724,7 +827,7 @@ static int __cpuinit do_boot_cpu(int cpu, int apicid) else /* trampoline code not run */ printk("Not responding.\n"); -#if APIC_DEBUG +#ifdef APIC_DEBUG inquire_remote_apic(apicid); #endif }
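The fork-through-keventd dance above is a generic idiom: queue a work item, then block on a completion that the work item signals. Stripped of the CPU-boot specifics, and with hypothetical names, the pattern looks like this:

struct my_req {
	struct completion done;
	long result;
};

static void my_worker(void *arg)
{
	struct my_req *req = arg;

	req->result = 42;	/* runs in keventd's context */
	complete(&req->done);
}

static long run_in_keventd(void)
{
	struct my_req req = { .done = COMPLETION_INITIALIZER(req.done) };
	DECLARE_WORK(work, my_worker, &req);

	if (!keventd_up() || current_is_keventd())
		work.func(work.data);	/* too early in boot, or we are keventd */
	else {
		schedule_work(&work);
		wait_for_completion(&req.done);
	}
	return req.result;
}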
@@ -746,51 +849,6 @@ cycles_t cacheflush_time; unsigned long cache_decay_ticks; /* - * Construct cpu_sibling_map[], so that we can tell the sibling CPU - * on SMT systems efficiently. - */ -static __cpuinit void detect_siblings(void) -{ - int cpu; - - for (cpu = 0; cpu < NR_CPUS; cpu++) { - cpus_clear(cpu_sibling_map[cpu]); - cpus_clear(cpu_core_map[cpu]); - } - - for_each_online_cpu (cpu) { - struct cpuinfo_x86 *c = cpu_data + cpu; - int siblings = 0; - int i; - if (smp_num_siblings > 1) { - for_each_online_cpu (i) { - if (cpu_core_id[cpu] == cpu_core_id[i]) { - siblings++; - cpu_set(i, cpu_sibling_map[cpu]); - } - } - } else { - siblings++; - cpu_set(cpu, cpu_sibling_map[cpu]); - } - - if (siblings != smp_num_siblings) { - printk(KERN_WARNING - "WARNING: %d siblings found for CPU%d, should be %d\n", - siblings, cpu, smp_num_siblings); - smp_num_siblings = siblings; - } - if (c->x86_num_cores > 1) { - for_each_online_cpu(i) { - if (phys_proc_id[cpu] == phys_proc_id[i]) - cpu_set(i, cpu_core_map[cpu]); - } - } else - cpu_core_map[cpu] = cpu_sibling_map[cpu]; - } -} - -/* * Cleanup possible dangling ends... */ static __cpuinit void smp_cleanup_boot(void) @@ -823,7 +881,7 @@ * * RED-PEN audit/test this more. I bet there is more state messed up here. */ -static __cpuinit void disable_smp(void) +static __init void disable_smp(void) { cpu_present_map = cpumask_of_cpu(0); cpu_possible_map = cpumask_of_cpu(0); @@ -838,7 +896,7 @@ /* * Handle user cpus=... parameter. */ -static __cpuinit void enforce_max_cpus(unsigned max_cpus) +static __init void enforce_max_cpus(unsigned max_cpus) { int i, k; k = 0; @@ -852,10 +910,31 @@ } } +#ifdef CONFIG_HOTPLUG_CPU +/* + * cpu_possible_map should be static: it cannot change as cpus + * are onlined or offlined. The reason is that per-cpu data structures + * are allocated by some modules at init time, and they do not expect to + * do this dynamically on cpu arrival/departure. + * cpu_present_map, on the other hand, can change dynamically. + * When cpu hotplug is not compiled in, we resort to the current + * behaviour, which is cpu_possible == cpu_present. + * If cpu hotplug is supported, then we need to preallocate for all + * those NR_CPUS, hence cpu_possible_map represents the entire NR_CPUS range. + * - Ashok Raj + */ +static void prefill_possible_map(void) +{ + int i; + for (i = 0; i < NR_CPUS; i++) + cpu_set(i, cpu_possible_map); +} +#endif + /* * Various sanity checks. */ -static int __cpuinit smp_sanity_check(unsigned max_cpus) +static int __init smp_sanity_check(unsigned max_cpus) { if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) { printk("weird, boot CPU (#%d) not listed by the BIOS.\n", @@ -913,28 +992,17 @@ * Prepare for SMP bootup. The MP table or ACPI has been read * earlier. Just do some sanity checking here and enable APIC mode. */ -void __cpuinit smp_prepare_cpus(unsigned int max_cpus) +void __init smp_prepare_cpus(unsigned int max_cpus) { - int i; - nmi_watchdog_default(); current_cpu_data = boot_cpu_data; current_thread_info()->cpu = 0; /* needed? */ enforce_max_cpus(max_cpus); - /* - * Fill in cpu_present_mask - */ - for (i = 0; i < NR_CPUS; i++) { - int apicid = cpu_present_to_apicid(i); - if (physid_isset(apicid, phys_cpu_present_map)) { - cpu_set(i, cpu_present_map); - /* possible map would be different if we supported real - CPU hotplug. */ - cpu_set(i, cpu_possible_map); - } - } +#ifdef CONFIG_HOTPLUG_CPU + prefill_possible_map(); +#endif if (smp_sanity_check(max_cpus) < 0) { printk(KERN_INFO "SMP disabled\n"); @@ -978,13 +1046,13 @@ void __init smp_prepare_boot_cpu(void) int me = smp_processor_id(); cpu_set(me, cpu_online_map); cpu_set(me, cpu_callout_map); + cpu_set(0, cpu_sibling_map[0]); + cpu_set(0, cpu_core_map[0]); + per_cpu(cpu_state, me) = CPU_ONLINE; } /* * Entry point to boot a CPU. - * - * This is all __cpuinit, not __devinit for now because we don't support - * CPU hotplug (yet). */ int __cpuinit __cpu_up(unsigned int cpu) { @@ -1001,6 +1069,15 @@ int __cpuinit __cpu_up(unsigned int cpu) return -EINVAL; } + /* + * Already booted CPU? + */ + if (cpu_isset(cpu, cpu_callin_map)) { + Dprintk("do_boot_cpu %d Already started\n", cpu); + return -ENOSYS; + } + + per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; /* Boot it! */ err = do_boot_cpu(cpu, apicid); if (err < 0) { @@ -1013,23 +1090,117 @@ while (!cpu_isset(cpu, cpu_online_map)) cpu_relax(); - return 0; + err = 0; + + return err; } /* * Finish the SMP boot. */ -void __cpuinit smp_cpus_done(unsigned int max_cpus) +void __init smp_cpus_done(unsigned int max_cpus) { +#ifndef CONFIG_HOTPLUG_CPU zap_low_mappings(); +#endif smp_cleanup_boot(); #ifdef CONFIG_X86_IO_APIC setup_ioapic_dest(); #endif - detect_siblings(); time_init_gtod(); check_nmi_watchdog(); } + +#ifdef CONFIG_HOTPLUG_CPU + +static void remove_siblinginfo(int cpu) +{ + int sibling; + + for_each_cpu_mask(sibling, cpu_sibling_map[cpu]) + cpu_clear(cpu, cpu_sibling_map[sibling]); + for_each_cpu_mask(sibling, cpu_core_map[cpu]) + cpu_clear(cpu, cpu_core_map[sibling]); + cpus_clear(cpu_sibling_map[cpu]); + cpus_clear(cpu_core_map[cpu]); + phys_proc_id[cpu] = BAD_APICID; + cpu_core_id[cpu] = BAD_APICID; +} + +void remove_cpu_from_maps(void) +{ + int cpu = smp_processor_id(); + + cpu_clear(cpu, cpu_callout_map); + cpu_clear(cpu, cpu_callin_map); + clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */ +}
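Because prefill_possible_map() above marks every CPU possible up front, subsystems can size their per-CPU state once at boot and then track actual arrivals and departures through the pre-existing CPU notifier chain. Purely as an illustration of how a client would react to the CPU_ONLINE/CPU_DEAD transitions this file now generates (hypothetical callback, registered elsewhere with register_cpu_notifier()):

static int my_cpu_callback(struct notifier_block *nb,
			   unsigned long action, void *hcpu)
{
	int cpu = (long)hcpu;

	switch (action) {
	case CPU_ONLINE:
		/* cpu is in cpu_online_map now; start using it */
		break;
	case CPU_DEAD:
		/* __cpu_die() below has confirmed the offline; clean up */
		break;
	}
	return NOTIFY_OK;
}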
+ +int __cpu_disable(void) +{ + int cpu = smp_processor_id(); + + /* + * Perhaps use cpufreq to drop frequency, but that could go + * into generic code. + * + * We won't take down the boot processor on i386 due to some + * interrupts only being able to be serviced by the BSP. + * Especially so if we're not using an IOAPIC -zwane + */ + if (cpu == 0) + return -EBUSY; + + disable_APIC_timer(); + + /* + * HACK: + * Allow any queued timer interrupts to get serviced + * This is only a temporary solution until we clean up + * fixup_irqs as we do for IA64. + */ + local_irq_enable(); + mdelay(1); + + local_irq_disable(); + remove_siblinginfo(cpu); + + /* It's now safe to remove this processor from the online map */ + cpu_clear(cpu, cpu_online_map); + remove_cpu_from_maps(); + fixup_irqs(cpu_online_map); + return 0; +} + +void __cpu_die(unsigned int cpu) +{ + /* We don't do anything here: idle task is faking death itself. */ + unsigned int i; + + for (i = 0; i < 10; i++) { + /* They ack this in play_dead by setting CPU_DEAD */ + if (per_cpu(cpu_state, cpu) == CPU_DEAD) { + printk ("CPU %d is now offline\n", cpu); + return; + } + msleep(100); + } + printk(KERN_ERR "CPU %u didn't die...\n", cpu); +} + +#else /* ... !CONFIG_HOTPLUG_CPU */ + +int __cpu_disable(void) +{ + return -ENOSYS; +} + +void __cpu_die(unsigned int cpu) +{ + /* We said "no" in __cpu_disable */ + BUG(); +} +#endif /* CONFIG_HOTPLUG_CPU */
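__cpu_die() above is only the surviving half of the shutdown handshake: it polls cpu_state until the dying CPU acknowledges. The dying half lives in the idle loop (play_dead() in process.c, which is not part of this hunk); schematically it has to do something like the following sketch:

/* Sketch only -- the real play_dead() is not shown in this diff. */
static inline void play_dead(void)
{
	idle_task_exit();
	wbinvd();			/* flush caches before vanishing */
	mb();
	/* The ack that __cpu_die() is polling for: */
	__get_cpu_var(cpu_state) = CPU_DEAD;

	local_irq_disable();
	for (;;)
		asm volatile("hlt");
}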
diff --git a/arch/x86_64/kernel/suspend.c b/arch/x86_64/kernel/suspend.c index ebaa1e37d65..0612640d91b 100644 --- a/arch/x86_64/kernel/suspend.c +++ b/arch/x86_64/kernel/suspend.c @@ -44,7 +44,6 @@ void __save_processor_state(struct saved_context *ctxt) */ asm volatile ("sgdt %0" : "=m" (ctxt->gdt_limit)); asm volatile ("sidt %0" : "=m" (ctxt->idt_limit)); - asm volatile ("sldt %0" : "=m" (ctxt->ldt)); asm volatile ("str %0" : "=m" (ctxt->tr)); /* XMM0..XMM15 should be handled by kernel_fpu_begin(). */ @@ -69,6 +68,7 @@ asm volatile ("movq %%cr2, %0" : "=r" (ctxt->cr2)); asm volatile ("movq %%cr3, %0" : "=r" (ctxt->cr3)); asm volatile ("movq %%cr4, %0" : "=r" (ctxt->cr4)); + asm volatile ("movq %%cr8, %0" : "=r" (ctxt->cr8)); } void save_processor_state(void) @@ -90,12 +90,20 @@ void __restore_processor_state(struct saved_context *ctxt) /* * control registers */ + asm volatile ("movq %0, %%cr8" :: "r" (ctxt->cr8)); asm volatile ("movq %0, %%cr4" :: "r" (ctxt->cr4)); asm volatile ("movq %0, %%cr3" :: "r" (ctxt->cr3)); asm volatile ("movq %0, %%cr2" :: "r" (ctxt->cr2)); asm volatile ("movq %0, %%cr0" :: "r" (ctxt->cr0)); /* + * now restore the descriptor tables to their proper values; + * ltr is done in fix_processor_context(). + */ + asm volatile ("lgdt %0" :: "m" (ctxt->gdt_limit)); + asm volatile ("lidt %0" :: "m" (ctxt->idt_limit)); + + /* * segment registers */ asm volatile ("movw %0, %%ds" :: "r" (ctxt->ds)); @@ -108,17 +116,10 @@ wrmsrl(MSR_GS_BASE, ctxt->gs_base); wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); - /* - * now restore the descriptor tables to their proper values - * ltr is done i fix_processor_context(). - */ - asm volatile ("lgdt %0" :: "m" (ctxt->gdt_limit)); - asm volatile ("lidt %0" :: "m" (ctxt->idt_limit)); - asm volatile ("lldt %0" :: "m" (ctxt->ldt)); - fix_processor_context(); do_fpu_end(); + mtrr_ap_init(); } void restore_processor_state(void) diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index fb8c809b4cd..66bf6ddeb0c 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -64,6 +64,7 @@ static int notsc __initdata = 0; unsigned int cpu_khz; /* TSC clocks / usec, not used here */ static unsigned long hpet_period; /* fsecs / HPET clock */ unsigned long hpet_tick; /* HPET clocks / interrupt */ +static int hpet_use_timer; unsigned long vxtime_hz = PIT_TICK_RATE; int report_lost_ticks; /* command line option */ unsigned long long monotonic_base; @@ -105,7 +106,9 @@ static inline unsigned int do_gettimeoffset_tsc(void) static inline unsigned int do_gettimeoffset_hpet(void) { - return ((hpet_readl(HPET_COUNTER) - vxtime.last) * vxtime.quot) >> 32; + /* cap counter read to one tick to avoid inconsistencies */ + unsigned long counter = hpet_readl(HPET_COUNTER) - vxtime.last; + return (min(counter,hpet_tick) * vxtime.quot) >> 32; } unsigned int (*do_gettimeoffset)(void) = do_gettimeoffset_tsc; @@ -301,7 +304,7 @@ unsigned long long monotonic_clock(void) last_offset = vxtime.last; base = monotonic_base; - this_offset = hpet_readl(HPET_T0_CMP) - hpet_tick; + this_offset = hpet_readl(HPET_COUNTER); } while (read_seqretry(&xtime_lock, seq)); offset = (this_offset - last_offset); @@ -377,7 +380,14 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) write_seqlock(&xtime_lock); - if (vxtime.hpet_address) { + if (vxtime.hpet_address) + offset = hpet_readl(HPET_COUNTER); + + if (hpet_use_timer) { + /* if we're using the hpet timer functionality, + * we can more accurately know the counter value + * when the timer interrupt occurred. + */ offset = hpet_readl(HPET_T0_CMP) - hpet_tick; delay = hpet_readl(HPET_COUNTER) - offset; } else { @@ -803,17 +813,18 @@ static int hpet_timer_stop_set_go(unsigned long tick) * Set up timer 0, as periodic with first interrupt to happen at hpet_tick, * and period also hpet_tick. */ - - hpet_writel(HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL | + if (hpet_use_timer) { + hpet_writel(HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL | HPET_TN_32BIT, HPET_T0_CFG); - hpet_writel(hpet_tick, HPET_T0_CMP); - hpet_writel(hpet_tick, HPET_T0_CMP); /* AK: why twice? */ - + hpet_writel(hpet_tick, HPET_T0_CMP); + hpet_writel(hpet_tick, HPET_T0_CMP); /* AK: why twice? */ + cfg |= HPET_CFG_LEGACY; + } /* * Go!
*/ - cfg |= HPET_CFG_ENABLE | HPET_CFG_LEGACY; + cfg |= HPET_CFG_ENABLE; hpet_writel(cfg, HPET_CFG); return 0; @@ -834,8 +845,7 @@ static int hpet_init(void) id = hpet_readl(HPET_ID); - if (!(id & HPET_ID_VENDOR) || !(id & HPET_ID_NUMBER) || - !(id & HPET_ID_LEGSUP)) + if (!(id & HPET_ID_VENDOR) || !(id & HPET_ID_NUMBER)) return -1; hpet_period = hpet_readl(HPET_PERIOD); @@ -845,6 +855,8 @@ static int hpet_init(void) hpet_tick = (1000000000L * (USEC_PER_SEC / HZ) + hpet_period / 2) / hpet_period; + hpet_use_timer = (id & HPET_ID_LEGSUP); + return hpet_timer_stop_set_go(hpet_tick); } @@ -901,9 +913,11 @@ void __init time_init(void) set_normalized_timespec(&wall_to_monotonic, -xtime.tv_sec, -xtime.tv_nsec); - if (!hpet_init()) { + if (!hpet_init()) vxtime_hz = (1000000000000000L + hpet_period / 2) / hpet_period; + + if (hpet_use_timer) { cpu_khz = hpet_calibrate_tsc(); timename = "HPET"; #ifdef CONFIG_X86_PM_TIMER @@ -968,7 +982,7 @@ void __init time_init_gtod(void) if (unsynchronized_tsc()) notsc = 1; if (vxtime.hpet_address && notsc) { - timetype = "HPET"; + timetype = hpet_use_timer ? "HPET" : "PIT/HPET"; vxtime.last = hpet_readl(HPET_T0_CMP) - hpet_tick; vxtime.mode = VXTIME_HPET; do_gettimeoffset = do_gettimeoffset_hpet; @@ -983,7 +997,7 @@ void __init time_init_gtod(void) printk(KERN_INFO "Disabling vsyscall due to use of PM timer\n"); #endif } else { - timetype = vxtime.hpet_address ? "HPET/TSC" : "PIT/TSC"; + timetype = hpet_use_timer ? "HPET/TSC" : "PIT/TSC"; vxtime.mode = VXTIME_TSC; } diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index 3dfec8fdabc..6ead433a388 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -274,7 +274,7 @@ EXPORT_SYMBOL(dump_stack); void show_registers(struct pt_regs *regs) { int i; - int in_kernel = (regs->cs & 3) == 0; + int in_kernel = !user_mode(regs); unsigned long rsp; const int cpu = safe_smp_processor_id(); struct task_struct *cur = cpu_pda[cpu].pcurrent; @@ -318,7 +318,7 @@ void handle_BUG(struct pt_regs *regs) struct bug_frame f; char tmp; - if (regs->cs & 3) + if (user_mode(regs)) return; if (__copy_from_user(&f, (struct bug_frame *) regs->rip, sizeof(struct bug_frame))) @@ -437,7 +437,7 @@ static void do_trap(int trapnr, int signr, char *str, } #endif - if ((regs->cs & 3) != 0) { + if (user_mode(regs)) { struct task_struct *tsk = current; if (exception_trace && unhandled_signal(tsk, signr)) @@ -522,7 +522,7 @@ asmlinkage void do_general_protection(struct pt_regs * regs, long error_code) } #endif - if ((regs->cs & 3)!=0) { + if (user_mode(regs)) { struct task_struct *tsk = current; if (exception_trace && unhandled_signal(tsk, SIGSEGV)) @@ -586,9 +586,12 @@ static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs) asmlinkage void default_do_nmi(struct pt_regs *regs) { unsigned char reason = 0; + int cpu; + + cpu = smp_processor_id(); /* Only the BSP gets external NMIs from the system. */ - if (!smp_processor_id()) + if (!cpu) reason = get_nmi_reason(); if (!(reason & 0xc0)) { @@ -638,7 +641,7 @@ asmlinkage struct pt_regs *sync_regs(struct pt_regs *eregs) if (eregs == (struct pt_regs *)eregs->rsp) ; /* Exception from user space */ - else if (eregs->cs & 3) + else if (user_mode(eregs)) regs = ((struct pt_regs *)current->thread.rsp0) - 1; /* Exception from kernel and interrupts are enabled. Move to kernel process stack. 
*/ @@ -669,7 +672,7 @@ asmlinkage void do_debug(struct pt_regs * regs, unsigned long error_code) } #endif - asm("movq %%db6,%0" : "=r" (condition)); + get_debugreg(condition, 6); if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, SIGTRAP) == NOTIFY_STOP) @@ -697,7 +700,7 @@ asmlinkage void do_debug(struct pt_regs * regs, unsigned long error_code) * allowing programs to debug themselves without the ptrace() * interface. */ - if ((regs->cs & 3) == 0) + if (!user_mode(regs)) goto clear_TF_reenable; /* * Was the TF flag set by a debugger? If so, clear it now, @@ -715,13 +718,13 @@ asmlinkage void do_debug(struct pt_regs * regs, unsigned long error_code) info.si_signo = SIGTRAP; info.si_errno = 0; info.si_code = TRAP_BRKPT; - if ((regs->cs & 3) == 0) + if (!user_mode(regs)) goto clear_dr7; info.si_addr = (void __user *)regs->rip; force_sig_info(SIGTRAP, &info, tsk); clear_dr7: - asm volatile("movq %0,%%db7"::"r"(0UL)); + set_debugreg(0UL, 7); return; clear_TF_reenable: @@ -756,7 +759,7 @@ asmlinkage void do_coprocessor_error(struct pt_regs *regs) unsigned short cwd, swd; conditional_sti(regs); - if ((regs->cs & 3) == 0 && + if (!user_mode(regs) && kernel_math_error(regs, "kernel x87 math error")) return; @@ -822,7 +825,7 @@ asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs) unsigned short mxcsr; conditional_sti(regs); - if ((regs->cs & 3) == 0 && + if (!user_mode(regs) && kernel_math_error(regs, "kernel simd math error")) return; diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S index 59ebd5beda8..2a94f9b60b2 100644 --- a/arch/x86_64/kernel/vmlinux.lds.S +++ b/arch/x86_64/kernel/vmlinux.lds.S @@ -2,7 +2,10 @@ * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>; */ +#define LOAD_OFFSET __START_KERNEL_map + #include <asm-generic/vmlinux.lds.h> +#include <asm/page.h> #include <linux/config.h> OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64") @@ -11,28 +14,30 @@ ENTRY(phys_startup_64) jiffies_64 = jiffies; SECTIONS { - . = 0xffffffff80100000; + . = __START_KERNEL; phys_startup_64 = startup_64 - LOAD_OFFSET; _text = .; /* Text and read-only data */ - .text : { + .text : AT(ADDR(.text) - LOAD_OFFSET) { *(.text) SCHED_TEXT LOCK_TEXT *(.fixup) *(.gnu.warning) } = 0x9090 - .text.lock : { *(.text.lock) } /* out-of-line lock text */ + /* out-of-line lock text */ + .text.lock : AT(ADDR(.text.lock) - LOAD_OFFSET) { *(.text.lock) } _etext = .; /* End of text section */ . = ALIGN(16); /* Exception table */ __start___ex_table = .; - __ex_table : { *(__ex_table) } + __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { *(__ex_table) } __stop___ex_table = .; RODATA - .data : { /* Data */ + /* Data */ + .data : AT(ADDR(.data) - LOAD_OFFSET) { *(.data) CONSTRUCTORS } @@ -40,62 +45,99 @@ SECTIONS _edata = .; /* End of data section */ __bss_start = .; /* BSS */ - .bss : { + .bss : AT(ADDR(.bss) - LOAD_OFFSET) { *(.bss.page_aligned) *(.bss) } __bss_end = .; + . = ALIGN(PAGE_SIZE); + . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); + .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) { + *(.data.cacheline_aligned) + } . 
= ALIGN(CONFIG_X86_L1_CACHE_BYTES); - .data.cacheline_aligned : { *(.data.cacheline_aligned) } + .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { + *(.data.read_mostly) + } + +#define VSYSCALL_ADDR (-10*1024*1024) +#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095)) +#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095)) + +#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR) +#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET) -#define AFTER(x) BINALIGN(LOADADDR(x) + SIZEOF(x), 16) -#define BINALIGN(x,y) (((x) + (y) - 1) & ~((y) - 1)) -#define CACHE_ALIGN(x) BINALIGN(x, CONFIG_X86_L1_CACHE_BYTES) +#define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR) +#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET) + + . = VSYSCALL_ADDR; + .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { *(.vsyscall_0) } + __vsyscall_0 = VSYSCALL_VIRT_ADDR; - .vsyscall_0 -10*1024*1024: AT ((LOADADDR(.data.cacheline_aligned) + SIZEOF(.data.cacheline_aligned) + 4095) & ~(4095)) { *(.vsyscall_0) } - __vsyscall_0 = LOADADDR(.vsyscall_0); . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); - .xtime_lock : AT CACHE_ALIGN(AFTER(.vsyscall_0)) { *(.xtime_lock) } - xtime_lock = LOADADDR(.xtime_lock); - .vxtime : AT AFTER(.xtime_lock) { *(.vxtime) } - vxtime = LOADADDR(.vxtime); - .wall_jiffies : AT AFTER(.vxtime) { *(.wall_jiffies) } - wall_jiffies = LOADADDR(.wall_jiffies); - .sys_tz : AT AFTER(.wall_jiffies) { *(.sys_tz) } - sys_tz = LOADADDR(.sys_tz); - .sysctl_vsyscall : AT AFTER(.sys_tz) { *(.sysctl_vsyscall) } - sysctl_vsyscall = LOADADDR(.sysctl_vsyscall); - .xtime : AT AFTER(.sysctl_vsyscall) { *(.xtime) } - xtime = LOADADDR(.xtime); + .xtime_lock : AT(VLOAD(.xtime_lock)) { *(.xtime_lock) } + xtime_lock = VVIRT(.xtime_lock); + + .vxtime : AT(VLOAD(.vxtime)) { *(.vxtime) } + vxtime = VVIRT(.vxtime); + + .wall_jiffies : AT(VLOAD(.wall_jiffies)) { *(.wall_jiffies) } + wall_jiffies = VVIRT(.wall_jiffies); + + .sys_tz : AT(VLOAD(.sys_tz)) { *(.sys_tz) } + sys_tz = VVIRT(.sys_tz); + + .sysctl_vsyscall : AT(VLOAD(.sysctl_vsyscall)) { *(.sysctl_vsyscall) } + sysctl_vsyscall = VVIRT(.sysctl_vsyscall); + + .xtime : AT(VLOAD(.xtime)) { *(.xtime) } + xtime = VVIRT(.xtime); + . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); - .jiffies : AT CACHE_ALIGN(AFTER(.xtime)) { *(.jiffies) } - jiffies = LOADADDR(.jiffies); - .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT (LOADADDR(.vsyscall_0) + 1024) { *(.vsyscall_1) } - . = LOADADDR(.vsyscall_0) + 4096; + .jiffies : AT(VLOAD(.jiffies)) { *(.jiffies) } + jiffies = VVIRT(.jiffies); + + .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) { *(.vsyscall_1) } + .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) { *(.vsyscall_2) } + .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) { *(.vsyscall_3) } + + . = VSYSCALL_VIRT_ADDR + 4096; + +#undef VSYSCALL_ADDR +#undef VSYSCALL_PHYS_ADDR +#undef VSYSCALL_VIRT_ADDR +#undef VLOAD_OFFSET +#undef VLOAD +#undef VVIRT_OFFSET +#undef VVIRT . = ALIGN(8192); /* init_task */ - .data.init_task : { *(.data.init_task) } + .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { + *(.data.init_task) + } . = ALIGN(4096); - .data.page_aligned : { *(.data.page_aligned) } + .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { + *(.data.page_aligned) + } . 
= ALIGN(4096); /* Init code and data */ __init_begin = .; - .init.text : { + .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { _sinittext = .; *(.init.text) _einittext = .; } __initdata_begin = .; - .init.data : { *(.init.data) } + .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { *(.init.data) } __initdata_end = .; . = ALIGN(16); __setup_start = .; - .init.setup : { *(.init.setup) } + .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { *(.init.setup) } __setup_end = .; __initcall_start = .; - .initcall.init : { + .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { *(.initcall1.init) *(.initcall2.init) *(.initcall3.init) @@ -106,32 +148,38 @@ SECTIONS } __initcall_end = .; __con_initcall_start = .; - .con_initcall.init : { *(.con_initcall.init) } + .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { + *(.con_initcall.init) + } __con_initcall_end = .; SECURITY_INIT . = ALIGN(8); __alt_instructions = .; - .altinstructions : { *(.altinstructions) } + .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { + *(.altinstructions) + } __alt_instructions_end = .; - .altinstr_replacement : { *(.altinstr_replacement) } + .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { + *(.altinstr_replacement) + } /* .exit.text is discard at runtime, not link time, to deal with references from .altinstructions and .eh_frame */ - .exit.text : { *(.exit.text) } - .exit.data : { *(.exit.data) } + .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) } + .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { *(.exit.data) } . = ALIGN(4096); __initramfs_start = .; - .init.ramfs : { *(.init.ramfs) } + .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { *(.init.ramfs) } __initramfs_end = .; . = ALIGN(32); __per_cpu_start = .; - .data.percpu : { *(.data.percpu) } + .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { *(.data.percpu) } __per_cpu_end = .; . = ALIGN(4096); __init_end = .; . = ALIGN(4096); __nosave_begin = .; - .data_nosave : { *(.data.nosave) } + .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { *(.data.nosave) } . 
= ALIGN(4096); __nosave_end = .; diff --git a/arch/x86_64/lib/csum-copy.S b/arch/x86_64/lib/csum-copy.S index 01808ec3783..72fd55ee896 100644 --- a/arch/x86_64/lib/csum-copy.S +++ b/arch/x86_64/lib/csum-copy.S @@ -188,8 +188,8 @@ csum_partial_copy_generic: source movw (%rdi),%bx adcl %ebx,%eax - dest decl %ecx + dest movw %bx,(%rsi) leaq 2(%rdi),%rdi leaq 2(%rsi),%rsi diff --git a/arch/x86_64/lib/delay.c b/arch/x86_64/lib/delay.c index aed61a668a1..841bd738a18 100644 --- a/arch/x86_64/lib/delay.c +++ b/arch/x86_64/lib/delay.c @@ -12,12 +12,17 @@ #include <linux/sched.h> #include <linux/delay.h> #include <asm/delay.h> +#include <asm/msr.h> #ifdef CONFIG_SMP #include <asm/smp.h> #endif -int x86_udelay_tsc = 0; /* Delay via TSC */ +int read_current_timer(unsigned long *timer_value) +{ + rdtscll(*timer_value); + return 0; +} void __delay(unsigned long loops) { diff --git a/arch/x86_64/mm/Makefile b/arch/x86_64/mm/Makefile index 66c354ad80c..1d232a87f11 100644 --- a/arch/x86_64/mm/Makefile +++ b/arch/x86_64/mm/Makefile @@ -4,7 +4,7 @@ obj-y := init.o fault.o ioremap.o extable.o pageattr.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o -obj-$(CONFIG_DISCONTIGMEM) += numa.o +obj-$(CONFIG_NUMA) += numa.o obj-$(CONFIG_K8_NUMA) += k8topology.o obj-$(CONFIG_ACPI_NUMA) += srat.o diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c index 57d3ab15a5c..493819e543a 100644 --- a/arch/x86_64/mm/fault.c +++ b/arch/x86_64/mm/fault.c @@ -23,7 +23,6 @@ #include <linux/vt_kern.h> /* For unblank_screen() */ #include <linux/compiler.h> #include <linux/module.h> -#include <linux/kprobes.h> #include <asm/system.h> #include <asm/uaccess.h> @@ -74,7 +73,7 @@ static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr, instr = (unsigned char *)convert_rip_to_linear(current, regs); max_instr = instr + 15; - if ((regs->cs & 3) != 0 && instr >= (unsigned char *)TASK_SIZE) + if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE) return 0; while (scan_more && instr < max_instr) { @@ -106,7 +105,7 @@ static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr, /* Could check the LDT for lm, but for now it's good enough to assume that long mode only uses well known segments or kernel. */ - scan_more = ((regs->cs & 3) == 0) || (regs->cs == __USER_CS); + scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS); break; case 0x60: @@ -440,13 +439,13 @@ good_area: * the fault. 
*/ switch (handle_mm_fault(mm, vma, address, write)) { - case 1: + case VM_FAULT_MINOR: tsk->min_flt++; break; - case 2: + case VM_FAULT_MAJOR: tsk->maj_flt++; break; - case 0: + case VM_FAULT_SIGBUS: goto do_sigbus; default: goto out_of_memory; diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c index dbe53b4c7e6..72e4b364ed7 100644 --- a/arch/x86_64/mm/init.c +++ b/arch/x86_64/mm/init.c @@ -318,7 +318,7 @@ void zap_low_mappings(void) flush_tlb_all(); } -#ifndef CONFIG_DISCONTIGMEM +#ifndef CONFIG_NUMA void __init paging_init(void) { { @@ -427,13 +427,16 @@ void __init mem_init(void) reservedpages = 0; /* this will put all low memory onto the freelists */ -#ifdef CONFIG_DISCONTIGMEM +#ifdef CONFIG_NUMA totalram_pages += numa_free_all_bootmem(); tmp = 0; /* should count reserved pages here for all nodes */ #else + +#ifdef CONFIG_FLATMEM max_mapnr = end_pfn; if (!mem_map) BUG(); +#endif totalram_pages += free_all_bootmem(); @@ -515,7 +518,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) void __init reserve_bootmem_generic(unsigned long phys, unsigned len) { /* Should check here against the e820 map to avoid double free */ -#ifdef CONFIG_DISCONTIGMEM +#ifdef CONFIG_NUMA int nid = phys_to_nid(phys); reserve_bootmem_node(NODE_DATA(nid), phys, len); #else diff --git a/arch/x86_64/mm/ioremap.c b/arch/x86_64/mm/ioremap.c index 58aac23760e..6972df480d2 100644 --- a/arch/x86_64/mm/ioremap.c +++ b/arch/x86_64/mm/ioremap.c @@ -178,7 +178,7 @@ void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned l if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS) return (__force void __iomem *)phys_to_virt(phys_addr); -#ifndef CONFIG_DISCONTIGMEM +#ifdef CONFIG_FLATMEM /* * Don't allow anybody to remap normal RAM that we're using.. */ diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c index fd9f25d7a6c..70cb2904a90 100644 --- a/arch/x86_64/mm/numa.c +++ b/arch/x86_64/mm/numa.c @@ -36,36 +36,45 @@ int numa_off __initdata; int __init compute_hash_shift(struct node *nodes, int numnodes) { int i; - int shift = 24; - u64 addr; + int shift = 20; + unsigned long addr,maxend=0; - /* When in doubt use brute force. */ - while (shift < 48) { - memset(memnodemap,0xff,sizeof(*memnodemap) * NODEMAPSIZE); - for (i = 0; i < numnodes; i++) { - if (nodes[i].start == nodes[i].end) - continue; - for (addr = nodes[i].start; - addr < nodes[i].end; - addr += (1UL << shift)) { - if (memnodemap[addr >> shift] != 0xff && - memnodemap[addr >> shift] != i) { - printk(KERN_INFO - "node %d shift %d addr %Lx conflict %d\n", - i, shift, addr, memnodemap[addr>>shift]); - goto next; - } - memnodemap[addr >> shift] = i; + for (i = 0; i < numnodes; i++) + if ((nodes[i].start != nodes[i].end) && (nodes[i].end > maxend)) + maxend = nodes[i].end; + + while ((1UL << shift) < (maxend / NODEMAPSIZE)) + shift++; + + printk (KERN_DEBUG"Using %d for the hash shift. 
Max addr is %lx\n", + shift,maxend); + memset(memnodemap,0xff,sizeof(*memnodemap) * NODEMAPSIZE); + for (i = 0; i < numnodes; i++) { + if (nodes[i].start == nodes[i].end) + continue; + for (addr = nodes[i].start; + addr < nodes[i].end; + addr += (1UL << shift)) { + if (memnodemap[addr >> shift] != 0xff) { + printk(KERN_INFO + "Your memory is not aligned; you need to rebuild your kernel " + "with a bigger NODEMAPSIZE shift=%d addr=%lu\n", + shift,addr); + return -1; } + memnodemap[addr >> shift] = i; } - return shift; - next: - shift++; } - memset(memnodemap,0,sizeof(*memnodemap) * NODEMAPSIZE); - return -1; + return shift; } +#ifdef CONFIG_SPARSEMEM +int early_pfn_to_nid(unsigned long pfn) +{ + return phys_to_nid(pfn << PAGE_SHIFT); +} +#endif + /* Initialize bootmem allocator for a node */ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) { @@ -80,6 +89,7 @@ start_pfn = start >> PAGE_SHIFT; end_pfn = end >> PAGE_SHIFT; + memory_present(nodeid, start_pfn, end_pfn); nodedata_phys = find_e820_area(start, end, pgdat_size); if (nodedata_phys == -1L) panic("Cannot find memory pgdat in node %d\n", nodeid); @@ -243,7 +253,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT); } -__init void numa_add_cpu(int cpu) +__cpuinit void numa_add_cpu(int cpu) { /* BP is initialized elsewhere */ if (cpu)
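The rewritten compute_hash_shift() no longer brute-forces a conflict-free shift; it picks the smallest shift whose (1UL << shift) granules cover the largest node end with NODEMAPSIZE entries, and then demands that every node boundary be aligned to that granule (returning -1 otherwise). A worked example with assumed numbers (two 4GB nodes, NODEMAPSIZE taken as 256 purely for illustration):

unsigned long maxend = 0x200000000UL;	/* node 1 ends at 8GB */
int shift = 20;

while ((1UL << shift) < (maxend / 256))
	shift++;	/* 8GB / 256 entries = 32MB granules, so shift ends at 25 */

/* Every lookup is then a single array index:
 *	nid = memnodemap[addr >> shift];
 * and each node's start/end must be 32MB aligned, or the table
 * build bails out with -1 as shown above. */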
diff --git a/arch/x86_64/mm/srat.c b/arch/x86_64/mm/srat.c index 5d01b31472e..8e3d097a9dd 100644 --- a/arch/x86_64/mm/srat.c +++ b/arch/x86_64/mm/srat.c @@ -20,6 +20,9 @@ static struct acpi_table_slit *acpi_slit; +/* Internal processor count */ +static unsigned int __initdata num_processors = 0; + static nodemask_t nodes_parsed __initdata; static nodemask_t nodes_found __initdata; static struct node nodes[MAX_NUMNODES] __initdata; @@ -101,16 +104,18 @@ acpi_numa_processor_affinity_init(struct acpi_table_processor_affinity *pa) bad_srat(); return; } - if (pa->apic_id >= NR_CPUS) { - printk(KERN_ERR "SRAT: lapic %u too large.\n", - pa->apic_id); + if (num_processors >= NR_CPUS) { + printk(KERN_ERR "SRAT: Processor #%d (lapic %u) INVALID. (Max ID: %d).\n", + num_processors, pa->apic_id, NR_CPUS); bad_srat(); return; } - cpu_to_node[pa->apic_id] = node; + cpu_to_node[num_processors] = node; acpi_numa = 1; - printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n", - pxm, pa->apic_id, node); + printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> CPU %u -> Node %u\n", + pxm, pa->apic_id, num_processors, node); + + num_processors++; } /* Callback for parsing of the Proximity Domain <-> Memory Area mappings */ @@ -124,7 +129,6 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma) if (srat_disabled() || ma->flags.enabled == 0) return; - /* hotplug bit is ignored for now */ pxm = ma->proximity_domain; node = setup_node(pxm); if (node < 0) { @@ -134,6 +138,10 @@ } start = ma->base_addr_lo | ((u64)ma->base_addr_hi << 32); end = start + (ma->length_lo | ((u64)ma->length_hi << 32)); + /* It is fine to add this area to the node's data; it will be used later */ + if (ma->flags.hot_pluggable == 1) + printk(KERN_INFO "SRAT: hot plug zone found %lx - %lx \n", + start, end); i = conflicting_nodes(start, end); if (i >= 0) { printk(KERN_ERR diff --git a/arch/x86_64/pci/k8-bus.c b/arch/x86_64/pci/k8-bus.c index 62349c78db5..c2c38b57993 100644 --- a/arch/x86_64/pci/k8-bus.c +++ b/arch/x86_64/pci/k8-bus.c @@ -29,7 +29,7 @@ __init static int fill_mp_bus_to_cpumask(void) { struct pci_dev *nb_dev = NULL; - int i, j, printed; + int i, j; u32 ldtbus, nid; static int lbnr[3] = { LDT_BUS_NUMBER_REGISTER_0, @@ -53,25 +53,11 @@ fill_mp_bus_to_cpumask(void) for (j = SECONDARY_LDT_BUS_NUMBER(ldtbus); j <= SUBORDINATE_LDT_BUS_NUMBER(ldtbus); j++) - pci_bus_to_cpumask[j] = - node_to_cpumask(NODE_ID(nid)); + pci_bus_to_node[j] = NODE_ID(nid); } } } - /* quick sanity check */ - printed = 0; - for (i = 0; i < 256; i++) { - if (cpus_empty(pci_bus_to_cpumask[i])) { - pci_bus_to_cpumask[i] = CPU_MASK_ALL; - if (printed) - continue; - printk(KERN_ERR - "k8-bus.c: some busses have empty cpu mask\n"); - printed = 1; - } - } - return 0; }
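pci_dev_base() in the rewritten mmconfig.c below combines a per-segment virtual aperture with the standard ECAM layout: one 4KB page of configuration space per function, buses 8 bits apart. The offset arithmetic alone, written out as a hypothetical helper:

/* Offset of (bus, devfn, reg) inside one MMCONFIG aperture:
 * bus << 20 | devfn << 12 | reg, with reg < 4096. */
static unsigned long ecam_offset(unsigned int bus, unsigned int devfn, int reg)
{
	return ((unsigned long)bus << 20) | (devfn << 12) | reg;
}

/* Example: bus 3, device 2, function 1, register 0x10 (BAR0):
 * PCI_DEVFN(2, 1) = 0x11, so the offset is
 * 0x300000 | 0x11000 | 0x10 = 0x311010. */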
diff --git a/arch/x86_64/pci/mmconfig.c b/arch/x86_64/pci/mmconfig.c index b693c232fd0..657e88aa090 100644 --- a/arch/x86_64/pci/mmconfig.c +++ b/arch/x86_64/pci/mmconfig.c @@ -7,25 +7,50 @@ #include <linux/pci.h> #include <linux/init.h> +#include <linux/acpi.h> #include "pci.h" #define MMCONFIG_APER_SIZE (256*1024*1024) -/* The physical address of the MMCONFIG aperture. Set from ACPI tables. */ -u32 pci_mmcfg_base_addr; - /* Static virtual mapping of the MMCONFIG aperture */ -char *pci_mmcfg_virt; +struct mmcfg_virt { + struct acpi_table_mcfg_config *cfg; + char *virt; +}; +static struct mmcfg_virt *pci_mmcfg_virt; -static inline char *pci_dev_base(unsigned int bus, unsigned int devfn) +static char *get_virt(unsigned int seg, int bus) { - return pci_mmcfg_virt + ((bus << 20) | (devfn << 12)); + int cfg_num = -1; + struct acpi_table_mcfg_config *cfg; + + while (1) { + ++cfg_num; + if (cfg_num >= pci_mmcfg_config_num) { + /* something bad is going on, no cfg table is found. */ + /* so we fall back to the old way we used to do this */ + /* and just rely on the first entry to be correct. */ + return pci_mmcfg_virt[0].virt; + } + cfg = pci_mmcfg_virt[cfg_num].cfg; + if (cfg->pci_segment_group_number != seg) + continue; + if ((cfg->start_bus_number <= bus) && + (cfg->end_bus_number >= bus)) + return pci_mmcfg_virt[cfg_num].virt; + } +} + +static inline char *pci_dev_base(unsigned int seg, unsigned int bus, unsigned int devfn) +{ + + return get_virt(seg, bus) + ((bus << 20) | (devfn << 12)); } static int pci_mmcfg_read(unsigned int seg, unsigned int bus, unsigned int devfn, int reg, int len, u32 *value) { - char *addr = pci_dev_base(bus, devfn); + char *addr = pci_dev_base(seg, bus, devfn); if (unlikely(!value || (bus > 255) || (devfn > 255) || (reg > 4095))) return -EINVAL; @@ -48,7 +73,7 @@ static int pci_mmcfg_read(unsigned int seg, unsigned int bus, static int pci_mmcfg_write(unsigned int seg, unsigned int bus, unsigned int devfn, int reg, int len, u32 value) { - char *addr = pci_dev_base(bus,devfn); + char *addr = pci_dev_base(seg, bus, devfn); if (unlikely((bus > 255) || (devfn > 255) || (reg > 4095))) return -EINVAL; @@ -75,9 +100,15 @@ static struct pci_raw_ops pci_mmcfg = { static int __init pci_mmcfg_init(void) { + int i; + if ((pci_probe & PCI_PROBE_MMCONF) == 0) return 0; - if (!pci_mmcfg_base_addr) + + acpi_table_parse(ACPI_MCFG, acpi_parse_mcfg); + if ((pci_mmcfg_config_num == 0) || + (pci_mmcfg_config == NULL) || + (pci_mmcfg_config[0].base_address == 0)) return 0; /* Kludge for now. Don't use mmconfig on AMD systems because @@ -88,13 +119,22 @@ static int __init pci_mmcfg_init(void) return 0; /* RED-PEN i386 doesn't do _nocache right now */ - pci_mmcfg_virt = ioremap_nocache(pci_mmcfg_base_addr, MMCONFIG_APER_SIZE); - if (!pci_mmcfg_virt) { - printk("PCI: Cannot map mmconfig aperture\n"); + pci_mmcfg_virt = kmalloc(sizeof(*pci_mmcfg_virt) * pci_mmcfg_config_num, GFP_KERNEL); + if (pci_mmcfg_virt == NULL) { + printk("PCI: Cannot allocate memory for mmconfig structures\n"); return 0; - } + } + for (i = 0; i < pci_mmcfg_config_num; ++i) { + pci_mmcfg_virt[i].cfg = &pci_mmcfg_config[i]; + pci_mmcfg_virt[i].virt = ioremap_nocache(pci_mmcfg_config[i].base_address, MMCONFIG_APER_SIZE); + if (!pci_mmcfg_virt[i].virt) { + printk("PCI: Cannot map mmconfig aperture for segment %d\n", + pci_mmcfg_config[i].pci_segment_group_number); + return 0; + } + printk(KERN_INFO "PCI: Using MMCONFIG at %x\n", pci_mmcfg_config[i].base_address); + } - printk(KERN_INFO "PCI: Using MMCONFIG at %x\n", pci_mmcfg_base_addr); raw_pci_ops = &pci_mmcfg; pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF;
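Once pci_mmcfg_init() has registered pci_mmcfg as raw_pci_ops, every config-space access resolves through get_virt() to the aperture of the matching segment. Illustratively (a sketch using the pci_raw_ops signature seen in this file, not code from the patch):

u32 id;

/* Read the vendor/device ID dword of device 0000:00:00.0 through
 * whichever backend ended up in raw_pci_ops (mmconfig here). */
if (raw_pci_ops->read(0 /* seg */, 0 /* bus */, PCI_DEVFN(0, 0),
		      PCI_VENDOR_ID, 4, &id) == 0)
	printk(KERN_DEBUG "host bridge ID: %08x\n", id);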