Diffstat (limited to 'arch/x86_64'): 73 files changed, 1896 insertions, 2297 deletions
diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig index 5ce94430c01..45f82ae6d38 100644 --- a/arch/x86_64/Kconfig +++ b/arch/x86_64/Kconfig @@ -32,6 +32,10 @@ config GENERIC_TIME_VSYSCALL bool default y +config GENERIC_CMOS_UPDATE + bool + default y + config ZONE_DMA32 bool default y @@ -56,6 +60,14 @@ config ZONE_DMA bool default y +config QUICKLIST + bool + default y + +config NR_QUICK + int + default 2 + config ISA bool @@ -427,6 +439,10 @@ config NR_CPUS This is purely to save memory - each supported CPU requires memory in the static kernel configuration. +config PHYSICAL_ALIGN + hex + default "0x200000" + config HOTPLUG_CPU bool "Support for suspend on SMP and hot-pluggable CPUs (EXPERIMENTAL)" depends on SMP && HOTPLUG && EXPERIMENTAL @@ -770,8 +786,8 @@ menu "Instrumentation Support" source "arch/x86_64/oprofile/Kconfig" config KPROBES - bool "Kprobes (EXPERIMENTAL)" - depends on KALLSYMS && EXPERIMENTAL && MODULES + bool "Kprobes" + depends on KALLSYMS && MODULES help Kprobes allows you to trap at almost any kernel address and execute a callback function. register_kprobe() establishes diff --git a/arch/x86_64/Makefile b/arch/x86_64/Makefile index 29617ae3926..128561d3e87 100644 --- a/arch/x86_64/Makefile +++ b/arch/x86_64/Makefile @@ -76,7 +76,8 @@ head-y := arch/x86_64/kernel/head.o arch/x86_64/kernel/head64.o arch/x86_64/kern libs-y += arch/x86_64/lib/ core-y += arch/x86_64/kernel/ \ arch/x86_64/mm/ \ - arch/x86_64/crypto/ + arch/x86_64/crypto/ \ + arch/x86_64/vdso/ core-$(CONFIG_IA32_EMULATION) += arch/x86_64/ia32/ drivers-$(CONFIG_PCI) += arch/x86_64/pci/ drivers-$(CONFIG_OPROFILE) += arch/x86_64/oprofile/ diff --git a/arch/x86_64/boot/.gitignore b/arch/x86_64/boot/.gitignore index 495f20c085d..18465143cfa 100644 --- a/arch/x86_64/boot/.gitignore +++ b/arch/x86_64/boot/.gitignore @@ -1,3 +1,5 @@ bootsect bzImage setup +setup.bin +setup.elf diff --git a/arch/x86_64/boot/Makefile b/arch/x86_64/boot/Makefile index ee6f6505f95..67096389de1 100644 --- a/arch/x86_64/boot/Makefile +++ b/arch/x86_64/boot/Makefile @@ -1,135 +1,9 @@ # # arch/x86_64/boot/Makefile # -# This file is subject to the terms and conditions of the GNU General Public -# License. See the file "COPYING" in the main directory of this archive -# for more details. -# -# Copyright (C) 1994 by Linus Torvalds -# - -# ROOT_DEV specifies the default root-device when making the image. -# This can be either FLOPPY, CURRENT, /dev/xxxx or empty, in which case -# the default of FLOPPY is used by 'build'. - -ROOT_DEV := CURRENT - -# If you want to preset the SVGA mode, uncomment the next line and -# set SVGA_MODE to whatever number you want. -# Set it to -DSVGA_MODE=NORMAL_VGA if you just want the EGA/VGA mode. -# The number is the same as you would ordinarily press at bootup. - -SVGA_MODE := -DSVGA_MODE=NORMAL_VGA - -# If you want the RAM disk device, define this to be the size in blocks. 
- -#RAMDISK := -DRAMDISK=512 - -targets := vmlinux.bin bootsect bootsect.o \ - setup setup.o bzImage mtools.conf - -EXTRA_CFLAGS := -m32 - -hostprogs-y := tools/build -HOST_EXTRACFLAGS += $(LINUXINCLUDE) -subdir- := compressed/ #Let make clean descend in compressed/ -# --------------------------------------------------------------------------- - -$(obj)/bzImage: IMAGE_OFFSET := 0x100000 -$(obj)/bzImage: EXTRA_AFLAGS := $(SVGA_MODE) $(RAMDISK) -D__BIG_KERNEL__ -$(obj)/bzImage: BUILDFLAGS := -b - -quiet_cmd_image = BUILD $@ -cmd_image = $(obj)/tools/build $(BUILDFLAGS) $(obj)/bootsect $(obj)/setup \ - $(obj)/vmlinux.bin $(ROOT_DEV) > $@ - -$(obj)/bzImage: $(obj)/bootsect $(obj)/setup \ - $(obj)/vmlinux.bin $(obj)/tools/build FORCE - $(call if_changed,image) - @echo 'Kernel: $@ is ready' ' (#'`cat .version`')' - -$(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE - $(call if_changed,objcopy) - -LDFLAGS_bootsect := -Ttext 0x0 -s --oformat binary -LDFLAGS_setup := -Ttext 0x0 -s --oformat binary -e begtext - -$(obj)/setup $(obj)/bootsect: %: %.o FORCE - $(call if_changed,ld) - -$(obj)/compressed/vmlinux: FORCE - $(Q)$(MAKE) $(build)=$(obj)/compressed IMAGE_OFFSET=$(IMAGE_OFFSET) $@ - -# Set this if you want to pass append arguments to the zdisk/fdimage/isoimage kernel -FDARGS = -# Set this if you want an initrd included with the zdisk/fdimage/isoimage kernel -FDINITRD = - -image_cmdline = default linux $(FDARGS) $(if $(FDINITRD),initrd=initrd.img,) - -$(obj)/mtools.conf: $(src)/mtools.conf.in - sed -e 's|@OBJ@|$(obj)|g' < $< > $@ - -# This requires write access to /dev/fd0 -zdisk: $(BOOTIMAGE) $(obj)/mtools.conf - MTOOLSRC=$(obj)/mtools.conf mformat a: ; sync - syslinux /dev/fd0 ; sync - echo '$(image_cmdline)' | \ - MTOOLSRC=$(obj)/mtools.conf mcopy - a:syslinux.cfg - if [ -f '$(FDINITRD)' ] ; then \ - MTOOLSRC=$(obj)/mtools.conf mcopy '$(FDINITRD)' a:initrd.img ; \ - fi - MTOOLSRC=$(obj)/mtools.conf mcopy $(BOOTIMAGE) a:linux ; sync - -# These require being root or having syslinux 2.02 or higher installed -fdimage fdimage144: $(BOOTIMAGE) $(obj)/mtools.conf - dd if=/dev/zero of=$(obj)/fdimage bs=1024 count=1440 - MTOOLSRC=$(obj)/mtools.conf mformat v: ; sync - syslinux $(obj)/fdimage ; sync - echo '$(image_cmdline)' | \ - MTOOLSRC=$(obj)/mtools.conf mcopy - v:syslinux.cfg - if [ -f '$(FDINITRD)' ] ; then \ - MTOOLSRC=$(obj)/mtools.conf mcopy '$(FDINITRD)' v:initrd.img ; \ - fi - MTOOLSRC=$(obj)/mtools.conf mcopy $(BOOTIMAGE) v:linux ; sync - -fdimage288: $(BOOTIMAGE) $(obj)/mtools.conf - dd if=/dev/zero of=$(obj)/fdimage bs=1024 count=2880 - MTOOLSRC=$(obj)/mtools.conf mformat w: ; sync - syslinux $(obj)/fdimage ; sync - echo '$(image_cmdline)' | \ - MTOOLSRC=$(obj)/mtools.conf mcopy - w:syslinux.cfg - if [ -f '$(FDINITRD)' ] ; then \ - MTOOLSRC=$(obj)/mtools.conf mcopy '$(FDINITRD)' w:initrd.img ; \ - fi - MTOOLSRC=$(obj)/mtools.conf mcopy $(BOOTIMAGE) w:linux ; sync - -isoimage: $(BOOTIMAGE) - -rm -rf $(obj)/isoimage - mkdir $(obj)/isoimage - for i in lib lib64 share end ; do \ - if [ -f /usr/$$i/syslinux/isolinux.bin ] ; then \ - cp /usr/$$i/syslinux/isolinux.bin $(obj)/isoimage ; \ - break ; \ - fi ; \ - if [ $$i = end ] ; then exit 1 ; fi ; \ - done - cp $(BOOTIMAGE) $(obj)/isoimage/linux - echo '$(image_cmdline)' > $(obj)/isoimage/isolinux.cfg - if [ -f '$(FDINITRD)' ] ; then \ - cp '$(FDINITRD)' $(obj)/isoimage/initrd.img ; \ - fi - mkisofs -J -r -o $(obj)/image.iso -b isolinux.bin -c boot.cat \ - -no-emul-boot -boot-load-size 4 -boot-info-table \ - $(obj)/isoimage - rm -rf 
$(obj)/isoimage - -zlilo: $(BOOTIMAGE) - if [ -f $(INSTALL_PATH)/vmlinuz ]; then mv $(INSTALL_PATH)/vmlinuz $(INSTALL_PATH)/vmlinuz.old; fi - if [ -f $(INSTALL_PATH)/System.map ]; then mv $(INSTALL_PATH)/System.map $(INSTALL_PATH)/System.old; fi - cat $(BOOTIMAGE) > $(INSTALL_PATH)/vmlinuz - cp System.map $(INSTALL_PATH)/ - if [ -x /sbin/lilo ]; then /sbin/lilo; else /etc/lilo/install; fi +# The actual boot code is shared with i386 including the Makefile. +# So tell kbuild that we fetch the code from i386 and include the +# Makefile from i386 too. -install: - sh $(srctree)/$(src)/install.sh $(KERNELRELEASE) $(BOOTIMAGE) System.map "$(INSTALL_PATH)" +src := arch/i386/boot +include $(src)/Makefile diff --git a/arch/x86_64/boot/bootsect.S b/arch/x86_64/boot/bootsect.S deleted file mode 100644 index 011b7a4993d..00000000000 --- a/arch/x86_64/boot/bootsect.S +++ /dev/null @@ -1,98 +0,0 @@ -/* - * bootsect.S Copyright (C) 1991, 1992 Linus Torvalds - * - * modified by Drew Eckhardt - * modified by Bruce Evans (bde) - * modified by Chris Noe (May 1999) (as86 -> gas) - * gutted by H. Peter Anvin (Jan 2003) - * - * BIG FAT NOTE: We're in real mode using 64k segments. Therefore segment - * addresses must be multiplied by 16 to obtain their respective linear - * addresses. To avoid confusion, linear addresses are written using leading - * hex while segment addresses are written as segment:offset. - * - */ - -#include <asm/boot.h> - -SETUPSECTS = 4 /* default nr of setup-sectors */ -BOOTSEG = 0x07C0 /* original address of boot-sector */ -INITSEG = DEF_INITSEG /* we move boot here - out of the way */ -SETUPSEG = DEF_SETUPSEG /* setup starts here */ -SYSSEG = DEF_SYSSEG /* system loaded at 0x10000 (65536) */ -SYSSIZE = DEF_SYSSIZE /* system size: # of 16-byte clicks */ - /* to be loaded */ -ROOT_DEV = 0 /* ROOT_DEV is now written by "build" */ -SWAP_DEV = 0 /* SWAP_DEV is now written by "build" */ - -#ifndef SVGA_MODE -#define SVGA_MODE ASK_VGA -#endif - -#ifndef RAMDISK -#define RAMDISK 0 -#endif - -#ifndef ROOT_RDONLY -#define ROOT_RDONLY 1 -#endif - -.code16 -.text - -.global _start -_start: - - # Normalize the start address - jmpl $BOOTSEG, $start2 - -start2: - movw %cs, %ax - movw %ax, %ds - movw %ax, %es - movw %ax, %ss - movw $0x7c00, %sp - sti - cld - - movw $bugger_off_msg, %si - -msg_loop: - lodsb - andb %al, %al - jz die - movb $0xe, %ah - movw $7, %bx - int $0x10 - jmp msg_loop - -die: - # Allow the user to press a key, then reboot - xorw %ax, %ax - int $0x16 - int $0x19 - - # int 0x19 should never return. In case it does anyway, - # invoke the BIOS reset code... - ljmp $0xf000,$0xfff0 - - -bugger_off_msg: - .ascii "Direct booting from floppy is no longer supported.\r\n" - .ascii "Please use a boot loader program instead.\r\n" - .ascii "\n" - .ascii "Remove disk and press any key to reboot . . .\r\n" - .byte 0 - - - # Kernel attributes; used by setup - - .org 497 -setup_sects: .byte SETUPSECTS -root_flags: .word ROOT_RDONLY -syssize: .word SYSSIZE -swap_dev: .word SWAP_DEV -ram_size: .word RAMDISK -vid_mode: .word SVGA_MODE -root_dev: .word ROOT_DEV -boot_flag: .word 0xAA55 diff --git a/arch/x86_64/boot/compressed/Makefile b/arch/x86_64/boot/compressed/Makefile index 705a3e33d7e..877c0bdbbc6 100644 --- a/arch/x86_64/boot/compressed/Makefile +++ b/arch/x86_64/boot/compressed/Makefile @@ -3,15 +3,14 @@ # # create a compressed vmlinux image from the original vmlinux # -# Note all the files here are compiled/linked as 32bit executables. 
-# targets := vmlinux vmlinux.bin vmlinux.bin.gz head.o misc.o piggy.o -EXTRA_AFLAGS := -traditional -# cannot use EXTRA_CFLAGS because base CFLAGS contains -mkernel which conflicts with -# -m32 -CFLAGS := -m64 -D__KERNEL__ -Iinclude -O2 -fno-strict-aliasing -fPIC -mcmodel=small -fno-builtin +CFLAGS := -m64 -D__KERNEL__ $(LINUXINCLUDE) -O2 \ + -fno-strict-aliasing -fPIC -mcmodel=small \ + $(call cc-option, -ffreestanding) \ + $(call cc-option, -fno-stack-protector) +AFLAGS := $(CFLAGS) -D__ASSEMBLY__ LDFLAGS := -m elf_x86_64 LDFLAGS_vmlinux := -T diff --git a/arch/x86_64/boot/compressed/head.S b/arch/x86_64/boot/compressed/head.S index f9d5692a010..1312bfaff30 100644 --- a/arch/x86_64/boot/compressed/head.S +++ b/arch/x86_64/boot/compressed/head.S @@ -46,10 +46,10 @@ startup_32: * at and where we were actually loaded at. This can only be done * with a short local call on x86. Nothing else will tell us what * address we are running at. The reserved chunk of the real-mode - * data at 0x34-0x3f are used as the stack for this calculation. - * Only 4 bytes are needed. + * data at 0x1e4 (defined as a scratch field) are used as the stack + * for this calculation. Only 4 bytes are needed. */ - leal 0x40(%esi), %esp + leal (0x1e4+4)(%esi), %esp call 1f 1: popl %ebp subl $1b, %ebp diff --git a/arch/x86_64/boot/install.sh b/arch/x86_64/boot/install.sh deleted file mode 100644 index baaa2369bdb..00000000000 --- a/arch/x86_64/boot/install.sh +++ /dev/null @@ -1,2 +0,0 @@ -#!/bin/sh -. $srctree/arch/i386/boot/install.sh diff --git a/arch/x86_64/boot/mtools.conf.in b/arch/x86_64/boot/mtools.conf.in deleted file mode 100644 index efd6d2490c1..00000000000 --- a/arch/x86_64/boot/mtools.conf.in +++ /dev/null @@ -1,17 +0,0 @@ -# -# mtools configuration file for "make (b)zdisk" -# - -# Actual floppy drive -drive a: - file="/dev/fd0" - -# 1.44 MB floppy disk image -drive v: - file="@OBJ@/fdimage" cylinders=80 heads=2 sectors=18 filter - -# 2.88 MB floppy disk image (mostly for virtual uses) -drive w: - file="@OBJ@/fdimage" cylinders=80 heads=2 sectors=36 filter - - diff --git a/arch/x86_64/boot/setup.S b/arch/x86_64/boot/setup.S deleted file mode 100644 index e9e33f94969..00000000000 --- a/arch/x86_64/boot/setup.S +++ /dev/null @@ -1,826 +0,0 @@ -/* - * setup.S Copyright (C) 1991, 1992 Linus Torvalds - * - * setup.s is responsible for getting the system data from the BIOS, - * and putting them into the appropriate places in system memory. - * both setup.s and system has been loaded by the bootblock. - * - * This code asks the bios for memory/disk/other parameters, and - * puts them in a "safe" place: 0x90000-0x901FF, ie where the - * boot-block used to be. It is then up to the protected mode - * system to read them from there before the area is overwritten - * for buffer-blocks. 
- * - * Move PS/2 aux init code to psaux.c - * (troyer@saifr00.cfsat.Honeywell.COM) 03Oct92 - * - * some changes and additional features by Christoph Niemann, - * March 1993/June 1994 (Christoph.Niemann@linux.org) - * - * add APM BIOS checking by Stephen Rothwell, May 1994 - * (sfr@canb.auug.org.au) - * - * High load stuff, initrd support and position independency - * by Hans Lermen & Werner Almesberger, February 1996 - * <lermen@elserv.ffm.fgan.de>, <almesber@lrc.epfl.ch> - * - * Video handling moved to video.S by Martin Mares, March 1996 - * <mj@k332.feld.cvut.cz> - * - * Extended memory detection scheme retwiddled by orc@pell.chi.il.us (david - * parsons) to avoid loadlin confusion, July 1997 - * - * Transcribed from Intel (as86) -> AT&T (gas) by Chris Noe, May 1999. - * <stiker@northlink.com> - * - * Fix to work around buggy BIOSes which don't use carry bit correctly - * and/or report extended memory in CX/DX for e801h memory size detection - * call. As a result the kernel got wrong figures. The int15/e801h docs - * from Ralf Brown interrupt list seem to indicate AX/BX should be used - * anyway. So to avoid breaking many machines (presumably there was a reason - * to orginally use CX/DX instead of AX/BX), we do a kludge to see - * if CX/DX have been changed in the e801 call and if so use AX/BX . - * Michael Miller, April 2001 <michaelm@mjmm.org> - * - * Added long mode checking and SSE force. March 2003, Andi Kleen. - */ - -#include <asm/segment.h> -#include <linux/utsrelease.h> -#include <linux/compile.h> -#include <asm/boot.h> -#include <asm/e820.h> -#include <asm/page.h> -#include <asm/setup.h> - -/* Signature words to ensure LILO loaded us right */ -#define SIG1 0xAA55 -#define SIG2 0x5A5A - -INITSEG = DEF_INITSEG # 0x9000, we move boot here, out of the way -SYSSEG = DEF_SYSSEG # 0x1000, system loaded at 0x10000 (65536). -SETUPSEG = DEF_SETUPSEG # 0x9020, this is the current segment - # ... and the former contents of CS - -DELTA_INITSEG = SETUPSEG - INITSEG # 0x0020 - -.code16 -.globl begtext, begdata, begbss, endtext, enddata, endbss - -.text -begtext: -.data -begdata: -.bss -begbss: -.text - -start: - jmp trampoline - -# This is the setup header, and it must start at %cs:2 (old 0x9020:2) - - .ascii "HdrS" # header signature - .word 0x0206 # header version number (>= 0x0105) - # or else old loadlin-1.5 will fail) -realmode_swtch: .word 0, 0 # default_switch, SETUPSEG -start_sys_seg: .word SYSSEG - .word kernel_version # pointing to kernel version string - # above section of header is compatible - # with loadlin-1.5 (header v1.5). Don't - # change it. - -type_of_loader: .byte 0 # = 0, old one (LILO, Loadlin, - # Bootlin, SYSLX, bootsect...) - # See Documentation/i386/boot.txt for - # assigned ids - -# flags, unused bits must be zero (RFU) bit within loadflags -loadflags: -LOADED_HIGH = 1 # If set, the kernel is loaded high -CAN_USE_HEAP = 0x80 # If set, the loader also has set - # heap_end_ptr to tell how much - # space behind setup.S can be used for - # heap purposes. - # Only the loader knows what is free -#ifndef __BIG_KERNEL__ - .byte 0 -#else - .byte LOADED_HIGH -#endif - -setup_move_size: .word 0x8000 # size to move, when setup is not - # loaded at 0x90000. We will move setup - # to 0x90000 then just before jumping - # into the kernel. However, only the - # loader knows how much data behind - # us also needs to be loaded. - -code32_start: # here loaders can put a different - # start address for 32-bit code. 
-#ifndef __BIG_KERNEL__ - .long 0x1000 # 0x1000 = default for zImage -#else - .long 0x100000 # 0x100000 = default for big kernel -#endif - -ramdisk_image: .long 0 # address of loaded ramdisk image - # Here the loader puts the 32-bit - # address where it loaded the image. - # This only will be read by the kernel. - -ramdisk_size: .long 0 # its size in bytes - -bootsect_kludge: - .long 0 # obsolete - -heap_end_ptr: .word modelist+1024 # (Header version 0x0201 or later) - # space from here (exclusive) down to - # end of setup code can be used by setup - # for local heap purposes. - -pad1: .word 0 -cmd_line_ptr: .long 0 # (Header version 0x0202 or later) - # If nonzero, a 32-bit pointer - # to the kernel command line. - # The command line should be - # located between the start of - # setup and the end of low - # memory (0xa0000), or it may - # get overwritten before it - # gets read. If this field is - # used, there is no longer - # anything magical about the - # 0x90000 segment; the setup - # can be located anywhere in - # low memory 0x10000 or higher. - -ramdisk_max: .long 0xffffffff -kernel_alignment: .long 0x200000 # physical addr alignment required for - # protected mode relocatable kernel -#ifdef CONFIG_RELOCATABLE -relocatable_kernel: .byte 1 -#else -relocatable_kernel: .byte 0 -#endif -pad2: .byte 0 -pad3: .word 0 - -cmdline_size: .long COMMAND_LINE_SIZE-1 #length of the command line, - #added with boot protocol - #version 2.06 - -trampoline: call start_of_setup - .align 16 - # The offset at this point is 0x240 - .space (0xeff-0x240+1) # E820 & EDD space (ending at 0xeff) -# End of setup header ##################################################### - -start_of_setup: -# Bootlin depends on this being done early - movw $0x01500, %ax - movb $0x81, %dl - int $0x13 - -#ifdef SAFE_RESET_DISK_CONTROLLER -# Reset the disk controller. - movw $0x0000, %ax - movb $0x80, %dl - int $0x13 -#endif - -# Set %ds = %cs, we know that SETUPSEG = %cs at this point - movw %cs, %ax # aka SETUPSEG - movw %ax, %ds -# Check signature at end of setup - cmpw $SIG1, setup_sig1 - jne bad_sig - - cmpw $SIG2, setup_sig2 - jne bad_sig - - jmp good_sig1 - -# Routine to print asciiz string at ds:si -prtstr: - lodsb - andb %al, %al - jz fin - - call prtchr - jmp prtstr - -fin: ret - -# Space printing -prtsp2: call prtspc # Print double space -prtspc: movb $0x20, %al # Print single space (note: fall-thru) - -prtchr: - pushw %ax - pushw %cx - movw $0007,%bx - movw $0x01, %cx - movb $0x0e, %ah - int $0x10 - popw %cx - popw %ax - ret - -beep: movb $0x07, %al - jmp prtchr - -no_sig_mess: .string "No setup signature found ..." 
- -good_sig1: - jmp good_sig - -# We now have to find the rest of the setup code/data -bad_sig: - movw %cs, %ax # SETUPSEG - subw $DELTA_INITSEG, %ax # INITSEG - movw %ax, %ds - xorb %bh, %bh - movb (497), %bl # get setup sect from bootsect - subw $4, %bx # LILO loads 4 sectors of setup - shlw $8, %bx # convert to words (1sect=2^8 words) - movw %bx, %cx - shrw $3, %bx # convert to segment - addw $SYSSEG, %bx - movw %bx, %cs:start_sys_seg -# Move rest of setup code/data to here - movw $2048, %di # four sectors loaded by LILO - subw %si, %si - movw %cs, %ax # aka SETUPSEG - movw %ax, %es - movw $SYSSEG, %ax - movw %ax, %ds - rep - movsw - movw %cs, %ax # aka SETUPSEG - movw %ax, %ds - cmpw $SIG1, setup_sig1 - jne no_sig - - cmpw $SIG2, setup_sig2 - jne no_sig - - jmp good_sig - -no_sig: - lea no_sig_mess, %si - call prtstr - -no_sig_loop: - jmp no_sig_loop - -good_sig: - movw %cs, %ax # aka SETUPSEG - subw $DELTA_INITSEG, %ax # aka INITSEG - movw %ax, %ds -# Check if an old loader tries to load a big-kernel - testb $LOADED_HIGH, %cs:loadflags # Do we have a big kernel? - jz loader_ok # No, no danger for old loaders. - - cmpb $0, %cs:type_of_loader # Do we have a loader that - # can deal with us? - jnz loader_ok # Yes, continue. - - pushw %cs # No, we have an old loader, - popw %ds # die. - lea loader_panic_mess, %si - call prtstr - - jmp no_sig_loop - -loader_panic_mess: .string "Wrong loader, giving up..." - -loader_ok: - /* check for long mode. */ - /* we have to do this before the VESA setup, otherwise the user - can't see the error message. */ - - pushw %ds - movw %cs,%ax - movw %ax,%ds - - call verify_cpu - testl %eax,%eax - jz sse_ok - -no_longmode: - call beep - lea long_mode_panic,%si - call prtstr -no_longmode_loop: - jmp no_longmode_loop -long_mode_panic: - .string "Your CPU does not support long mode. Use a 32bit distribution." - .byte 0 - -#include "../kernel/verify_cpu.S" -sse_ok: - popw %ds - -# tell BIOS we want to go to long mode - movl $0xec00,%eax # declare target operating mode - movl $2,%ebx # long mode - int $0x15 - -# Get memory size (extended mem, kB) - - xorl %eax, %eax - movl %eax, (0x1e0) -#ifndef STANDARD_MEMORY_BIOS_CALL - movb %al, (E820NR) -# Try three different memory detection schemes. First, try -# e820h, which lets us assemble a memory map, then try e801h, -# which returns a 32-bit memory size, and finally 88h, which -# returns 0-64m - -# method E820H: -# the memory map from hell. e820h returns memory classified into -# a whole bunch of different types, and allows memory holes and -# everything. We scan through this memory map and build a list -# of the first 32 memory areas, which we return at [E820MAP]. -# This is documented at http://www.acpi.info/, in the ACPI 2.0 specification. - -#define SMAP 0x534d4150 - -meme820: - xorl %ebx, %ebx # continuation counter - movw $E820MAP, %di # point into the whitelist - # so we can have the bios - # directly write into it. - -jmpe820: - movl $0x0000e820, %eax # e820, upper word zeroed - movl $SMAP, %edx # ascii 'SMAP' - movl $20, %ecx # size of the e820rec - pushw %ds # data record. - popw %es - int $0x15 # make the call - jc bail820 # fall to e801 if it fails - - cmpl $SMAP, %eax # check the return is `SMAP' - jne bail820 # fall to e801 if it fails - -# cmpl $1, 16(%di) # is this usable memory? -# jne again820 - - # If this is usable memory, we save it by simply advancing %di by - # sizeof(e820rec). 
- # -good820: - movb (E820NR), %al # up to 128 entries - cmpb $E820MAX, %al - jae bail820 - - incb (E820NR) - movw %di, %ax - addw $20, %ax - movw %ax, %di -again820: - cmpl $0, %ebx # check to see if - jne jmpe820 # %ebx is set to EOF -bail820: - - -# method E801H: -# memory size is in 1k chunksizes, to avoid confusing loadlin. -# we store the 0xe801 memory size in a completely different place, -# because it will most likely be longer than 16 bits. -# (use 1e0 because that's what Larry Augustine uses in his -# alternative new memory detection scheme, and it's sensible -# to write everything into the same place.) - -meme801: - stc # fix to work around buggy - xorw %cx,%cx # BIOSes which don't clear/set - xorw %dx,%dx # carry on pass/error of - # e801h memory size call - # or merely pass cx,dx though - # without changing them. - movw $0xe801, %ax - int $0x15 - jc mem88 - - cmpw $0x0, %cx # Kludge to handle BIOSes - jne e801usecxdx # which report their extended - cmpw $0x0, %dx # memory in AX/BX rather than - jne e801usecxdx # CX/DX. The spec I have read - movw %ax, %cx # seems to indicate AX/BX - movw %bx, %dx # are more reasonable anyway... - -e801usecxdx: - andl $0xffff, %edx # clear sign extend - shll $6, %edx # and go from 64k to 1k chunks - movl %edx, (0x1e0) # store extended memory size - andl $0xffff, %ecx # clear sign extend - addl %ecx, (0x1e0) # and add lower memory into - # total size. - -# Ye Olde Traditional Methode. Returns the memory size (up to 16mb or -# 64mb, depending on the bios) in ax. -mem88: - -#endif - movb $0x88, %ah - int $0x15 - movw %ax, (2) - -# Set the keyboard repeat rate to the max - movw $0x0305, %ax - xorw %bx, %bx - int $0x16 - -# Check for video adapter and its parameters and allow the -# user to browse video modes. - call video # NOTE: we need %ds pointing - # to bootsector - -# Get hd0 data... - xorw %ax, %ax - movw %ax, %ds - ldsw (4 * 0x41), %si - movw %cs, %ax # aka SETUPSEG - subw $DELTA_INITSEG, %ax # aka INITSEG - pushw %ax - movw %ax, %es - movw $0x0080, %di - movw $0x10, %cx - pushw %cx - cld - rep - movsb -# Get hd1 data... - xorw %ax, %ax - movw %ax, %ds - ldsw (4 * 0x46), %si - popw %cx - popw %es - movw $0x0090, %di - rep - movsb -# Check that there IS a hd1 :-) - movw $0x01500, %ax - movb $0x81, %dl - int $0x13 - jc no_disk1 - - cmpb $3, %ah - je is_disk1 - -no_disk1: - movw %cs, %ax # aka SETUPSEG - subw $DELTA_INITSEG, %ax # aka INITSEG - movw %ax, %es - movw $0x0090, %di - movw $0x10, %cx - xorw %ax, %ax - cld - rep - stosb -is_disk1: - -# Check for PS/2 pointing device - movw %cs, %ax # aka SETUPSEG - subw $DELTA_INITSEG, %ax # aka INITSEG - movw %ax, %ds - movb $0, (0x1ff) # default is no pointing device - int $0x11 # int 0x11: equipment list - testb $0x04, %al # check if mouse installed - jz no_psmouse - - movb $0xAA, (0x1ff) # device present -no_psmouse: - -#include "../../i386/boot/edd.S" - -# Now we want to move to protected mode ... - cmpw $0, %cs:realmode_swtch - jz rmodeswtch_normal - - lcall *%cs:realmode_swtch - - jmp rmodeswtch_end - -rmodeswtch_normal: - pushw %cs - call default_switch - -rmodeswtch_end: -# we get the code32 start address and modify the below 'jmpi' -# (loader may have changed it) - movl %cs:code32_start, %eax - movl %eax, %cs:code32 - -# Now we move the system to its rightful place ... but we check if we have a -# big-kernel. In that case we *must* not move it ... - testb $LOADED_HIGH, %cs:loadflags - jz do_move0 # .. then we have a normal low - # loaded zImage - # .. 
or else we have a high - # loaded bzImage - jmp end_move # ... and we skip moving - -do_move0: - movw $0x100, %ax # start of destination segment - movw %cs, %bp # aka SETUPSEG - subw $DELTA_INITSEG, %bp # aka INITSEG - movw %cs:start_sys_seg, %bx # start of source segment - cld -do_move: - movw %ax, %es # destination segment - incb %ah # instead of add ax,#0x100 - movw %bx, %ds # source segment - addw $0x100, %bx - subw %di, %di - subw %si, %si - movw $0x800, %cx - rep - movsw - cmpw %bp, %bx # assume start_sys_seg > 0x200, - # so we will perhaps read one - # page more than needed, but - # never overwrite INITSEG - # because destination is a - # minimum one page below source - jb do_move - -end_move: -# then we load the segment descriptors - movw %cs, %ax # aka SETUPSEG - movw %ax, %ds - -# Check whether we need to be downward compatible with version <=201 - cmpl $0, cmd_line_ptr - jne end_move_self # loader uses version >=202 features - cmpb $0x20, type_of_loader - je end_move_self # bootsect loader, we know of it - -# Boot loader doesnt support boot protocol version 2.02. -# If we have our code not at 0x90000, we need to move it there now. -# We also then need to move the params behind it (commandline) -# Because we would overwrite the code on the current IP, we move -# it in two steps, jumping high after the first one. - movw %cs, %ax - cmpw $SETUPSEG, %ax - je end_move_self - - cli # make sure we really have - # interrupts disabled ! - # because after this the stack - # should not be used - subw $DELTA_INITSEG, %ax # aka INITSEG - movw %ss, %dx - cmpw %ax, %dx - jb move_self_1 - - addw $INITSEG, %dx - subw %ax, %dx # this will go into %ss after - # the move -move_self_1: - movw %ax, %ds - movw $INITSEG, %ax # real INITSEG - movw %ax, %es - movw %cs:setup_move_size, %cx - std # we have to move up, so we use - # direction down because the - # areas may overlap - movw %cx, %di - decw %di - movw %di, %si - subw $move_self_here+0x200, %cx - rep - movsb - ljmp $SETUPSEG, $move_self_here - -move_self_here: - movw $move_self_here+0x200, %cx - rep - movsb - movw $SETUPSEG, %ax - movw %ax, %ds - movw %dx, %ss -end_move_self: # now we are at the right place - lidt idt_48 # load idt with 0,0 - xorl %eax, %eax # Compute gdt_base - movw %ds, %ax # (Convert %ds:gdt to a linear ptr) - shll $4, %eax - addl $gdt, %eax - movl %eax, (gdt_48+2) - lgdt gdt_48 # load gdt with whatever is - # appropriate - -# that was painless, now we enable a20 - call empty_8042 - - movb $0xD1, %al # command write - outb %al, $0x64 - call empty_8042 - - movb $0xDF, %al # A20 on - outb %al, $0x60 - call empty_8042 - -# -# You must preserve the other bits here. Otherwise embarrasing things -# like laptops powering off on boot happen. Corrected version by Kira -# Brown from Linux 2.2 -# - inb $0x92, %al # - orb $02, %al # "fast A20" version - outb %al, $0x92 # some chips have only this - -# wait until a20 really *is* enabled; it can take a fair amount of -# time on certain systems; Toshiba Tecras are known to have this -# problem. The memory location used here (0x200) is the int 0x80 -# vector, which should be safe to use. - - xorw %ax, %ax # segment 0x0000 - movw %ax, %fs - decw %ax # segment 0xffff (HMA) - movw %ax, %gs -a20_wait: - incw %ax # unused memory location <0xfff0 - movw %ax, %fs:(0x200) # we use the "int 0x80" vector - cmpw %gs:(0x210), %ax # and its corresponding HMA addr - je a20_wait # loop until no longer aliased - -# make sure any possible coprocessor is properly reset.. 
- xorw %ax, %ax - outb %al, $0xf0 - call delay - - outb %al, $0xf1 - call delay - -# well, that went ok, I hope. Now we mask all interrupts - the rest -# is done in init_IRQ(). - movb $0xFF, %al # mask all interrupts for now - outb %al, $0xA1 - call delay - - movb $0xFB, %al # mask all irq's but irq2 which - outb %al, $0x21 # is cascaded - -# Well, that certainly wasn't fun :-(. Hopefully it works, and we don't -# need no steenking BIOS anyway (except for the initial loading :-). -# The BIOS-routine wants lots of unnecessary data, and it's less -# "interesting" anyway. This is how REAL programmers do it. -# -# Well, now's the time to actually move into protected mode. To make -# things as simple as possible, we do no register set-up or anything, -# we let the gnu-compiled 32-bit programs do that. We just jump to -# absolute address 0x1000 (or the loader supplied one), -# in 32-bit protected mode. -# -# Note that the short jump isn't strictly needed, although there are -# reasons why it might be a good idea. It won't hurt in any case. - movw $1, %ax # protected mode (PE) bit - lmsw %ax # This is it! - jmp flush_instr - -flush_instr: - xorw %bx, %bx # Flag to indicate a boot - xorl %esi, %esi # Pointer to real-mode code - movw %cs, %si - subw $DELTA_INITSEG, %si - shll $4, %esi # Convert to 32-bit pointer -# NOTE: For high loaded big kernels we need a -# jmpi 0x100000,__KERNEL_CS -# -# but we yet haven't reloaded the CS register, so the default size -# of the target offset still is 16 bit. -# However, using an operand prefix (0x66), the CPU will properly -# take our 48 bit far pointer. (INTeL 80386 Programmer's Reference -# Manual, Mixing 16-bit and 32-bit code, page 16-6) - - .byte 0x66, 0xea # prefix + jmpi-opcode -code32: .long 0x1000 # will be set to 0x100000 - # for big kernels - .word __KERNEL_CS - -# Here's a bunch of information about your current kernel.. -kernel_version: .ascii UTS_RELEASE - .ascii " (" - .ascii LINUX_COMPILE_BY - .ascii "@" - .ascii LINUX_COMPILE_HOST - .ascii ") " - .ascii UTS_VERSION - .byte 0 - -# This is the default real mode switch routine. -# to be called just before protected mode transition -default_switch: - cli # no interrupts allowed ! - movb $0x80, %al # disable NMI for bootup - # sequence - outb %al, $0x70 - lret - - -# This routine checks that the keyboard command queue is empty -# (after emptying the output buffers) -# -# Some machines have delusions that the keyboard buffer is always full -# with no keyboard attached... -# -# If there is no keyboard controller, we will usually get 0xff -# to all the reads. With each IO taking a microsecond and -# a timeout of 100,000 iterations, this can take about half a -# second ("delay" == outb to port 0x80). That should be ok, -# and should also be plenty of time for a real keyboard controller -# to empty. -# - -empty_8042: - pushl %ecx - movl $100000, %ecx - -empty_8042_loop: - decl %ecx - jz empty_8042_end_loop - - call delay - - inb $0x64, %al # 8042 status port - testb $1, %al # output buffer? - jz no_output - - call delay - inb $0x60, %al # read it - jmp empty_8042_loop - -no_output: - testb $2, %al # is input buffer full? - jnz empty_8042_loop # yes - loop -empty_8042_end_loop: - popl %ecx - ret - -# Read the cmos clock. 
Return the seconds in al -gettime: - pushw %cx - movb $0x02, %ah - int $0x1a - movb %dh, %al # %dh contains the seconds - andb $0x0f, %al - movb %dh, %ah - movb $0x04, %cl - shrb %cl, %ah - aad - popw %cx - ret - -# Delay is needed after doing I/O -delay: - outb %al,$0x80 - ret - -# Descriptor tables -gdt: - .word 0, 0, 0, 0 # dummy - - .word 0, 0, 0, 0 # unused - - .word 0xFFFF # 4Gb - (0x100000*0x1000 = 4Gb) - .word 0 # base address = 0 - .word 0x9A00 # code read/exec - .word 0x00CF # granularity = 4096, 386 - # (+5th nibble of limit) - - .word 0xFFFF # 4Gb - (0x100000*0x1000 = 4Gb) - .word 0 # base address = 0 - .word 0x9200 # data read/write - .word 0x00CF # granularity = 4096, 386 - # (+5th nibble of limit) -gdt_end: -idt_48: - .word 0 # idt limit = 0 - .word 0, 0 # idt base = 0L -gdt_48: - .word gdt_end-gdt-1 # gdt limit - .word 0, 0 # gdt base (filled in later) - -# Include video setup & detection code - -#include "../../i386/boot/video.S" - -# Setup signature -- must be last -setup_sig1: .word SIG1 -setup_sig2: .word SIG2 - -# After this point, there is some free space which is used by the video mode -# handling code to store the temporary mode table (not used by the kernel). - -modelist: - -.text -endtext: -.data -enddata: -.bss -endbss: diff --git a/arch/x86_64/boot/tools/build.c b/arch/x86_64/boot/tools/build.c deleted file mode 100644 index eae86691709..00000000000 --- a/arch/x86_64/boot/tools/build.c +++ /dev/null @@ -1,185 +0,0 @@ -/* - * Copyright (C) 1991, 1992 Linus Torvalds - * Copyright (C) 1997 Martin Mares - */ - -/* - * This file builds a disk-image from three different files: - * - * - bootsect: compatibility mbr which prints an error message if - * someone tries to boot the kernel directly. - * - setup: 8086 machine code, sets up system parm - * - system: 80386 code for actual system - * - * It does some checking that all files are of the correct type, and - * just writes the result to stdout, removing headers and padding to - * the right amount. It also writes some system data to stderr. - */ - -/* - * Changes by tytso to allow root device specification - * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996 - * Cross compiling fixes by Gertjan van Wingerde, July 1996 - * Rewritten by Martin Mares, April 1997 - */ - -#include <stdio.h> -#include <string.h> -#include <stdlib.h> -#include <stdarg.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <sys/sysmacros.h> -#include <unistd.h> -#include <fcntl.h> -#include <asm/boot.h> - -typedef unsigned char byte; -typedef unsigned short word; -typedef unsigned long u32; - -#define DEFAULT_MAJOR_ROOT 0 -#define DEFAULT_MINOR_ROOT 0 - -/* Minimal number of setup sectors (see also bootsect.S) */ -#define SETUP_SECTS 4 - -byte buf[1024]; -int fd; -int is_big_kernel; - -void die(const char * str, ...) 
-{ - va_list args; - va_start(args, str); - vfprintf(stderr, str, args); - fputc('\n', stderr); - exit(1); -} - -void file_open(const char *name) -{ - if ((fd = open(name, O_RDONLY, 0)) < 0) - die("Unable to open `%s': %m", name); -} - -void usage(void) -{ - die("Usage: build [-b] bootsect setup system [rootdev] [> image]"); -} - -int main(int argc, char ** argv) -{ - unsigned int i, c, sz, setup_sectors; - u32 sys_size; - byte major_root, minor_root; - struct stat sb; - - if (argc > 2 && !strcmp(argv[1], "-b")) - { - is_big_kernel = 1; - argc--, argv++; - } - if ((argc < 4) || (argc > 5)) - usage(); - if (argc > 4) { - if (!strcmp(argv[4], "CURRENT")) { - if (stat("/", &sb)) { - perror("/"); - die("Couldn't stat /"); - } - major_root = major(sb.st_dev); - minor_root = minor(sb.st_dev); - } else if (strcmp(argv[4], "FLOPPY")) { - if (stat(argv[4], &sb)) { - perror(argv[4]); - die("Couldn't stat root device."); - } - major_root = major(sb.st_rdev); - minor_root = minor(sb.st_rdev); - } else { - major_root = 0; - minor_root = 0; - } - } else { - major_root = DEFAULT_MAJOR_ROOT; - minor_root = DEFAULT_MINOR_ROOT; - } - fprintf(stderr, "Root device is (%d, %d)\n", major_root, minor_root); - - file_open(argv[1]); - i = read(fd, buf, sizeof(buf)); - fprintf(stderr,"Boot sector %d bytes.\n",i); - if (i != 512) - die("Boot block must be exactly 512 bytes"); - if (buf[510] != 0x55 || buf[511] != 0xaa) - die("Boot block hasn't got boot flag (0xAA55)"); - buf[508] = minor_root; - buf[509] = major_root; - if (write(1, buf, 512) != 512) - die("Write call failed"); - close (fd); - - file_open(argv[2]); /* Copy the setup code */ - for (i=0 ; (c=read(fd, buf, sizeof(buf)))>0 ; i+=c ) - if (write(1, buf, c) != c) - die("Write call failed"); - if (c != 0) - die("read-error on `setup'"); - close (fd); - - setup_sectors = (i + 511) / 512; /* Pad unused space with zeros */ - /* for compatibility with ancient versions of LILO. */ - if (setup_sectors < SETUP_SECTS) - setup_sectors = SETUP_SECTS; - fprintf(stderr, "Setup is %d bytes.\n", i); - memset(buf, 0, sizeof(buf)); - while (i < setup_sectors * 512) { - c = setup_sectors * 512 - i; - if (c > sizeof(buf)) - c = sizeof(buf); - if (write(1, buf, c) != c) - die("Write call failed"); - i += c; - } - - file_open(argv[3]); - if (fstat (fd, &sb)) - die("Unable to stat `%s': %m", argv[3]); - sz = sb.st_size; - fprintf (stderr, "System is %d kB\n", sz/1024); - sys_size = (sz + 15) / 16; - if (!is_big_kernel && sys_size > DEF_SYSSIZE) - die("System is too big. Try using bzImage or modules."); - while (sz > 0) { - int l, n; - - l = (sz > sizeof(buf)) ? 
sizeof(buf) : sz; - if ((n=read(fd, buf, l)) != l) { - if (n < 0) - die("Error reading %s: %m", argv[3]); - else - die("%s: Unexpected EOF", argv[3]); - } - if (write(1, buf, l) != l) - die("Write failed"); - sz -= l; - } - close(fd); - - if (lseek(1, 497, SEEK_SET) != 497) /* Write sizes to the bootsector */ - die("Output: seek failed"); - buf[0] = setup_sectors; - if (write(1, buf, 1) != 1) - die("Write of setup sector count failed"); - if (lseek(1, 500, SEEK_SET) != 500) - die("Output: seek failed"); - buf[0] = (sys_size & 0xff); - buf[1] = ((sys_size >> 8) & 0xff); - buf[2] = ((sys_size >> 16) & 0xff); - buf[3] = ((sys_size >> 24) & 0xff); - if (write(1, buf, 4) != 4) - die("Write of image length failed"); - - return 0; /* Everything is OK */ -} diff --git a/arch/x86_64/defconfig b/arch/x86_64/defconfig index 40178e5c310..b7c4cd04bfc 100644 --- a/arch/x86_64/defconfig +++ b/arch/x86_64/defconfig @@ -1,19 +1,22 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.22-rc2 -# Mon May 21 13:23:40 2007 +# Linux kernel version: 2.6.22-git14 +# Fri Jul 20 09:53:15 2007 # CONFIG_X86_64=y CONFIG_64BIT=y CONFIG_X86=y CONFIG_GENERIC_TIME=y CONFIG_GENERIC_TIME_VSYSCALL=y +CONFIG_GENERIC_CMOS_UPDATE=y CONFIG_ZONE_DMA32=y CONFIG_LOCKDEP_SUPPORT=y CONFIG_STACKTRACE_SUPPORT=y CONFIG_SEMAPHORE_SLEEPERS=y CONFIG_MMU=y CONFIG_ZONE_DMA=y +CONFIG_QUICKLIST=y +CONFIG_NR_QUICK=2 CONFIG_RWSEM_GENERIC_SPINLOCK=y CONFIG_GENERIC_HWEIGHT=y CONFIG_GENERIC_CALIBRATE_DELAY=y @@ -44,19 +47,18 @@ CONFIG_LOCALVERSION="" CONFIG_LOCALVERSION_AUTO=y CONFIG_SWAP=y CONFIG_SYSVIPC=y -# CONFIG_IPC_NS is not set CONFIG_SYSVIPC_SYSCTL=y CONFIG_POSIX_MQUEUE=y # CONFIG_BSD_PROCESS_ACCT is not set # CONFIG_TASKSTATS is not set -# CONFIG_UTS_NS is not set +# CONFIG_USER_NS is not set # CONFIG_AUDIT is not set CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_LOG_BUF_SHIFT=18 # CONFIG_CPUSETS is not set CONFIG_SYSFS_DEPRECATED=y -# CONFIG_RELAY is not set +CONFIG_RELAY=y CONFIG_BLK_DEV_INITRD=y CONFIG_INITRAMFS_SOURCE="" CONFIG_CC_OPTIMIZE_FOR_SIZE=y @@ -86,10 +88,6 @@ CONFIG_SLAB=y CONFIG_RT_MUTEXES=y # CONFIG_TINY_SHMEM is not set CONFIG_BASE_SMALL=0 - -# -# Loadable module support -# CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y CONFIG_MODULE_FORCE_UNLOAD=y @@ -97,12 +95,9 @@ CONFIG_MODULE_FORCE_UNLOAD=y # CONFIG_MODULE_SRCVERSION_ALL is not set # CONFIG_KMOD is not set CONFIG_STOP_MACHINE=y - -# -# Block layer -# CONFIG_BLOCK=y # CONFIG_BLK_DEV_IO_TRACE is not set +# CONFIG_BLK_DEV_BSG is not set # # IO Schedulers @@ -165,9 +160,12 @@ CONFIG_SPLIT_PTLOCK_CPUS=4 CONFIG_MIGRATION=y CONFIG_RESOURCES_64BIT=y CONFIG_ZONE_DMA_FLAG=1 +CONFIG_BOUNCE=y +CONFIG_VIRT_TO_BUS=y CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID=y CONFIG_OUT_OF_LINE_PFN_TO_PAGE=y CONFIG_NR_CPUS=32 +CONFIG_PHYSICAL_ALIGN=0x200000 CONFIG_HOTPLUG_CPU=y CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y CONFIG_HPET_TIMER=y @@ -180,7 +178,7 @@ CONFIG_X86_MCE_INTEL=y CONFIG_X86_MCE_AMD=y # CONFIG_KEXEC is not set # CONFIG_CRASH_DUMP is not set -CONFIG_RELOCATABLE=y +# CONFIG_RELOCATABLE is not set CONFIG_PHYSICAL_START=0x200000 CONFIG_SECCOMP=y # CONFIG_CC_STACKPROTECTOR is not set @@ -201,7 +199,6 @@ CONFIG_GENERIC_PENDING_IRQ=y CONFIG_PM=y # CONFIG_PM_LEGACY is not set # CONFIG_PM_DEBUG is not set -# CONFIG_PM_SYSFS_DEPRECATED is not set CONFIG_SOFTWARE_SUSPEND=y CONFIG_PM_STD_PARTITION="" CONFIG_SUSPEND_SMP=y @@ -248,7 +245,7 @@ CONFIG_CPU_FREQ_GOV_PERFORMANCE=y # CONFIG_CPU_FREQ_GOV_POWERSAVE is not set CONFIG_CPU_FREQ_GOV_USERSPACE=y CONFIG_CPU_FREQ_GOV_ONDEMAND=y -# 
CONFIG_CPU_FREQ_GOV_CONSERVATIVE is not set +CONFIG_CPU_FREQ_GOV_CONSERVATIVE=y # # CPUFreq processor drivers @@ -351,20 +348,8 @@ CONFIG_IPV6_SIT=y # CONFIG_IPV6_MULTIPLE_TABLES is not set # CONFIG_NETWORK_SECMARK is not set # CONFIG_NETFILTER is not set - -# -# DCCP Configuration (EXPERIMENTAL) -# # CONFIG_IP_DCCP is not set - -# -# SCTP Configuration (EXPERIMENTAL) -# # CONFIG_IP_SCTP is not set - -# -# TIPC Configuration (EXPERIMENTAL) -# # CONFIG_TIPC is not set # CONFIG_ATM is not set # CONFIG_BRIDGE is not set @@ -401,6 +386,7 @@ CONFIG_IPV6_SIT=y # CONFIG_MAC80211 is not set # CONFIG_IEEE80211 is not set # CONFIG_RFKILL is not set +# CONFIG_NET_9P is not set # # Device Drivers @@ -415,21 +401,9 @@ CONFIG_FW_LOADER=y # CONFIG_DEBUG_DRIVER is not set # CONFIG_DEBUG_DEVRES is not set # CONFIG_SYS_HYPERVISOR is not set - -# -# Connector - unified userspace <-> kernelspace linker -# # CONFIG_CONNECTOR is not set # CONFIG_MTD is not set - -# -# Parallel port support -# # CONFIG_PARPORT is not set - -# -# Plug and Play support -# CONFIG_PNP=y # CONFIG_PNP_DEBUG is not set @@ -437,10 +411,7 @@ CONFIG_PNP=y # Protocols # CONFIG_PNPACPI=y - -# -# Block devices -# +CONFIG_BLK_DEV=y CONFIG_BLK_DEV_FD=y # CONFIG_BLK_CPQ_DA is not set # CONFIG_BLK_CPQ_CISS_DA is not set @@ -458,17 +429,14 @@ CONFIG_BLK_DEV_RAM_SIZE=4096 CONFIG_BLK_DEV_RAM_BLOCKSIZE=1024 # CONFIG_CDROM_PKTCDVD is not set # CONFIG_ATA_OVER_ETH is not set - -# -# Misc devices -# +CONFIG_MISC_DEVICES=y # CONFIG_IBM_ASM is not set # CONFIG_PHANTOM is not set +# CONFIG_EEPROM_93CX6 is not set # CONFIG_SGI_IOC4 is not set # CONFIG_TIFM_CORE is not set # CONFIG_SONY_LAPTOP is not set # CONFIG_THINKPAD_ACPI is not set -# CONFIG_BLINK is not set CONFIG_IDE=y CONFIG_BLK_DEV_IDE=y @@ -539,6 +507,7 @@ CONFIG_BLK_DEV_IDEDMA=y # # CONFIG_RAID_ATTRS is not set CONFIG_SCSI=y +CONFIG_SCSI_DMA=y # CONFIG_SCSI_TGT is not set CONFIG_SCSI_NETLINK=y # CONFIG_SCSI_PROC_FS is not set @@ -590,11 +559,9 @@ CONFIG_AIC79XX_DEBUG_MASK=0 # CONFIG_AIC79XX_REG_PRETTY_PRINT is not set # CONFIG_SCSI_AIC94XX is not set # CONFIG_SCSI_ARCMSR is not set -CONFIG_MEGARAID_NEWGEN=y -CONFIG_MEGARAID_MM=y -CONFIG_MEGARAID_MAILBOX=y +# CONFIG_MEGARAID_NEWGEN is not set # CONFIG_MEGARAID_LEGACY is not set -CONFIG_MEGARAID_SAS=y +# CONFIG_MEGARAID_SAS is not set # CONFIG_SCSI_HPTIOP is not set # CONFIG_SCSI_BUSLOGIC is not set # CONFIG_SCSI_DMX3191D is not set @@ -614,7 +581,6 @@ CONFIG_MEGARAID_SAS=y # CONFIG_SCSI_DC395x is not set # CONFIG_SCSI_DC390T is not set # CONFIG_SCSI_DEBUG is not set -# CONFIG_SCSI_ESP_CORE is not set # CONFIG_SCSI_SRP is not set CONFIG_ATA=y # CONFIG_ATA_NONSTANDARD is not set @@ -671,10 +637,6 @@ CONFIG_SATA_VIA=y # CONFIG_PATA_SIS is not set # CONFIG_PATA_VIA is not set # CONFIG_PATA_WINBOND is not set - -# -# Multi-device support (RAID and LVM) -# CONFIG_MD=y # CONFIG_BLK_DEV_MD is not set CONFIG_BLK_DEV_DM=y @@ -692,7 +654,7 @@ CONFIG_BLK_DEV_DM=y CONFIG_FUSION=y CONFIG_FUSION_SPI=y # CONFIG_FUSION_FC is not set -CONFIG_FUSION_SAS=y +# CONFIG_FUSION_SAS is not set CONFIG_FUSION_MAX_SGE=128 # CONFIG_FUSION_CTL is not set @@ -710,7 +672,10 @@ CONFIG_IEEE1394=y # # Controllers # -# CONFIG_IEEE1394_PCILYNX is not set + +# +# Texas Instruments PCILynx requires I2C +# CONFIG_IEEE1394_OHCI1394=y # @@ -722,32 +687,19 @@ CONFIG_IEEE1394_OHCI1394=y # CONFIG_IEEE1394_ETH1394 is not set # CONFIG_IEEE1394_DV1394 is not set CONFIG_IEEE1394_RAWIO=y - -# -# I2O device support -# # CONFIG_I2O is not set -# CONFIG_MACINTOSH_DRIVERS is not set - -# -# Network 
device support -# +CONFIG_MACINTOSH_DRIVERS=y +# CONFIG_MAC_EMUMOUSEBTN is not set CONFIG_NETDEVICES=y +CONFIG_NETDEVICES_MULTIQUEUE=y # CONFIG_DUMMY is not set # CONFIG_BONDING is not set +# CONFIG_MACVLAN is not set # CONFIG_EQUALIZER is not set CONFIG_TUN=y # CONFIG_NET_SB1000 is not set - -# -# ARCnet devices -# # CONFIG_ARCNET is not set # CONFIG_PHYLIB is not set - -# -# Ethernet (10 or 100Mbit) -# CONFIG_NET_ETHERNET=y CONFIG_MII=y # CONFIG_HAPPYMEAL is not set @@ -756,10 +708,6 @@ CONFIG_MII=y CONFIG_NET_VENDOR_3COM=y CONFIG_VORTEX=y # CONFIG_TYPHOON is not set - -# -# Tulip family network device support -# CONFIG_NET_TULIP=y # CONFIG_DE2104X is not set CONFIG_TULIP=y @@ -773,7 +721,8 @@ CONFIG_TULIP=y # CONFIG_HP100 is not set CONFIG_NET_PCI=y # CONFIG_PCNET32 is not set -# CONFIG_AMD8111_ETH is not set +CONFIG_AMD8111_ETH=y +# CONFIG_AMD8111E_NAPI is not set # CONFIG_ADAPTEC_STARFIRE is not set CONFIG_B44=y CONFIG_FORCEDETH=y @@ -808,7 +757,6 @@ CONFIG_E1000=y # CONFIG_SIS190 is not set # CONFIG_SKGE is not set # CONFIG_SKY2 is not set -# CONFIG_SK98LIN is not set # CONFIG_VIA_VELOCITY is not set CONFIG_TIGON3=y CONFIG_BNX2=y @@ -823,10 +771,6 @@ CONFIG_S2IO=m # CONFIG_MYRI10GE is not set # CONFIG_NETXEN_NIC is not set # CONFIG_MLX4_CORE is not set - -# -# Token Ring devices -# # CONFIG_TR is not set # @@ -855,15 +799,7 @@ CONFIG_NETCONSOLE=y CONFIG_NETPOLL=y # CONFIG_NETPOLL_TRAP is not set CONFIG_NET_POLL_CONTROLLER=y - -# -# ISDN subsystem -# # CONFIG_ISDN is not set - -# -# Telephony Support -# # CONFIG_PHONE is not set # @@ -871,6 +807,7 @@ CONFIG_NET_POLL_CONTROLLER=y # CONFIG_INPUT=y # CONFIG_INPUT_FF_MEMLESS is not set +# CONFIG_INPUT_POLLDEV is not set # # Userland interfaces @@ -936,6 +873,7 @@ CONFIG_HW_CONSOLE=y # CONFIG_SERIAL_8250=y CONFIG_SERIAL_8250_CONSOLE=y +CONFIG_FIX_EARLYCON_MEM=y CONFIG_SERIAL_8250_PCI=y CONFIG_SERIAL_8250_PNP=y CONFIG_SERIAL_8250_NR_UARTS=4 @@ -951,16 +889,11 @@ CONFIG_SERIAL_CORE_CONSOLE=y CONFIG_UNIX98_PTYS=y CONFIG_LEGACY_PTYS=y CONFIG_LEGACY_PTY_COUNT=256 - -# -# IPMI -# # CONFIG_IPMI_HANDLER is not set # CONFIG_WATCHDOG is not set CONFIG_HW_RANDOM=y CONFIG_HW_RANDOM_INTEL=y CONFIG_HW_RANDOM_AMD=y -# CONFIG_HW_RANDOM_GEODE is not set # CONFIG_NVRAM is not set CONFIG_RTC=y # CONFIG_R3964 is not set @@ -979,127 +912,19 @@ CONFIG_HPET=y # CONFIG_HPET_RTC_IRQ is not set CONFIG_HPET_MMAP=y # CONFIG_HANGCHECK_TIMER is not set - -# -# TPM devices -# # CONFIG_TCG_TPM is not set # CONFIG_TELCLOCK is not set CONFIG_DEVPORT=y -CONFIG_I2C=m -CONFIG_I2C_BOARDINFO=y -CONFIG_I2C_CHARDEV=m - -# -# I2C Algorithms -# -# CONFIG_I2C_ALGOBIT is not set -# CONFIG_I2C_ALGOPCF is not set -# CONFIG_I2C_ALGOPCA is not set - -# -# I2C Hardware Bus support -# -# CONFIG_I2C_ALI1535 is not set -# CONFIG_I2C_ALI1563 is not set -# CONFIG_I2C_ALI15X3 is not set -# CONFIG_I2C_AMD756 is not set -# CONFIG_I2C_AMD8111 is not set -# CONFIG_I2C_I801 is not set -# CONFIG_I2C_I810 is not set -# CONFIG_I2C_PIIX4 is not set -# CONFIG_I2C_NFORCE2 is not set -# CONFIG_I2C_OCORES is not set -# CONFIG_I2C_PARPORT_LIGHT is not set -# CONFIG_I2C_PROSAVAGE is not set -# CONFIG_I2C_SAVAGE4 is not set -# CONFIG_I2C_SIMTEC is not set -# CONFIG_I2C_SIS5595 is not set -# CONFIG_I2C_SIS630 is not set -# CONFIG_I2C_SIS96X is not set -# CONFIG_I2C_STUB is not set -# CONFIG_I2C_TINY_USB is not set -# CONFIG_I2C_VIA is not set -# CONFIG_I2C_VIAPRO is not set -# CONFIG_I2C_VOODOO3 is not set - -# -# Miscellaneous I2C Chip support -# -# CONFIG_SENSORS_DS1337 is not set -# CONFIG_SENSORS_DS1374 is 
not set -# CONFIG_SENSORS_EEPROM is not set -# CONFIG_SENSORS_PCF8574 is not set -# CONFIG_SENSORS_PCA9539 is not set -# CONFIG_SENSORS_PCF8591 is not set -# CONFIG_SENSORS_MAX6875 is not set -# CONFIG_I2C_DEBUG_CORE is not set -# CONFIG_I2C_DEBUG_ALGO is not set -# CONFIG_I2C_DEBUG_BUS is not set -# CONFIG_I2C_DEBUG_CHIP is not set +# CONFIG_I2C is not set # # SPI support # # CONFIG_SPI is not set # CONFIG_SPI_MASTER is not set - -# -# Dallas's 1-wire bus -# # CONFIG_W1 is not set -CONFIG_HWMON=y -# CONFIG_HWMON_VID is not set -# CONFIG_SENSORS_ABITUGURU is not set -# CONFIG_SENSORS_AD7418 is not set -# CONFIG_SENSORS_ADM1021 is not set -# CONFIG_SENSORS_ADM1025 is not set -# CONFIG_SENSORS_ADM1026 is not set -# CONFIG_SENSORS_ADM1029 is not set -# CONFIG_SENSORS_ADM1031 is not set -# CONFIG_SENSORS_ADM9240 is not set -# CONFIG_SENSORS_K8TEMP is not set -# CONFIG_SENSORS_ASB100 is not set -# CONFIG_SENSORS_ATXP1 is not set -# CONFIG_SENSORS_DS1621 is not set -# CONFIG_SENSORS_F71805F is not set -# CONFIG_SENSORS_FSCHER is not set -# CONFIG_SENSORS_FSCPOS is not set -# CONFIG_SENSORS_GL518SM is not set -# CONFIG_SENSORS_GL520SM is not set -CONFIG_SENSORS_CORETEMP=y -# CONFIG_SENSORS_IT87 is not set -# CONFIG_SENSORS_LM63 is not set -# CONFIG_SENSORS_LM75 is not set -# CONFIG_SENSORS_LM77 is not set -# CONFIG_SENSORS_LM78 is not set -# CONFIG_SENSORS_LM80 is not set -# CONFIG_SENSORS_LM83 is not set -# CONFIG_SENSORS_LM85 is not set -# CONFIG_SENSORS_LM87 is not set -# CONFIG_SENSORS_LM90 is not set -# CONFIG_SENSORS_LM92 is not set -# CONFIG_SENSORS_MAX1619 is not set -# CONFIG_SENSORS_MAX6650 is not set -# CONFIG_SENSORS_PC87360 is not set -# CONFIG_SENSORS_PC87427 is not set -# CONFIG_SENSORS_SIS5595 is not set -# CONFIG_SENSORS_SMSC47M1 is not set -# CONFIG_SENSORS_SMSC47M192 is not set -CONFIG_SENSORS_SMSC47B397=m -# CONFIG_SENSORS_VIA686A is not set -# CONFIG_SENSORS_VT1211 is not set -# CONFIG_SENSORS_VT8231 is not set -# CONFIG_SENSORS_W83781D is not set -# CONFIG_SENSORS_W83791D is not set -# CONFIG_SENSORS_W83792D is not set -# CONFIG_SENSORS_W83793 is not set -# CONFIG_SENSORS_W83L785TS is not set -# CONFIG_SENSORS_W83627HF is not set -# CONFIG_SENSORS_W83627EHF is not set -# CONFIG_SENSORS_HDAPS is not set -# CONFIG_SENSORS_APPLESMC is not set -# CONFIG_HWMON_DEBUG_CHIP is not set +# CONFIG_POWER_SUPPLY is not set +# CONFIG_HWMON is not set # # Multifunction device drivers @@ -1149,15 +974,11 @@ CONFIG_SOUND=y # Open Sound System # CONFIG_SOUND_PRIME=y -# CONFIG_OSS_OBSOLETE is not set # CONFIG_SOUND_TRIDENT is not set # CONFIG_SOUND_MSNDCLAS is not set # CONFIG_SOUND_MSNDPIN is not set # CONFIG_SOUND_OSS is not set - -# -# HID Devices -# +CONFIG_HID_SUPPORT=y CONFIG_HID=y # CONFIG_HID_DEBUG is not set @@ -1168,10 +989,7 @@ CONFIG_USB_HID=y # CONFIG_USB_HIDINPUT_POWERBOOK is not set # CONFIG_HID_FF is not set # CONFIG_USB_HIDDEV is not set - -# -# USB support -# +CONFIG_USB_SUPPORT=y CONFIG_USB_ARCH_HAS_HCD=y CONFIG_USB_ARCH_HAS_OHCI=y CONFIG_USB_ARCH_HAS_EHCI=y @@ -1185,6 +1003,7 @@ CONFIG_USB_DEVICEFS=y # CONFIG_USB_DEVICE_CLASS is not set # CONFIG_USB_DYNAMIC_MINORS is not set # CONFIG_USB_SUSPEND is not set +# CONFIG_USB_PERSIST is not set # CONFIG_USB_OTG is not set # @@ -1194,7 +1013,6 @@ CONFIG_USB_EHCI_HCD=y # CONFIG_USB_EHCI_SPLIT_ISO is not set # CONFIG_USB_EHCI_ROOT_HUB_TT is not set # CONFIG_USB_EHCI_TT_NEWSCHED is not set -# CONFIG_USB_EHCI_BIG_ENDIAN_MMIO is not set # CONFIG_USB_ISP116X_HCD is not set CONFIG_USB_OHCI_HCD=y # CONFIG_USB_OHCI_BIG_ENDIAN_DESC is not 
set @@ -1202,6 +1020,7 @@ CONFIG_USB_OHCI_HCD=y CONFIG_USB_OHCI_LITTLE_ENDIAN=y CONFIG_USB_UHCI_HCD=y # CONFIG_USB_SL811_HCD is not set +# CONFIG_USB_R8A66597_HCD is not set # # USB Device Class drivers @@ -1292,15 +1111,7 @@ CONFIG_USB_MON=y # # LED Triggers # - -# -# InfiniBand support -# # CONFIG_INFINIBAND is not set - -# -# EDAC - error detection and reporting (RAS) (EXPERIMENTAL) -# # CONFIG_EDAC is not set # @@ -1320,11 +1131,13 @@ CONFIG_USB_MON=y # # DMA Devices # +CONFIG_VIRTUALIZATION=y +# CONFIG_KVM is not set # -# Virtualization +# Userspace I/O # -# CONFIG_KVM is not set +# CONFIG_UIO is not set # # Firmware Drivers @@ -1332,6 +1145,7 @@ CONFIG_USB_MON=y # CONFIG_EDD is not set # CONFIG_DELL_RBU is not set # CONFIG_DCDBAS is not set +CONFIG_DMIID=y # # File systems @@ -1447,7 +1261,6 @@ CONFIG_SUNRPC=y # CONFIG_NCP_FS is not set # CONFIG_CODA_FS is not set # CONFIG_AFS_FS is not set -# CONFIG_9P_FS is not set # # Partition Types @@ -1524,8 +1337,9 @@ CONFIG_DEBUG_FS=y CONFIG_DEBUG_KERNEL=y # CONFIG_DEBUG_SHIRQ is not set CONFIG_DETECT_SOFTLOCKUP=y +# CONFIG_SCHED_DEBUG is not set # CONFIG_SCHEDSTATS is not set -# CONFIG_TIMER_STATS is not set +CONFIG_TIMER_STATS=y # CONFIG_DEBUG_SLAB is not set # CONFIG_DEBUG_RT_MUTEXES is not set # CONFIG_RT_MUTEX_TESTER is not set @@ -1533,6 +1347,7 @@ CONFIG_DETECT_SOFTLOCKUP=y # CONFIG_DEBUG_MUTEXES is not set # CONFIG_DEBUG_LOCK_ALLOC is not set # CONFIG_PROVE_LOCKING is not set +# CONFIG_LOCK_STAT is not set # CONFIG_DEBUG_SPINLOCK_SLEEP is not set # CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set # CONFIG_DEBUG_KOBJECT is not set @@ -1541,8 +1356,6 @@ CONFIG_DEBUG_BUGVERBOSE=y # CONFIG_DEBUG_VM is not set # CONFIG_DEBUG_LIST is not set # CONFIG_FRAME_POINTER is not set -CONFIG_UNWIND_INFO=y -CONFIG_STACK_UNWIND=y # CONFIG_FORCED_INLINING is not set # CONFIG_RCU_TORTURE_TEST is not set # CONFIG_LKDTM is not set @@ -1557,10 +1370,6 @@ CONFIG_DEBUG_STACKOVERFLOW=y # # CONFIG_KEYS is not set # CONFIG_SECURITY is not set - -# -# Cryptographic options -# # CONFIG_CRYPTO is not set # @@ -1571,6 +1380,7 @@ CONFIG_BITREVERSE=y # CONFIG_CRC16 is not set # CONFIG_CRC_ITU_T is not set CONFIG_CRC32=y +# CONFIG_CRC7 is not set # CONFIG_LIBCRC32C is not set CONFIG_ZLIB_INFLATE=y CONFIG_PLIST=y diff --git a/arch/x86_64/ia32/ia32_aout.c b/arch/x86_64/ia32/ia32_aout.c index fe83edb93c1..08781370256 100644 --- a/arch/x86_64/ia32/ia32_aout.c +++ b/arch/x86_64/ia32/ia32_aout.c @@ -404,7 +404,7 @@ beyond_if: set_brk(current->mm->start_brk, current->mm->brk); - retval = ia32_setup_arg_pages(bprm, IA32_STACK_TOP, EXSTACK_DEFAULT); + retval = setup_arg_pages(bprm, IA32_STACK_TOP, EXSTACK_DEFAULT); if (retval < 0) { /* Someone check-me: is this error path enough? 
*/ send_sig(SIGKILL, current, 0); diff --git a/arch/x86_64/ia32/ia32_binfmt.c b/arch/x86_64/ia32/ia32_binfmt.c index 185399baaf6..b70f3e7cf06 100644 --- a/arch/x86_64/ia32/ia32_binfmt.c +++ b/arch/x86_64/ia32/ia32_binfmt.c @@ -38,6 +38,7 @@ int sysctl_vsyscall32 = 1; +#undef ARCH_DLINFO #define ARCH_DLINFO do { \ if (sysctl_vsyscall32) { \ NEW_AUX_ENT(AT_SYSINFO, (u32)(u64)VSYSCALL32_VSYSCALL); \ @@ -232,9 +233,6 @@ do { \ #define load_elf_binary load_elf32_binary #define ELF_PLAT_INIT(r, load_addr) elf32_init(r) -#define setup_arg_pages(bprm, stack_top, exec_stack) \ - ia32_setup_arg_pages(bprm, stack_top, exec_stack) -int ia32_setup_arg_pages(struct linux_binprm *bprm, unsigned long stack_top, int executable_stack); #undef start_thread #define start_thread(regs,new_rip,new_rsp) do { \ @@ -286,61 +284,6 @@ static void elf32_init(struct pt_regs *regs) me->thread.es = __USER_DS; } -int ia32_setup_arg_pages(struct linux_binprm *bprm, unsigned long stack_top, - int executable_stack) -{ - unsigned long stack_base; - struct vm_area_struct *mpnt; - struct mm_struct *mm = current->mm; - int i, ret; - - stack_base = stack_top - MAX_ARG_PAGES * PAGE_SIZE; - mm->arg_start = bprm->p + stack_base; - - bprm->p += stack_base; - if (bprm->loader) - bprm->loader += stack_base; - bprm->exec += stack_base; - - mpnt = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); - if (!mpnt) - return -ENOMEM; - - down_write(&mm->mmap_sem); - { - mpnt->vm_mm = mm; - mpnt->vm_start = PAGE_MASK & (unsigned long) bprm->p; - mpnt->vm_end = stack_top; - if (executable_stack == EXSTACK_ENABLE_X) - mpnt->vm_flags = VM_STACK_FLAGS | VM_EXEC; - else if (executable_stack == EXSTACK_DISABLE_X) - mpnt->vm_flags = VM_STACK_FLAGS & ~VM_EXEC; - else - mpnt->vm_flags = VM_STACK_FLAGS; - mpnt->vm_page_prot = (mpnt->vm_flags & VM_EXEC) ? - PAGE_COPY_EXEC : PAGE_COPY; - if ((ret = insert_vm_struct(mm, mpnt))) { - up_write(&mm->mmap_sem); - kmem_cache_free(vm_area_cachep, mpnt); - return ret; - } - mm->stack_vm = mm->total_vm = vma_pages(mpnt); - } - - for (i = 0 ; i < MAX_ARG_PAGES ; i++) { - struct page *page = bprm->page[i]; - if (page) { - bprm->page[i] = NULL; - install_arg_page(mpnt, page, stack_base); - } - stack_base += PAGE_SIZE; - } - up_write(&mm->mmap_sem); - - return 0; -} -EXPORT_SYMBOL(ia32_setup_arg_pages); - #ifdef CONFIG_SYSCTL /* Register vsyscall32 into the ABI table */ #include <linux/sysctl.h> diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S index 47565c3345d..938278697e2 100644 --- a/arch/x86_64/ia32/ia32entry.S +++ b/arch/x86_64/ia32/ia32entry.S @@ -104,7 +104,7 @@ ENTRY(ia32_sysenter_target) pushq %rax CFI_ADJUST_CFA_OFFSET 8 cld - SAVE_ARGS 0,0,0 + SAVE_ARGS 0,0,1 /* no need to do an access_ok check here because rbp has been 32bit zero extended */ 1: movl (%rbp),%r9d @@ -294,7 +294,7 @@ ia32_badarg: */ ENTRY(ia32_syscall) - CFI_STARTPROC simple + CFI_STARTPROC32 simple CFI_SIGNAL_FRAME CFI_DEF_CFA rsp,SS+8-RIP /*CFI_REL_OFFSET ss,SS-RIP*/ @@ -330,6 +330,7 @@ ia32_sysret: ia32_tracesys: SAVE_REST + CLEAR_RREGS movq $-ENOSYS,RAX(%rsp) /* really needed? 
*/ movq %rsp,%rdi /* &pt_regs -> arg1 */ call syscall_trace_enter @@ -526,7 +527,7 @@ ia32_sys_call_table: .quad sys_init_module .quad sys_delete_module .quad quiet_ni_syscall /* 130 get_kernel_syms */ - .quad sys_quotactl + .quad sys32_quotactl .quad sys_getpgid .quad sys_fchdir .quad quiet_ni_syscall /* bdflush */ @@ -719,4 +720,5 @@ ia32_sys_call_table: .quad compat_sys_signalfd .quad compat_sys_timerfd .quad sys_eventfd + .quad sys32_fallocate ia32_syscall_end: diff --git a/arch/x86_64/ia32/sys_ia32.c b/arch/x86_64/ia32/sys_ia32.c index 99a78a3cce7..bee96d61443 100644 --- a/arch/x86_64/ia32/sys_ia32.c +++ b/arch/x86_64/ia32/sys_ia32.c @@ -879,3 +879,11 @@ asmlinkage long sys32_fadvise64(int fd, unsigned offset_lo, unsigned offset_hi, return sys_fadvise64_64(fd, ((u64)offset_hi << 32) | offset_lo, len, advice); } + +asmlinkage long sys32_fallocate(int fd, int mode, unsigned offset_lo, + unsigned offset_hi, unsigned len_lo, + unsigned len_hi) +{ + return sys_fallocate(fd, mode, ((u64)offset_hi << 32) | offset_lo, + ((u64)len_hi << 32) | len_lo); +} diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile index de1de8a2fd8..47f1dc30bf5 100644 --- a/arch/x86_64/kernel/Makefile +++ b/arch/x86_64/kernel/Makefile @@ -44,6 +44,7 @@ obj-$(CONFIG_PCI) += early-quirks.o obj-y += topology.o obj-y += intel_cacheinfo.o +obj-y += addon_cpuid_features.o obj-y += pcspeaker.o CFLAGS_vsyscall.o := $(PROFILING) -g0 @@ -55,6 +56,7 @@ cpuid-$(subst m,y,$(CONFIG_X86_CPUID)) += ../../i386/kernel/cpuid.o topology-y += ../../i386/kernel/topology.o microcode-$(subst m,y,$(CONFIG_MICROCODE)) += ../../i386/kernel/microcode.o intel_cacheinfo-y += ../../i386/kernel/cpu/intel_cacheinfo.o +addon_cpuid_features-y += ../../i386/kernel/cpu/addon_cpuid_features.o quirks-y += ../../i386/kernel/quirks.o i8237-y += ../../i386/kernel/i8237.o msr-$(subst m,y,$(CONFIG_X86_MSR)) += ../../i386/kernel/msr.o diff --git a/arch/x86_64/kernel/acpi/sleep.c b/arch/x86_64/kernel/acpi/sleep.c index 195b7034a14..4277f2b27e6 100644 --- a/arch/x86_64/kernel/acpi/sleep.c +++ b/arch/x86_64/kernel/acpi/sleep.c @@ -55,7 +55,7 @@ /* address in low memory of the wakeup routine. */ unsigned long acpi_wakeup_address = 0; -unsigned long acpi_video_flags; +unsigned long acpi_realmode_flags; extern char wakeup_start, wakeup_end; extern unsigned long acpi_copy_wakeup_routine(unsigned long); @@ -103,9 +103,11 @@ static int __init acpi_sleep_setup(char *str) { while ((str != NULL) && (*str != '\0')) { if (strncmp(str, "s3_bios", 7) == 0) - acpi_video_flags = 1; + acpi_realmode_flags |= 1; if (strncmp(str, "s3_mode", 7) == 0) - acpi_video_flags |= 2; + acpi_realmode_flags |= 2; + if (strncmp(str, "s3_beep", 7) == 0) + acpi_realmode_flags |= 4; str = strchr(str, ','); if (str != NULL) str += strspn(str, ", \t"); diff --git a/arch/x86_64/kernel/acpi/wakeup.S b/arch/x86_64/kernel/acpi/wakeup.S index 8550a6ffa27..13f1480cbec 100644 --- a/arch/x86_64/kernel/acpi/wakeup.S +++ b/arch/x86_64/kernel/acpi/wakeup.S @@ -16,6 +16,21 @@ # cs = 0x1234, eip = 0x05 # +#define BEEP \ + inb $97, %al; \ + outb %al, $0x80; \ + movb $3, %al; \ + outb %al, $97; \ + outb %al, $0x80; \ + movb $-74, %al; \ + outb %al, $67; \ + outb %al, $0x80; \ + movb $-119, %al; \ + outb %al, $66; \ + outb %al, $0x80; \ + movb $15, %al; \ + outb %al, $66; + ALIGN .align 16 @@ -33,6 +48,13 @@ wakeup_code: movw %cs, %ax movw %ax, %ds # Make ds:0 point to wakeup_start movw %ax, %ss + + # Data segment must be set up before we can see whether to beep. 
+ testl $4, realmode_flags - wakeup_code + jz 1f + BEEP +1: + # Private stack is needed for ASUS board mov $(wakeup_stack - wakeup_code), %sp @@ -48,7 +70,7 @@ wakeup_code: testl %eax, %eax jnz no_longmode - testl $1, video_flags - wakeup_code + testl $1, realmode_flags - wakeup_code jz 1f lcall $0xc000,$3 movw %cs, %ax @@ -56,7 +78,7 @@ wakeup_code: movw %ax, %ss 1: - testl $2, video_flags - wakeup_code + testl $2, realmode_flags - wakeup_code jz 1f mov video_mode - wakeup_code, %ax call mode_seta @@ -230,7 +252,7 @@ gdt_48a: real_magic: .quad 0 video_mode: .quad 0 -video_flags: .quad 0 +realmode_flags: .quad 0 .code16 bogus_real_magic: @@ -346,8 +368,8 @@ ENTRY(acpi_copy_wakeup_routine) movl saved_video_mode, %edx movl %edx, video_mode - wakeup_start (,%rdi) - movl acpi_video_flags, %edx - movl %edx, video_flags - wakeup_start (,%rdi) + movl acpi_realmode_flags, %edx + movl %edx, realmode_flags - wakeup_start (,%rdi) movq $0x12345678, real_magic - wakeup_start (,%rdi) movq $0x123456789abcdef0, %rdx movq %rdx, saved_magic diff --git a/arch/x86_64/kernel/aperture.c b/arch/x86_64/kernel/aperture.c index a3d450d6c15..8f681cae7bf 100644 --- a/arch/x86_64/kernel/aperture.c +++ b/arch/x86_64/kernel/aperture.c @@ -20,7 +20,7 @@ #include <linux/ioport.h> #include <asm/e820.h> #include <asm/io.h> -#include <asm/proto.h> +#include <asm/iommu.h> #include <asm/pci-direct.h> #include <asm/dma.h> #include <asm/k8.h> @@ -214,7 +214,7 @@ void __init iommu_hole_init(void) if (iommu_aperture_disabled || !fix_aperture || !early_pci_allowed()) return; - printk("Checking aperture...\n"); + printk(KERN_INFO "Checking aperture...\n"); fix = 0; for (num = 24; num < 32; num++) { diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c index 1b0e07bb872..900ff38d68d 100644 --- a/arch/x86_64/kernel/apic.c +++ b/arch/x86_64/kernel/apic.c @@ -92,8 +92,9 @@ unsigned int safe_apic_wait_icr_idle(void) void enable_NMI_through_LVT0 (void * dummy) { unsigned int v; - - v = APIC_DM_NMI; /* unmask and set to NMI */ + + /* unmask and set to NMI */ + v = APIC_DM_NMI; apic_write(APIC_LVT0, v); } @@ -120,7 +121,7 @@ void ack_bad_irq(unsigned int irq) * holds up an irq slot - in excessive cases (when multiple * unexpected vectors occur) that might lock up the APIC * completely. - * But don't ack when the APIC is disabled. -AK + * But don't ack when the APIC is disabled. -AK */ if (!disable_apic) ack_APIC_irq(); @@ -616,7 +617,7 @@ early_param("apic", apic_set_verbosity); * Detect and enable local APICs on non-SMP boards. * Original code written by Keir Fraser. * On AMD64 we trust the BIOS - if it says no APIC it is likely - * not correctly set up (usually the APIC timer won't work etc.) + * not correctly set up (usually the APIC timer won't work etc.) 
*/ static int __init detect_init_APIC (void) @@ -789,13 +790,13 @@ static void setup_APIC_timer(unsigned int clocks) local_irq_save(flags); /* wait for irq slice */ - if (hpet_address && hpet_use_timer) { - int trigger = hpet_readl(HPET_T0_CMP); - while (hpet_readl(HPET_COUNTER) >= trigger) - /* do nothing */ ; - while (hpet_readl(HPET_COUNTER) < trigger) - /* do nothing */ ; - } else { + if (hpet_address && hpet_use_timer) { + int trigger = hpet_readl(HPET_T0_CMP); + while (hpet_readl(HPET_COUNTER) >= trigger) + /* do nothing */ ; + while (hpet_readl(HPET_COUNTER) < trigger) + /* do nothing */ ; + } else { int c1, c2; outb_p(0x00, 0x43); c2 = inb_p(0x40); @@ -881,10 +882,10 @@ static unsigned int calibration_result; void __init setup_boot_APIC_clock (void) { - if (disable_apic_timer) { - printk(KERN_INFO "Disabling APIC timer\n"); - return; - } + if (disable_apic_timer) { + printk(KERN_INFO "Disabling APIC timer\n"); + return; + } printk(KERN_INFO "Using local APIC timer interrupts.\n"); using_apic_timer = 1; @@ -990,8 +991,8 @@ int setup_profiling_timer(unsigned int multiplier) return -EINVAL; } -void setup_APIC_extened_lvt(unsigned char lvt_off, unsigned char vector, - unsigned char msg_type, unsigned char mask) +void setup_APIC_extended_lvt(unsigned char lvt_off, unsigned char vector, + unsigned char msg_type, unsigned char mask) { unsigned long reg = (lvt_off << 4) + K8_APIC_EXT_LVT_BASE; unsigned int v = (mask << 16) | (msg_type << 8) | vector; @@ -1128,20 +1129,6 @@ asmlinkage void smp_spurious_interrupt(void) if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f))) ack_APIC_irq(); -#if 0 - static unsigned long last_warning; - static unsigned long skipped; - - /* see sw-dev-man vol 3, chapter 7.4.13.5 */ - if (time_before(last_warning+30*HZ,jiffies)) { - printk(KERN_INFO "spurious APIC interrupt on CPU#%d, %ld skipped.\n", - smp_processor_id(), skipped); - last_warning = jiffies; - skipped = 0; - } else { - skipped++; - } -#endif irq_exit(); } @@ -1173,11 +1160,11 @@ asmlinkage void smp_error_interrupt(void) 7: Illegal register address */ printk (KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n", - smp_processor_id(), v , v1); + smp_processor_id(), v , v1); irq_exit(); } -int disable_apic; +int disable_apic; /* * This initializes the IO-APIC and APIC hardware if this is @@ -1185,11 +1172,11 @@ int disable_apic; */ int __init APIC_init_uniprocessor (void) { - if (disable_apic) { + if (disable_apic) { printk(KERN_INFO "Apic disabled\n"); - return -1; + return -1; } - if (!cpu_has_apic) { + if (!cpu_has_apic) { disable_apic = 1; printk(KERN_INFO "Apic disabled by BIOS\n"); return -1; @@ -1211,8 +1198,8 @@ int __init APIC_init_uniprocessor (void) return 0; } -static __init int setup_disableapic(char *str) -{ +static __init int setup_disableapic(char *str) +{ disable_apic = 1; clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); return 0; @@ -1220,10 +1207,10 @@ static __init int setup_disableapic(char *str) early_param("disableapic", setup_disableapic); /* same as disableapic, for compatibility */ -static __init int setup_nolapic(char *str) -{ +static __init int setup_nolapic(char *str) +{ return setup_disableapic(str); -} +} early_param("nolapic", setup_nolapic); static int __init parse_lapic_timer_c2_ok(char *arg) @@ -1233,13 +1220,13 @@ static int __init parse_lapic_timer_c2_ok(char *arg) } early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok); -static __init int setup_noapictimer(char *str) -{ +static __init int setup_noapictimer(char *str) +{ if (str[0] != ' ' && str[0] != 0) return 0; 
disable_apic_timer = 1; return 1; -} +} static __init int setup_apicmaintimer(char *str) { @@ -1264,5 +1251,5 @@ static __init int setup_apicpmtimer(char *s) } __setup("apicpmtimer", setup_apicpmtimer); -__setup("noapictimer", setup_noapictimer); +__setup("noapictimer", setup_noapictimer); diff --git a/arch/x86_64/kernel/cpufreq/Kconfig b/arch/x86_64/kernel/cpufreq/Kconfig index c0749d2479f..a3fd51926cb 100644 --- a/arch/x86_64/kernel/cpufreq/Kconfig +++ b/arch/x86_64/kernel/cpufreq/Kconfig @@ -48,10 +48,6 @@ config X86_SPEEDSTEP_CENTRINO If in doubt, say N. -config X86_SPEEDSTEP_CENTRINO_ACPI - bool - depends on X86_SPEEDSTEP_CENTRINO - config X86_ACPI_CPUFREQ tristate "ACPI Processor P-States driver" select CPU_FREQ_TABLE @@ -73,7 +69,7 @@ comment "shared options" config X86_ACPI_CPUFREQ_PROC_INTF bool "/proc/acpi/processor/../performance interface (deprecated)" depends on PROC_FS - depends on X86_ACPI_CPUFREQ || X86_SPEEDSTEP_CENTRINO_ACPI || X86_POWERNOW_K8_ACPI + depends on X86_ACPI_CPUFREQ || X86_POWERNOW_K8_ACPI help This enables the deprecated /proc/acpi/processor/../performance interface. While it is helpful for debugging, the generic, diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c index 13c6c37610e..0f4d5e209e9 100644 --- a/arch/x86_64/kernel/e820.c +++ b/arch/x86_64/kernel/e820.c @@ -194,37 +194,6 @@ unsigned long __init e820_end_of_ram(void) } /* - * Find the hole size in the range. - */ -unsigned long __init e820_hole_size(unsigned long start, unsigned long end) -{ - unsigned long ram = 0; - int i; - - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - unsigned long last, addr; - - if (ei->type != E820_RAM || - ei->addr+ei->size <= start || - ei->addr >= end) - continue; - - addr = round_up(ei->addr, PAGE_SIZE); - if (addr < start) - addr = start; - - last = round_down(ei->addr + ei->size, PAGE_SIZE); - if (last >= end) - last = end; - - if (last > addr) - ram += last - addr; - } - return ((end - start) - ram); -} - -/* * Mark e820 reserved areas as busy for the resource manager. */ void __init e820_reserve_resources(void) @@ -289,47 +258,61 @@ void __init e820_mark_nosave_regions(void) } } -/* Walk the e820 map and register active regions within a node */ -void __init -e820_register_active_regions(int nid, unsigned long start_pfn, - unsigned long end_pfn) +/* + * Finds an active region in the address range from start_pfn to end_pfn and + * returns its range in ei_startpfn and ei_endpfn for the e820 entry. 
+ */ +static int __init e820_find_active_region(const struct e820entry *ei, + unsigned long start_pfn, + unsigned long end_pfn, + unsigned long *ei_startpfn, + unsigned long *ei_endpfn) { - int i; - unsigned long ei_startpfn, ei_endpfn; - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - ei_startpfn = round_up(ei->addr, PAGE_SIZE) >> PAGE_SHIFT; - ei_endpfn = round_down(ei->addr + ei->size, PAGE_SIZE) - >> PAGE_SHIFT; + *ei_startpfn = round_up(ei->addr, PAGE_SIZE) >> PAGE_SHIFT; + *ei_endpfn = round_down(ei->addr + ei->size, PAGE_SIZE) >> PAGE_SHIFT; - /* Skip map entries smaller than a page */ - if (ei_startpfn >= ei_endpfn) - continue; + /* Skip map entries smaller than a page */ + if (*ei_startpfn >= *ei_endpfn) + return 0; - /* Check if end_pfn_map should be updated */ - if (ei->type != E820_RAM && ei_endpfn > end_pfn_map) - end_pfn_map = ei_endpfn; + /* Check if end_pfn_map should be updated */ + if (ei->type != E820_RAM && *ei_endpfn > end_pfn_map) + end_pfn_map = *ei_endpfn; - /* Skip if map is outside the node */ - if (ei->type != E820_RAM || - ei_endpfn <= start_pfn || - ei_startpfn >= end_pfn) - continue; + /* Skip if map is outside the node */ + if (ei->type != E820_RAM || *ei_endpfn <= start_pfn || + *ei_startpfn >= end_pfn) + return 0; - /* Check for overlaps */ - if (ei_startpfn < start_pfn) - ei_startpfn = start_pfn; - if (ei_endpfn > end_pfn) - ei_endpfn = end_pfn; + /* Check for overlaps */ + if (*ei_startpfn < start_pfn) + *ei_startpfn = start_pfn; + if (*ei_endpfn > end_pfn) + *ei_endpfn = end_pfn; - /* Obey end_user_pfn to save on memmap */ - if (ei_startpfn >= end_user_pfn) - continue; - if (ei_endpfn > end_user_pfn) - ei_endpfn = end_user_pfn; + /* Obey end_user_pfn to save on memmap */ + if (*ei_startpfn >= end_user_pfn) + return 0; + if (*ei_endpfn > end_user_pfn) + *ei_endpfn = end_user_pfn; - add_active_range(nid, ei_startpfn, ei_endpfn); - } + return 1; +} + +/* Walk the e820 map and register active regions within a node */ +void __init +e820_register_active_regions(int nid, unsigned long start_pfn, + unsigned long end_pfn) +{ + unsigned long ei_startpfn; + unsigned long ei_endpfn; + int i; + + for (i = 0; i < e820.nr_map; i++) + if (e820_find_active_region(&e820.map[i], + start_pfn, end_pfn, + &ei_startpfn, &ei_endpfn)) + add_active_range(nid, ei_startpfn, ei_endpfn); } /* @@ -350,12 +333,35 @@ void __init add_memory_region(unsigned long start, unsigned long size, int type) e820.nr_map++; } +/* + * Find the hole size (in bytes) in the memory range. 
+ * @start: starting address of the memory range to scan + * @end: ending address of the memory range to scan + */ +unsigned long __init e820_hole_size(unsigned long start, unsigned long end) +{ + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long end_pfn = end >> PAGE_SHIFT; + unsigned long ei_startpfn; + unsigned long ei_endpfn; + unsigned long ram = 0; + int i; + + for (i = 0; i < e820.nr_map; i++) { + if (e820_find_active_region(&e820.map[i], + start_pfn, end_pfn, + &ei_startpfn, &ei_endpfn)) + ram += ei_endpfn - ei_startpfn; + } + return end - start - (ram << PAGE_SHIFT); +} + void __init e820_print_map(char *who) { int i; for (i = 0; i < e820.nr_map; i++) { - printk(" %s: %016Lx - %016Lx ", who, + printk(KERN_INFO " %s: %016Lx - %016Lx ", who, (unsigned long long) e820.map[i].addr, (unsigned long long) (e820.map[i].addr + e820.map[i].size)); switch (e820.map[i].type) { diff --git a/arch/x86_64/kernel/early-quirks.c b/arch/x86_64/kernel/early-quirks.c index 990d9c218a5..13aa4fd728f 100644 --- a/arch/x86_64/kernel/early-quirks.c +++ b/arch/x86_64/kernel/early-quirks.c @@ -14,6 +14,7 @@ #include <linux/pci_ids.h> #include <asm/pci-direct.h> #include <asm/proto.h> +#include <asm/iommu.h> #include <asm/dma.h> static void __init via_bugs(void) diff --git a/arch/x86_64/kernel/early_printk.c b/arch/x86_64/kernel/early_printk.c index 296d2b0c5d8..fd9aff3f389 100644 --- a/arch/x86_64/kernel/early_printk.c +++ b/arch/x86_64/kernel/early_printk.c @@ -6,6 +6,7 @@ #include <asm/io.h> #include <asm/processor.h> #include <asm/fcntl.h> +#include <xen/hvc-console.h> /* Simple VGA output */ @@ -242,6 +243,10 @@ static int __init setup_early_printk(char *buf) simnow_init(buf + 6); early_console = &simnow_console; keep_early = 1; +#ifdef CONFIG_HVC_XEN + } else if (!strncmp(buf, "xen", 3)) { + early_console = &xenboot_console; +#endif } if (keep_early) diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S index a67f87bf401..830cfc6ee8c 100644 --- a/arch/x86_64/kernel/entry.S +++ b/arch/x86_64/kernel/entry.S @@ -282,7 +282,7 @@ sysret_careful: sysret_signal: TRACE_IRQS_ON sti - testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx + testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx jz 1f /* Really a signal */ @@ -375,7 +375,7 @@ int_very_careful: jmp int_restore_rest int_signal: - testl $(_TIF_NOTIFY_RESUME|_TIF_SIGPENDING|_TIF_SINGLESTEP),%edx + testl $(_TIF_NOTIFY_RESUME|_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx jz 1f movq %rsp,%rdi # &ptregs -> arg1 xorl %esi,%esi # oldset -> arg2 @@ -599,7 +599,7 @@ retint_careful: jmp retint_check retint_signal: - testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx + testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx jz retint_swapgs TRACE_IRQS_ON sti diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S index 1fab487dee8..e89abcdbdde 100644 --- a/arch/x86_64/kernel/head.S +++ b/arch/x86_64/kernel/head.S @@ -25,7 +25,7 @@ */ .text - .section .bootstrap.text + .section .text.head .code64 .globl startup_64 startup_64: @@ -73,7 +73,11 @@ startup_64: addq %rbp, init_level4_pgt + (511*8)(%rip) addq %rbp, level3_ident_pgt + 0(%rip) + addq %rbp, level3_kernel_pgt + (510*8)(%rip) + addq %rbp, level3_kernel_pgt + (511*8)(%rip) + + addq %rbp, level2_fixmap_pgt + (506*8)(%rip) /* Add an Identity mapping if I am above 1G */ leaq _text(%rip), %rdi @@ -239,10 +243,16 @@ ENTRY(secondary_startup_64) lretq /* SMP bootup changes these two */ +#ifndef 
CONFIG_HOTPLUG_CPU + .pushsection .init.data +#endif .align 8 .globl initial_code initial_code: .quad x86_64_start_kernel +#ifndef CONFIG_HOTPLUG_CPU + .popsection +#endif .globl init_rsp init_rsp: .quad init_thread_union+THREAD_SIZE-8 @@ -314,7 +324,16 @@ NEXT_PAGE(level3_kernel_pgt) .fill 510,8,0 /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */ .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE - .fill 1,8,0 + .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE + +NEXT_PAGE(level2_fixmap_pgt) + .fill 506,8,0 + .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE + /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */ + .fill 5,8,0 + +NEXT_PAGE(level1_fixmap_pgt) + .fill 512,8,0 NEXT_PAGE(level2_ident_pgt) /* Since I easily can, map the first 1G. diff --git a/arch/x86_64/kernel/hpet.c b/arch/x86_64/kernel/hpet.c index b8286968662..e2d1b912e15 100644 --- a/arch/x86_64/kernel/hpet.c +++ b/arch/x86_64/kernel/hpet.c @@ -133,7 +133,7 @@ struct clocksource clocksource_hpet = { .vread = vread_hpet, }; -int hpet_arch_init(void) +int __init hpet_arch_init(void) { unsigned int id; u64 tmp; @@ -190,7 +190,7 @@ int hpet_reenable(void) */ #define TICK_COUNT 100000000 -#define TICK_MIN 5000 +#define SMI_THRESHOLD 50000 #define MAX_TRIES 5 /* @@ -205,7 +205,7 @@ static void __init read_hpet_tsc(int *hpet, int *tsc) tsc1 = get_cycles_sync(); hpet1 = hpet_readl(HPET_COUNTER); tsc2 = get_cycles_sync(); - if (tsc2 - tsc1 > TICK_MIN) + if ((tsc2 - tsc1) < SMI_THRESHOLD) break; } *hpet = hpet1; @@ -439,7 +439,7 @@ int hpet_rtc_dropped_irq(void) return 1; } -irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs) +irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id) { struct rtc_time curr_time; unsigned long rtc_int_flag = 0; diff --git a/arch/x86_64/kernel/i8259.c b/arch/x86_64/kernel/i8259.c index 4b326655b20..948cae64609 100644 --- a/arch/x86_64/kernel/i8259.c +++ b/arch/x86_64/kernel/i8259.c @@ -444,24 +444,6 @@ void __init init_ISA_irqs (void) } } -void apic_timer_interrupt(void); -void spurious_interrupt(void); -void error_interrupt(void); -void reschedule_interrupt(void); -void call_function_interrupt(void); -void irq_move_cleanup_interrupt(void); -void invalidate_interrupt0(void); -void invalidate_interrupt1(void); -void invalidate_interrupt2(void); -void invalidate_interrupt3(void); -void invalidate_interrupt4(void); -void invalidate_interrupt5(void); -void invalidate_interrupt6(void); -void invalidate_interrupt7(void); -void thermal_interrupt(void); -void threshold_interrupt(void); -void i8254_timer_resume(void); - static void setup_timer_hardware(void) { outb_p(0x34,0x43); /* binary, mode 2, LSB/MSB, ch 0 */ diff --git a/arch/x86_64/kernel/init_task.c b/arch/x86_64/kernel/init_task.c index 3dc5854ba21..4ff33d4f855 100644 --- a/arch/x86_64/kernel/init_task.c +++ b/arch/x86_64/kernel/init_task.c @@ -44,7 +44,7 @@ EXPORT_SYMBOL(init_task); * section. Since TSS's are completely CPU-local, we want them * on exact cacheline boundaries, to eliminate cacheline ping-pong. */ -DEFINE_PER_CPU(struct tss_struct, init_tss) ____cacheline_internodealigned_in_smp = INIT_TSS; +DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS; /* Copies of the original ist values from the tss are only accessed during * debugging, no special alignment required. 
diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c index 1c6c6f72457..050141c0602 100644 --- a/arch/x86_64/kernel/io_apic.c +++ b/arch/x86_64/kernel/io_apic.c @@ -152,6 +152,32 @@ static inline void io_apic_modify(unsigned int apic, unsigned int value) writel(value, &io_apic->data); } +static int io_apic_level_ack_pending(unsigned int irq) +{ + struct irq_pin_list *entry; + unsigned long flags; + int pending = 0; + + spin_lock_irqsave(&ioapic_lock, flags); + entry = irq_2_pin + irq; + for (;;) { + unsigned int reg; + int pin; + + pin = entry->pin; + if (pin == -1) + break; + reg = io_apic_read(entry->apic, 0x10 + pin*2); + /* Is the remote IRR bit set? */ + pending |= (reg >> 14) & 1; + if (!entry->next) + break; + entry = irq_2_pin + entry->next; + } + spin_unlock_irqrestore(&ioapic_lock, flags); + return pending; +} + /* * Synchronize the IO-APIC and the CPU by doing * a dummy read from the IO-APIC @@ -1418,9 +1444,37 @@ static void ack_apic_level(unsigned int irq) ack_APIC_irq(); /* Now we can move and renable the irq */ - move_masked_irq(irq); - if (unlikely(do_unmask_irq)) + if (unlikely(do_unmask_irq)) { + /* Only migrate the irq if the ack has been received. + * + * On rare occasions the broadcast level triggered ack gets + * delayed going to ioapics, and if we reprogram the + * vector while Remote IRR is still set the irq will never + * fire again. + * + * To prevent this scenario we read the Remote IRR bit + * of the ioapic. This has two effects. + * - On any sane system the read of the ioapic will + * flush writes (and acks) going to the ioapic from + * this cpu. + * - We get to see if the ACK has actually been delivered. + * + * Based on failed experiments of reprogramming the + * ioapic entry from outside of irq context starting + * with masking the ioapic entry and then polling until + * Remote IRR was clear before reprogramming the + * ioapic I don't trust the Remote IRR bit to be + * completey accurate. + * + * However there appears to be no other way to plug + * this race, so if the Remote IRR bit is not + * accurate and is causing problems then it is a hardware bug + * and you can go talk to the chipset vendor about it. 
+ */ + if (!io_apic_level_ack_pending(irq)) + move_masked_irq(irq); unmask_IO_APIC_irq(irq); + } } static struct irq_chip ioapic_chip __read_mostly = { diff --git a/arch/x86_64/kernel/kprobes.c b/arch/x86_64/kernel/kprobes.c index d4a0d0ac993..a30e004682e 100644 --- a/arch/x86_64/kernel/kprobes.c +++ b/arch/x86_64/kernel/kprobes.c @@ -39,9 +39,9 @@ #include <linux/module.h> #include <linux/kdebug.h> -#include <asm/cacheflush.h> #include <asm/pgtable.h> #include <asm/uaccess.h> +#include <asm/alternative.h> void jprobe_return_end(void); static void __kprobes arch_copy_kprobe(struct kprobe *p); @@ -209,16 +209,12 @@ static void __kprobes arch_copy_kprobe(struct kprobe *p) void __kprobes arch_arm_kprobe(struct kprobe *p) { - *p->addr = BREAKPOINT_INSTRUCTION; - flush_icache_range((unsigned long) p->addr, - (unsigned long) p->addr + sizeof(kprobe_opcode_t)); + text_poke(p->addr, ((unsigned char []){BREAKPOINT_INSTRUCTION}), 1); } void __kprobes arch_disarm_kprobe(struct kprobe *p) { - *p->addr = p->opcode; - flush_icache_range((unsigned long) p->addr, - (unsigned long) p->addr + sizeof(kprobe_opcode_t)); + text_poke(p->addr, &p->opcode, 1); } void __kprobes arch_remove_kprobe(struct kprobe *p) diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c index aa1d1599179..a66d607f5b9 100644 --- a/arch/x86_64/kernel/mce.c +++ b/arch/x86_64/kernel/mce.c @@ -18,6 +18,8 @@ #include <linux/capability.h> #include <linux/cpu.h> #include <linux/percpu.h> +#include <linux/poll.h> +#include <linux/thread_info.h> #include <linux/ctype.h> #include <linux/kmod.h> #include <linux/kdebug.h> @@ -26,6 +28,7 @@ #include <asm/mce.h> #include <asm/uaccess.h> #include <asm/smp.h> +#include <asm/idle.h> #define MISC_MCELOG_MINOR 227 #define NR_BANKS 6 @@ -34,13 +37,17 @@ atomic_t mce_entry; static int mce_dont_init; -/* 0: always panic, 1: panic if deadlock possible, 2: try to avoid panic, - 3: never panic or exit (for testing only) */ +/* + * Tolerant levels: + * 0: always panic on uncorrected errors, log corrected errors + * 1: panic or SIGBUS on uncorrected errors, log corrected errors + * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors + * 3: never panic or SIGBUS, log all errors (for testing only) + */ static int tolerant = 1; static int banks; static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL }; -static unsigned long console_logged; -static int notify_user; +static unsigned long notify_user; static int rip_msr; static int mce_bootlog = 1; static atomic_t mce_events; @@ -48,6 +55,8 @@ static atomic_t mce_events; static char trigger[128]; static char *trigger_argv[2] = { trigger, NULL }; +static DECLARE_WAIT_QUEUE_HEAD(mce_wait); + /* * Lockless MCE logging infrastructure. * This avoids deadlocks on printk locks without having to break locks. 
Also @@ -94,8 +103,7 @@ void mce_log(struct mce *mce) mcelog.entry[entry].finished = 1; wmb(); - if (!test_and_set_bit(0, &console_logged)) - notify_user = 1; + set_bit(0, &notify_user); } static void print_mce(struct mce *m) @@ -128,6 +136,7 @@ static void print_mce(struct mce *m) static void mce_panic(char *msg, struct mce *backup, unsigned long start) { int i; + oops_begin(); for (i = 0; i < MCE_LOG_LEN; i++) { unsigned long tsc = mcelog.entry[i].tsc; @@ -139,10 +148,7 @@ static void mce_panic(char *msg, struct mce *backup, unsigned long start) } if (backup) print_mce(backup); - if (tolerant >= 3) - printk("Fake panic: %s\n", msg); - else - panic(msg); + panic(msg); } static int mce_available(struct cpuinfo_x86 *c) @@ -167,17 +173,6 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) } } -static void do_mce_trigger(void) -{ - static atomic_t mce_logged; - int events = atomic_read(&mce_events); - if (events != atomic_read(&mce_logged) && trigger[0]) { - /* Small race window, but should be harmless. */ - atomic_set(&mce_logged, events); - call_usermodehelper(trigger, trigger_argv, NULL, -1); - } -} - /* * The actual machine check handler */ @@ -185,11 +180,19 @@ static void do_mce_trigger(void) void do_machine_check(struct pt_regs * regs, long error_code) { struct mce m, panicm; - int nowayout = (tolerant < 1); - int kill_it = 0; u64 mcestart = 0; int i; int panicm_found = 0; + /* + * If no_way_out gets set, there is no safe way to recover from this + * MCE. If tolerant is cranked up, we'll try anyway. + */ + int no_way_out = 0; + /* + * If kill_it gets set, there might be a way to recover from this + * error. + */ + int kill_it = 0; atomic_inc(&mce_entry); @@ -201,8 +204,9 @@ void do_machine_check(struct pt_regs * regs, long error_code) memset(&m, 0, sizeof(struct mce)); m.cpu = smp_processor_id(); rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); + /* if the restart IP is not valid, we're done for */ if (!(m.mcgstatus & MCG_STATUS_RIPV)) - kill_it = 1; + no_way_out = 1; rdtscll(mcestart); barrier(); @@ -221,10 +225,18 @@ void do_machine_check(struct pt_regs * regs, long error_code) continue; if (m.status & MCI_STATUS_EN) { - /* In theory _OVER could be a nowayout too, but - assume any overflowed errors were no fatal. */ - nowayout |= !!(m.status & MCI_STATUS_PCC); - kill_it |= !!(m.status & MCI_STATUS_UC); + /* if PCC was set, there's no way out */ + no_way_out |= !!(m.status & MCI_STATUS_PCC); + /* + * If this error was uncorrectable and there was + * an overflow, we're in trouble. If no overflow, + * we might get away with just killing a task. + */ + if (m.status & MCI_STATUS_UC) { + if (tolerant < 1 || m.status & MCI_STATUS_OVER) + no_way_out = 1; + kill_it = 1; + } } if (m.status & MCI_STATUS_MISCV) @@ -235,7 +247,6 @@ void do_machine_check(struct pt_regs * regs, long error_code) mce_get_rip(&m, regs); if (error_code >= 0) rdtscll(m.tsc); - wrmsrl(MSR_IA32_MC0_STATUS + i*4, 0); if (error_code != -2) mce_log(&m); @@ -251,45 +262,59 @@ void do_machine_check(struct pt_regs * regs, long error_code) } /* Never do anything final in the polling timer */ - if (!regs) { - /* Normal interrupt context here. Call trigger for any new - events. */ - do_mce_trigger(); + if (!regs) goto out; - } /* If we didn't find an uncorrectable error, pick the last one (shouldn't happen, just being safe). */ if (!panicm_found) panicm = m; - if (nowayout) + + /* + * If we have decided that we just CAN'T continue, and the user + * has not set tolerant to an insane level, give up and die. 
+ */ + if (no_way_out && tolerant < 3) mce_panic("Machine check", &panicm, mcestart); - if (kill_it) { + + /* + * If the error seems to be unrecoverable, something should be + * done. Try to kill as little as possible. If we can kill just + * one task, do that. If the user has set the tolerance very + * high, don't try to do anything at all. + */ + if (kill_it && tolerant < 3) { int user_space = 0; - if (m.mcgstatus & MCG_STATUS_RIPV) + /* + * If the EIPV bit is set, it means the saved IP is the + * instruction which caused the MCE. + */ + if (m.mcgstatus & MCG_STATUS_EIPV) user_space = panicm.rip && (panicm.cs & 3); - - /* When the machine was in user space and the CPU didn't get - confused it's normally not necessary to panic, unless you - are paranoid (tolerant == 0) - - RED-PEN could be more tolerant for MCEs in idle, - but most likely they occur at boot anyways, where - it is best to just halt the machine. */ - if ((!user_space && (panic_on_oops || tolerant < 2)) || - (unsigned)current->pid <= 1) - mce_panic("Uncorrected machine check", &panicm, mcestart); - - /* do_exit takes an awful lot of locks and has as - slight risk of deadlocking. If you don't want that - don't set tolerant >= 2 */ - if (tolerant < 3) + + /* + * If we know that the error was in user space, send a + * SIGBUS. Otherwise, panic if tolerance is low. + * + * do_exit() takes an awful lot of locks and has a slight + * risk of deadlocking. + */ + if (user_space) { do_exit(SIGBUS); + } else if (panic_on_oops || tolerant < 2) { + mce_panic("Uncorrected machine check", + &panicm, mcestart); + } } + /* notify userspace ASAP */ + set_thread_flag(TIF_MCE_NOTIFY); + out: - /* Last thing done in the machine check exception to clear state. */ + /* the last thing we do is clear state */ + for (i = 0; i < banks; i++) + wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); wrmsrl(MSR_IA32_MCG_STATUS, 0); out2: atomic_dec(&mce_entry); @@ -344,37 +369,69 @@ static void mcheck_timer(struct work_struct *work) on_each_cpu(mcheck_check_cpu, NULL, 1, 1); /* - * It's ok to read stale data here for notify_user and - * console_logged as we'll simply get the updated versions - * on the next mcheck_timer execution and atomic operations - * on console_logged act as synchronization for notify_user - * writes. + * Alert userspace if needed. If we logged an MCE, reduce the + * polling interval, otherwise increase the polling interval. */ - if (notify_user && console_logged) { + if (mce_notify_user()) { + next_interval = max(next_interval/2, HZ/100); + } else { + next_interval = min(next_interval*2, + (int)round_jiffies_relative(check_interval*HZ)); + } + + schedule_delayed_work(&mcheck_work, next_interval); +} + +/* + * This is only called from process context. This is where we do + * anything we need to alert userspace about new MCEs. This is called + * directly from the poller and also from entry.S and idle, thanks to + * TIF_MCE_NOTIFY. 
+ */ +int mce_notify_user(void) +{ + clear_thread_flag(TIF_MCE_NOTIFY); + if (test_and_clear_bit(0, &notify_user)) { static unsigned long last_print; unsigned long now = jiffies; - /* if we logged an MCE, reduce the polling interval */ - next_interval = max(next_interval/2, HZ/100); - notify_user = 0; - clear_bit(0, &console_logged); + wake_up_interruptible(&mce_wait); + if (trigger[0]) + call_usermodehelper(trigger, trigger_argv, NULL, + UMH_NO_WAIT); + if (time_after_eq(now, last_print + (check_interval*HZ))) { last_print = now; printk(KERN_INFO "Machine check events logged\n"); } - } else { - next_interval = min(next_interval*2, check_interval*HZ); + + return 1; } + return 0; +} - schedule_delayed_work(&mcheck_work, next_interval); +/* see if the idle task needs to notify userspace */ +static int +mce_idle_callback(struct notifier_block *nfb, unsigned long action, void *junk) +{ + /* IDLE_END should be safe - interrupts are back on */ + if (action == IDLE_END && test_thread_flag(TIF_MCE_NOTIFY)) + mce_notify_user(); + + return NOTIFY_OK; } +static struct notifier_block mce_idle_notifier = { + .notifier_call = mce_idle_callback, +}; static __init int periodic_mcheck_init(void) { next_interval = check_interval * HZ; if (next_interval) - schedule_delayed_work(&mcheck_work, next_interval); + schedule_delayed_work(&mcheck_work, + round_jiffies_relative(next_interval)); + idle_notifier_register(&mce_idle_notifier); return 0; } __initcall(periodic_mcheck_init); @@ -465,6 +522,40 @@ void __cpuinit mcheck_init(struct cpuinfo_x86 *c) * Character device to read and clear the MCE log. */ +static DEFINE_SPINLOCK(mce_state_lock); +static int open_count; /* #times opened */ +static int open_exclu; /* already open exclusive? */ + +static int mce_open(struct inode *inode, struct file *file) +{ + spin_lock(&mce_state_lock); + + if (open_exclu || (open_count && (file->f_flags & O_EXCL))) { + spin_unlock(&mce_state_lock); + return -EBUSY; + } + + if (file->f_flags & O_EXCL) + open_exclu = 1; + open_count++; + + spin_unlock(&mce_state_lock); + + return nonseekable_open(inode, file); +} + +static int mce_release(struct inode *inode, struct file *file) +{ + spin_lock(&mce_state_lock); + + open_count--; + open_exclu = 0; + + spin_unlock(&mce_state_lock); + + return 0; +} + static void collect_tscs(void *data) { unsigned long *cpu_tsc = (unsigned long *)data; @@ -532,6 +623,14 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff return err ? -EFAULT : buf - ubuf; } +static unsigned int mce_poll(struct file *file, poll_table *wait) +{ + poll_wait(file, &mce_wait, wait); + if (rcu_dereference(mcelog.next)) + return POLLIN | POLLRDNORM; + return 0; +} + static int mce_ioctl(struct inode *i, struct file *f,unsigned int cmd, unsigned long arg) { int __user *p = (int __user *)arg; @@ -555,7 +654,10 @@ static int mce_ioctl(struct inode *i, struct file *f,unsigned int cmd, unsigned } static const struct file_operations mce_chrdev_ops = { + .open = mce_open, + .release = mce_release, .read = mce_read, + .poll = mce_poll, .ioctl = mce_ioctl, }; @@ -565,6 +667,20 @@ static struct miscdevice mce_log_device = { &mce_chrdev_ops, }; +static unsigned long old_cr4 __initdata; + +void __init stop_mce(void) +{ + old_cr4 = read_cr4(); + clear_in_cr4(X86_CR4_MCE); +} + +void __init restart_mce(void) +{ + if (old_cr4 & X86_CR4_MCE) + set_in_cr4(X86_CR4_MCE); +} + /* * Old style boot options parsing. Only for compatibility. 
*/ @@ -620,7 +736,8 @@ static void mce_restart(void) on_each_cpu(mce_init, NULL, 1, 1); next_interval = check_interval * HZ; if (next_interval) - schedule_delayed_work(&mcheck_work, next_interval); + schedule_delayed_work(&mcheck_work, + round_jiffies_relative(next_interval)); } static struct sysdev_class mce_sysclass = { diff --git a/arch/x86_64/kernel/mce_amd.c b/arch/x86_64/kernel/mce_amd.c index 03356e64f9c..2f8a7f18b0f 100644 --- a/arch/x86_64/kernel/mce_amd.c +++ b/arch/x86_64/kernel/mce_amd.c @@ -157,9 +157,9 @@ void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c) high |= K8_APIC_EXT_LVT_ENTRY_THRESHOLD << 20; wrmsr(address, low, high); - setup_APIC_extened_lvt(K8_APIC_EXT_LVT_ENTRY_THRESHOLD, - THRESHOLD_APIC_VECTOR, - K8_APIC_EXT_INT_MSG_FIX, 0); + setup_APIC_extended_lvt(K8_APIC_EXT_LVT_ENTRY_THRESHOLD, + THRESHOLD_APIC_VECTOR, + K8_APIC_EXT_INT_MSG_FIX, 0); threshold_defaults.address = address; threshold_restart_bank(&threshold_defaults, 0, 0); diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c index 61ae57eb9e4..8bf0ca03ac8 100644 --- a/arch/x86_64/kernel/mpparse.c +++ b/arch/x86_64/kernel/mpparse.c @@ -32,7 +32,6 @@ /* Have we found an MP table */ int smp_found_config; -unsigned int __initdata maxcpus = NR_CPUS; /* * Various Linux-internal data structures created from the @@ -649,6 +648,20 @@ static int mp_find_ioapic(int gsi) return -1; } +static u8 uniq_ioapic_id(u8 id) +{ + int i; + DECLARE_BITMAP(used, 256); + bitmap_zero(used, 256); + for (i = 0; i < nr_ioapics; i++) { + struct mpc_config_ioapic *ia = &mp_ioapics[i]; + __set_bit(ia->mpc_apicid, used); + } + if (!test_bit(id, used)) + return id; + return find_first_zero_bit(used, 256); +} + void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base) { int idx = 0; @@ -656,14 +669,14 @@ void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base) if (bad_ioapic(address)) return; - idx = nr_ioapics++; + idx = nr_ioapics; mp_ioapics[idx].mpc_type = MP_IOAPIC; mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE; mp_ioapics[idx].mpc_apicaddr = address; set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); - mp_ioapics[idx].mpc_apicid = id; + mp_ioapics[idx].mpc_apicid = uniq_ioapic_id(id); mp_ioapics[idx].mpc_apicver = 0; /* @@ -680,6 +693,8 @@ void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base) mp_ioapics[idx].mpc_apicaddr, mp_ioapic_routing[idx].gsi_start, mp_ioapic_routing[idx].gsi_end); + + nr_ioapics++; } void __init diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c index 931c64bad5e..cb8ee9d02f8 100644 --- a/arch/x86_64/kernel/nmi.c +++ b/arch/x86_64/kernel/nmi.c @@ -296,7 +296,7 @@ static DEFINE_PER_CPU(unsigned, last_irq_sum); static DEFINE_PER_CPU(local_t, alert_counter); static DEFINE_PER_CPU(int, nmi_touch); -void touch_nmi_watchdog (void) +void touch_nmi_watchdog(void) { if (nmi_watchdog > 0) { unsigned cpu; @@ -306,8 +306,10 @@ void touch_nmi_watchdog (void) * do it ourselves because the alert count increase is not * atomic. 
*/ - for_each_present_cpu (cpu) - per_cpu(nmi_touch, cpu) = 1; + for_each_present_cpu(cpu) { + if (per_cpu(nmi_touch, cpu) != 1) + per_cpu(nmi_touch, cpu) = 1; + } } touch_softlockup_watchdog(); @@ -382,11 +384,14 @@ int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) return rc; } +static unsigned ignore_nmis; + asmlinkage __kprobes void do_nmi(struct pt_regs * regs, long error_code) { nmi_enter(); add_pda(__nmi_count,1); - default_do_nmi(regs); + if (!ignore_nmis) + default_do_nmi(regs); nmi_exit(); } @@ -399,6 +404,18 @@ int do_nmi_callback(struct pt_regs * regs, int cpu) return 0; } +void stop_nmi(void) +{ + acpi_nmi_disable(); + ignore_nmis++; +} + +void restart_nmi(void) +{ + ignore_nmis--; + acpi_nmi_enable(); +} + #ifdef CONFIG_SYSCTL static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c index 5bd20b542c1..ba16c968ca3 100644 --- a/arch/x86_64/kernel/pci-calgary.c +++ b/arch/x86_64/kernel/pci-calgary.c @@ -1,7 +1,7 @@ /* * Derived from arch/powerpc/kernel/iommu.c * - * Copyright (C) IBM Corporation, 2006 + * Copyright IBM Corporation, 2006-2007 * Copyright (C) 2006 Jon Mason <jdmason@kudzu.us> * * Author: Jon Mason <jdmason@kudzu.us> @@ -35,7 +35,7 @@ #include <linux/pci_ids.h> #include <linux/pci.h> #include <linux/delay.h> -#include <asm/proto.h> +#include <asm/iommu.h> #include <asm/calgary.h> #include <asm/tce.h> #include <asm/pci-direct.h> @@ -50,13 +50,7 @@ int use_calgary __read_mostly = 0; #endif /* CONFIG_CALGARY_DEFAULT_ENABLED */ #define PCI_DEVICE_ID_IBM_CALGARY 0x02a1 -#define PCI_VENDOR_DEVICE_ID_CALGARY \ - (PCI_VENDOR_ID_IBM | PCI_DEVICE_ID_IBM_CALGARY << 16) - -/* we need these for register space address calculation */ -#define START_ADDRESS 0xfe000000 -#define CHASSIS_BASE 0 -#define ONE_BASED_CHASSIS_NUM 1 +#define PCI_DEVICE_ID_IBM_CALIOC2 0x0308 /* register offsets inside the host bridge space */ #define CALGARY_CONFIG_REG 0x0108 @@ -80,6 +74,12 @@ int use_calgary __read_mostly = 0; #define PHB_MEM_2_SIZE_LOW 0x02E0 #define PHB_DOSHOLE_OFFSET 0x08E0 +/* CalIOC2 specific */ +#define PHB_SAVIOR_L2 0x0DB0 +#define PHB_PAGE_MIG_CTRL 0x0DA8 +#define PHB_PAGE_MIG_DEBUG 0x0DA0 +#define PHB_ROOT_COMPLEX_STATUS 0x0CB0 + /* PHB_CONFIG_RW */ #define PHB_TCE_ENABLE 0x20000000 #define PHB_SLOT_DISABLE 0x1C000000 @@ -92,7 +92,11 @@ int use_calgary __read_mostly = 0; /* CSR (Channel/DMA Status Register) */ #define CSR_AGENT_MASK 0xffe0ffff /* CCR (Calgary Configuration Register) */ -#define CCR_2SEC_TIMEOUT 0x000000000000000EUL +#define CCR_2SEC_TIMEOUT 0x000000000000000EUL +/* PMCR/PMDR (Page Migration Control/Debug Registers */ +#define PMR_SOFTSTOP 0x80000000 +#define PMR_SOFTSTOPFAULT 0x40000000 +#define PMR_HARDSTOP 0x20000000 #define MAX_NUM_OF_PHBS 8 /* how many PHBs in total? 
*/ #define MAX_NUM_CHASSIS 8 /* max number of chassis */ @@ -155,9 +159,26 @@ struct calgary_bus_info { void __iomem *bbar; }; -static struct calgary_bus_info bus_info[MAX_PHB_BUS_NUM] = { { NULL, 0, 0 }, }; +static void calgary_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev); +static void calgary_tce_cache_blast(struct iommu_table *tbl); +static void calgary_dump_error_regs(struct iommu_table *tbl); +static void calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev); +static void calioc2_tce_cache_blast(struct iommu_table *tbl); +static void calioc2_dump_error_regs(struct iommu_table *tbl); + +static struct cal_chipset_ops calgary_chip_ops = { + .handle_quirks = calgary_handle_quirks, + .tce_cache_blast = calgary_tce_cache_blast, + .dump_error_regs = calgary_dump_error_regs +}; -static void tce_cache_blast(struct iommu_table *tbl); +static struct cal_chipset_ops calioc2_chip_ops = { + .handle_quirks = calioc2_handle_quirks, + .tce_cache_blast = calioc2_tce_cache_blast, + .dump_error_regs = calioc2_dump_error_regs +}; + +static struct calgary_bus_info bus_info[MAX_PHB_BUS_NUM] = { { NULL, 0, 0 }, }; /* enable this to stress test the chip's TCE cache */ #ifdef CONFIG_IOMMU_DEBUG @@ -187,6 +208,7 @@ static inline unsigned long verify_bit_range(unsigned long* bitmap, { return ~0UL; } + #endif /* CONFIG_IOMMU_DEBUG */ static inline unsigned int num_dma_pages(unsigned long dma, unsigned int dmalen) @@ -206,11 +228,12 @@ static inline int translate_phb(struct pci_dev* dev) } static void iommu_range_reserve(struct iommu_table *tbl, - unsigned long start_addr, unsigned int npages) + unsigned long start_addr, unsigned int npages) { unsigned long index; unsigned long end; unsigned long badbit; + unsigned long flags; index = start_addr >> PAGE_SHIFT; @@ -222,6 +245,8 @@ static void iommu_range_reserve(struct iommu_table *tbl, if (end > tbl->it_size) /* don't go off the table */ end = tbl->it_size; + spin_lock_irqsave(&tbl->it_lock, flags); + badbit = verify_bit_range(tbl->it_map, 0, index, end); if (badbit != ~0UL) { if (printk_ratelimit()) @@ -231,23 +256,29 @@ static void iommu_range_reserve(struct iommu_table *tbl, } set_bit_string(tbl->it_map, index, npages); + + spin_unlock_irqrestore(&tbl->it_lock, flags); } static unsigned long iommu_range_alloc(struct iommu_table *tbl, unsigned int npages) { + unsigned long flags; unsigned long offset; BUG_ON(npages == 0); + spin_lock_irqsave(&tbl->it_lock, flags); + offset = find_next_zero_string(tbl->it_map, tbl->it_hint, tbl->it_size, npages); if (offset == ~0UL) { - tce_cache_blast(tbl); + tbl->chip_ops->tce_cache_blast(tbl); offset = find_next_zero_string(tbl->it_map, 0, tbl->it_size, npages); if (offset == ~0UL) { printk(KERN_WARNING "Calgary: IOMMU full.\n"); + spin_unlock_irqrestore(&tbl->it_lock, flags); if (panic_on_overflow) panic("Calgary: fix the allocator.\n"); else @@ -259,17 +290,17 @@ static unsigned long iommu_range_alloc(struct iommu_table *tbl, tbl->it_hint = offset + npages; BUG_ON(tbl->it_hint > tbl->it_size); + spin_unlock_irqrestore(&tbl->it_lock, flags); + return offset; } static dma_addr_t iommu_alloc(struct iommu_table *tbl, void *vaddr, unsigned int npages, int direction) { - unsigned long entry, flags; + unsigned long entry; dma_addr_t ret = bad_dma_address; - spin_lock_irqsave(&tbl->it_lock, flags); - entry = iommu_range_alloc(tbl, npages); if (unlikely(entry == bad_dma_address)) @@ -282,23 +313,21 @@ static dma_addr_t iommu_alloc(struct iommu_table *tbl, void *vaddr, tce_build(tbl, entry, npages, 
(unsigned long)vaddr & PAGE_MASK, direction); - spin_unlock_irqrestore(&tbl->it_lock, flags); - return ret; error: - spin_unlock_irqrestore(&tbl->it_lock, flags); printk(KERN_WARNING "Calgary: failed to allocate %u pages in " "iommu %p\n", npages, tbl); return bad_dma_address; } -static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, +static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, unsigned int npages) { unsigned long entry; unsigned long badbit; unsigned long badend; + unsigned long flags; /* were we called with bad_dma_address? */ badend = bad_dma_address + (EMERGENCY_PAGES * PAGE_SIZE); @@ -315,6 +344,8 @@ static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, tce_free(tbl, entry, npages); + spin_lock_irqsave(&tbl->it_lock, flags); + badbit = verify_bit_range(tbl->it_map, 1, entry, entry + npages); if (badbit != ~0UL) { if (printk_ratelimit()) @@ -324,23 +355,40 @@ static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, } __clear_bit_string(tbl->it_map, entry, npages); + + spin_unlock_irqrestore(&tbl->it_lock, flags); } -static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, - unsigned int npages) +static inline struct iommu_table *find_iommu_table(struct device *dev) { - unsigned long flags; + struct pci_dev *pdev; + struct pci_bus *pbus; + struct iommu_table *tbl; - spin_lock_irqsave(&tbl->it_lock, flags); + pdev = to_pci_dev(dev); - __iommu_free(tbl, dma_addr, npages); + /* is the device behind a bridge? */ + if (unlikely(pdev->bus->parent)) + pbus = pdev->bus->parent; + else + pbus = pdev->bus; - spin_unlock_irqrestore(&tbl->it_lock, flags); + tbl = pci_iommu(pbus); + + BUG_ON(pdev->bus->parent && + (tbl->it_busno != pdev->bus->parent->number)); + + return tbl; } -static void __calgary_unmap_sg(struct iommu_table *tbl, +static void calgary_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems, int direction) { + struct iommu_table *tbl = find_iommu_table(dev); + + if (!translate_phb(to_pci_dev(dev))) + return; + while (nelems--) { unsigned int npages; dma_addr_t dma = sglist->dma_address; @@ -350,33 +398,17 @@ static void __calgary_unmap_sg(struct iommu_table *tbl, break; npages = num_dma_pages(dma, dmalen); - __iommu_free(tbl, dma, npages); + iommu_free(tbl, dma, npages); sglist++; } } -void calgary_unmap_sg(struct device *dev, struct scatterlist *sglist, - int nelems, int direction) -{ - unsigned long flags; - struct iommu_table *tbl = to_pci_dev(dev)->bus->self->sysdata; - - if (!translate_phb(to_pci_dev(dev))) - return; - - spin_lock_irqsave(&tbl->it_lock, flags); - - __calgary_unmap_sg(tbl, sglist, nelems, direction); - - spin_unlock_irqrestore(&tbl->it_lock, flags); -} - static int calgary_nontranslate_map_sg(struct device* dev, struct scatterlist *sg, int nelems, int direction) { int i; - for (i = 0; i < nelems; i++ ) { + for (i = 0; i < nelems; i++ ) { struct scatterlist *s = &sg[i]; BUG_ON(!s->page); s->dma_address = virt_to_bus(page_address(s->page) +s->offset); @@ -385,11 +417,10 @@ static int calgary_nontranslate_map_sg(struct device* dev, return nelems; } -int calgary_map_sg(struct device *dev, struct scatterlist *sg, +static int calgary_map_sg(struct device *dev, struct scatterlist *sg, int nelems, int direction) { - struct iommu_table *tbl = to_pci_dev(dev)->bus->self->sysdata; - unsigned long flags; + struct iommu_table *tbl = find_iommu_table(dev); unsigned long vaddr; unsigned int npages; unsigned long entry; @@ -398,8 +429,6 @@ int calgary_map_sg(struct device *dev, 
struct scatterlist *sg, if (!translate_phb(to_pci_dev(dev))) return calgary_nontranslate_map_sg(dev, sg, nelems, direction); - spin_lock_irqsave(&tbl->it_lock, flags); - for (i = 0; i < nelems; i++ ) { struct scatterlist *s = &sg[i]; BUG_ON(!s->page); @@ -423,26 +452,23 @@ int calgary_map_sg(struct device *dev, struct scatterlist *sg, s->dma_length = s->length; } - spin_unlock_irqrestore(&tbl->it_lock, flags); - return nelems; error: - __calgary_unmap_sg(tbl, sg, nelems, direction); + calgary_unmap_sg(dev, sg, nelems, direction); for (i = 0; i < nelems; i++) { sg[i].dma_address = bad_dma_address; sg[i].dma_length = 0; } - spin_unlock_irqrestore(&tbl->it_lock, flags); return 0; } -dma_addr_t calgary_map_single(struct device *dev, void *vaddr, +static dma_addr_t calgary_map_single(struct device *dev, void *vaddr, size_t size, int direction) { dma_addr_t dma_handle = bad_dma_address; unsigned long uaddr; unsigned int npages; - struct iommu_table *tbl = to_pci_dev(dev)->bus->self->sysdata; + struct iommu_table *tbl = find_iommu_table(dev); uaddr = (unsigned long)vaddr; npages = num_dma_pages(uaddr, size); @@ -455,10 +481,10 @@ dma_addr_t calgary_map_single(struct device *dev, void *vaddr, return dma_handle; } -void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle, +static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle, size_t size, int direction) { - struct iommu_table *tbl = to_pci_dev(dev)->bus->self->sysdata; + struct iommu_table *tbl = find_iommu_table(dev); unsigned int npages; if (!translate_phb(to_pci_dev(dev))) @@ -468,15 +494,13 @@ void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle, iommu_free(tbl, dma_handle, npages); } -void* calgary_alloc_coherent(struct device *dev, size_t size, +static void* calgary_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t flag) { void *ret = NULL; dma_addr_t mapping; unsigned int npages, order; - struct iommu_table *tbl; - - tbl = to_pci_dev(dev)->bus->self->sysdata; + struct iommu_table *tbl = find_iommu_table(dev); size = PAGE_ALIGN(size); /* size rounded up to full pages */ npages = size >> PAGE_SHIFT; @@ -552,7 +576,22 @@ static inline void __iomem* calgary_reg(void __iomem *bar, unsigned long offset) return (void __iomem*)target; } -static void tce_cache_blast(struct iommu_table *tbl) +static inline int is_calioc2(unsigned short device) +{ + return (device == PCI_DEVICE_ID_IBM_CALIOC2); +} + +static inline int is_calgary(unsigned short device) +{ + return (device == PCI_DEVICE_ID_IBM_CALGARY); +} + +static inline int is_cal_pci_dev(unsigned short device) +{ + return (is_calgary(device) || is_calioc2(device)); +} + +static void calgary_tce_cache_blast(struct iommu_table *tbl) { u64 val; u32 aer; @@ -589,6 +628,85 @@ static void tce_cache_blast(struct iommu_table *tbl) (void)readl(target); /* flush */ } +static void calioc2_tce_cache_blast(struct iommu_table *tbl) +{ + void __iomem *bbar = tbl->bbar; + void __iomem *target; + u64 val64; + u32 val; + int i = 0; + int count = 1; + unsigned char bus = tbl->it_busno; + +begin: + printk(KERN_DEBUG "Calgary: CalIOC2 bus 0x%x entering tce cache blast " + "sequence - count %d\n", bus, count); + + /* 1. using the Page Migration Control reg set SoftStop */ + target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL); + val = be32_to_cpu(readl(target)); + printk(KERN_DEBUG "1a. read 0x%x [LE] from %p\n", val, target); + val |= PMR_SOFTSTOP; + printk(KERN_DEBUG "1b. 
writing 0x%x [LE] to %p\n", val, target); + writel(cpu_to_be32(val), target); + + /* 2. poll split queues until all DMA activity is done */ + printk(KERN_DEBUG "2a. starting to poll split queues\n"); + target = calgary_reg(bbar, split_queue_offset(bus)); + do { + val64 = readq(target); + i++; + } while ((val64 & 0xff) != 0xff && i < 100); + if (i == 100) + printk(KERN_WARNING "CalIOC2: PCI bus not quiesced, " + "continuing anyway\n"); + + /* 3. poll Page Migration DEBUG for SoftStopFault */ + target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_DEBUG); + val = be32_to_cpu(readl(target)); + printk(KERN_DEBUG "3. read 0x%x [LE] from %p\n", val, target); + + /* 4. if SoftStopFault - goto (1) */ + if (val & PMR_SOFTSTOPFAULT) { + if (++count < 100) + goto begin; + else { + printk(KERN_WARNING "CalIOC2: too many SoftStopFaults, " + "aborting TCE cache flush sequence!\n"); + return; /* pray for the best */ + } + } + + /* 5. Slam into HardStop by reading PHB_PAGE_MIG_CTRL */ + target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL); + printk(KERN_DEBUG "5a. slamming into HardStop by reading %p\n", target); + val = be32_to_cpu(readl(target)); + printk(KERN_DEBUG "5b. read 0x%x [LE] from %p\n", val, target); + target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_DEBUG); + val = be32_to_cpu(readl(target)); + printk(KERN_DEBUG "5c. read 0x%x [LE] from %p (debug)\n", val, target); + + /* 6. invalidate TCE cache */ + printk(KERN_DEBUG "6. invalidating TCE cache\n"); + target = calgary_reg(bbar, tar_offset(bus)); + writeq(tbl->tar_val, target); + + /* 7. Re-read PMCR */ + printk(KERN_DEBUG "7a. Re-reading PMCR\n"); + target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL); + val = be32_to_cpu(readl(target)); + printk(KERN_DEBUG "7b. read 0x%x [LE] from %p\n", val, target); + + /* 8. Remove HardStop */ + printk(KERN_DEBUG "8a. removing HardStop from PMCR\n"); + target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL); + val = 0; + printk(KERN_DEBUG "8b. writing 0x%x [LE] to %p\n", val, target); + writel(cpu_to_be32(val), target); + val = be32_to_cpu(readl(target)); + printk(KERN_DEBUG "8c. 
read 0x%x [LE] from %p\n", val, target); +} + static void __init calgary_reserve_mem_region(struct pci_dev *dev, u64 start, u64 limit) { @@ -598,7 +716,7 @@ static void __init calgary_reserve_mem_region(struct pci_dev *dev, u64 start, limit++; numpages = ((limit - start) >> PAGE_SHIFT); - iommu_range_reserve(dev->sysdata, start, numpages); + iommu_range_reserve(pci_iommu(dev->bus), start, numpages); } static void __init calgary_reserve_peripheral_mem_1(struct pci_dev *dev) @@ -606,7 +724,7 @@ static void __init calgary_reserve_peripheral_mem_1(struct pci_dev *dev) void __iomem *target; u64 low, high, sizelow; u64 start, limit; - struct iommu_table *tbl = dev->sysdata; + struct iommu_table *tbl = pci_iommu(dev->bus); unsigned char busnum = dev->bus->number; void __iomem *bbar = tbl->bbar; @@ -630,7 +748,7 @@ static void __init calgary_reserve_peripheral_mem_2(struct pci_dev *dev) u32 val32; u64 low, high, sizelow, sizehigh; u64 start, limit; - struct iommu_table *tbl = dev->sysdata; + struct iommu_table *tbl = pci_iommu(dev->bus); unsigned char busnum = dev->bus->number; void __iomem *bbar = tbl->bbar; @@ -666,14 +784,20 @@ static void __init calgary_reserve_regions(struct pci_dev *dev) { unsigned int npages; u64 start; - struct iommu_table *tbl = dev->sysdata; + struct iommu_table *tbl = pci_iommu(dev->bus); /* reserve EMERGENCY_PAGES from bad_dma_address and up */ iommu_range_reserve(tbl, bad_dma_address, EMERGENCY_PAGES); /* avoid the BIOS/VGA first 640KB-1MB region */ - start = (640 * 1024); - npages = ((1024 - 640) * 1024) >> PAGE_SHIFT; + /* for CalIOC2 - avoid the entire first MB */ + if (is_calgary(dev->device)) { + start = (640 * 1024); + npages = ((1024 - 640) * 1024) >> PAGE_SHIFT; + } else { /* calioc2 */ + start = 0; + npages = (1 * 1024 * 1024) >> PAGE_SHIFT; + } iommu_range_reserve(tbl, start, npages); /* reserve the two PCI peripheral memory regions in IO space */ @@ -694,10 +818,17 @@ static int __init calgary_setup_tar(struct pci_dev *dev, void __iomem *bbar) if (ret) return ret; - tbl = dev->sysdata; + tbl = pci_iommu(dev->bus); tbl->it_base = (unsigned long)bus_info[dev->bus->number].tce_space; tce_free(tbl, 0, tbl->it_size); + if (is_calgary(dev->device)) + tbl->chip_ops = &calgary_chip_ops; + else if (is_calioc2(dev->device)) + tbl->chip_ops = &calioc2_chip_ops; + else + BUG(); + calgary_reserve_regions(dev); /* set TARs for each PHB */ @@ -706,15 +837,15 @@ static int __init calgary_setup_tar(struct pci_dev *dev, void __iomem *bbar) /* zero out all TAR bits under sw control */ val64 &= ~TAR_SW_BITS; - - tbl = dev->sysdata; table_phys = (u64)__pa(tbl->it_base); + val64 |= table_phys; BUG_ON(specified_table_size > TCE_TABLE_SIZE_8M); val64 |= (u64) specified_table_size; tbl->tar_val = cpu_to_be64(val64); + writeq(tbl->tar_val, target); readq(target); /* flush */ @@ -724,7 +855,7 @@ static int __init calgary_setup_tar(struct pci_dev *dev, void __iomem *bbar) static void __init calgary_free_bus(struct pci_dev *dev) { u64 val64; - struct iommu_table *tbl = dev->sysdata; + struct iommu_table *tbl = pci_iommu(dev->bus); void __iomem *target; unsigned int bitmapsz; @@ -739,16 +870,81 @@ static void __init calgary_free_bus(struct pci_dev *dev) tbl->it_map = NULL; kfree(tbl); - dev->sysdata = NULL; + + set_pci_iommu(dev->bus, NULL); /* Can't free bootmem allocated memory after system is up :-( */ bus_info[dev->bus->number].tce_space = NULL; } +static void calgary_dump_error_regs(struct iommu_table *tbl) +{ + void __iomem *bbar = tbl->bbar; + void __iomem *target; + u32 csr, 
plssr; + + target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_CSR_OFFSET); + csr = be32_to_cpu(readl(target)); + + target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_PLSSR_OFFSET); + plssr = be32_to_cpu(readl(target)); + + /* If no error, the agent ID in the CSR is not valid */ + printk(KERN_EMERG "Calgary: DMA error on Calgary PHB 0x%x, " + "0x%08x@CSR 0x%08x@PLSSR\n", tbl->it_busno, csr, plssr); +} + +static void calioc2_dump_error_regs(struct iommu_table *tbl) +{ + void __iomem *bbar = tbl->bbar; + u32 csr, csmr, plssr, mck, rcstat; + void __iomem *target; + unsigned long phboff = phb_offset(tbl->it_busno); + unsigned long erroff; + u32 errregs[7]; + int i; + + /* dump CSR */ + target = calgary_reg(bbar, phboff | PHB_CSR_OFFSET); + csr = be32_to_cpu(readl(target)); + /* dump PLSSR */ + target = calgary_reg(bbar, phboff | PHB_PLSSR_OFFSET); + plssr = be32_to_cpu(readl(target)); + /* dump CSMR */ + target = calgary_reg(bbar, phboff | 0x290); + csmr = be32_to_cpu(readl(target)); + /* dump mck */ + target = calgary_reg(bbar, phboff | 0x800); + mck = be32_to_cpu(readl(target)); + + printk(KERN_EMERG "Calgary: DMA error on CalIOC2 PHB 0x%x\n", + tbl->it_busno); + + printk(KERN_EMERG "Calgary: 0x%08x@CSR 0x%08x@PLSSR 0x%08x@CSMR 0x%08x@MCK\n", + csr, plssr, csmr, mck); + + /* dump rest of error regs */ + printk(KERN_EMERG "Calgary: "); + for (i = 0; i < ARRAY_SIZE(errregs); i++) { + /* err regs are at 0x810 - 0x870 */ + erroff = (0x810 + (i * 0x10)); + target = calgary_reg(bbar, phboff | erroff); + errregs[i] = be32_to_cpu(readl(target)); + printk("0x%08x@0x%lx ", errregs[i], erroff); + } + printk("\n"); + + /* root complex status */ + target = calgary_reg(bbar, phboff | PHB_ROOT_COMPLEX_STATUS); + rcstat = be32_to_cpu(readl(target)); + printk(KERN_EMERG "Calgary: 0x%08x@0x%x\n", rcstat, + PHB_ROOT_COMPLEX_STATUS); +} + static void calgary_watchdog(unsigned long data) { struct pci_dev *dev = (struct pci_dev *)data; - struct iommu_table *tbl = dev->sysdata; + struct iommu_table *tbl = pci_iommu(dev->bus); void __iomem *bbar = tbl->bbar; u32 val32; void __iomem *target; @@ -758,13 +954,14 @@ static void calgary_watchdog(unsigned long data) /* If no error, the agent ID in the CSR is not valid */ if (val32 & CSR_AGENT_MASK) { - printk(KERN_EMERG "calgary_watchdog: DMA error on PHB %#x, " - "CSR = %#x\n", dev->bus->number, val32); + tbl->chip_ops->dump_error_regs(tbl); + + /* reset error */ writel(0, target); /* Disable bus that caused the error */ target = calgary_reg(bbar, phb_offset(tbl->it_busno) | - PHB_CONFIG_RW_OFFSET); + PHB_CONFIG_RW_OFFSET); val32 = be32_to_cpu(readl(target)); val32 |= PHB_SLOT_DISABLE; writel(cpu_to_be32(val32), target); @@ -775,8 +972,8 @@ static void calgary_watchdog(unsigned long data) } } -static void __init calgary_increase_split_completion_timeout(void __iomem *bbar, - unsigned char busnum) +static void __init calgary_set_split_completion_timeout(void __iomem *bbar, + unsigned char busnum, unsigned long timeout) { u64 val64; void __iomem *target; @@ -802,11 +999,40 @@ static void __init calgary_increase_split_completion_timeout(void __iomem *bbar, /* zero out this PHB's timer bits */ mask = ~(0xFUL << phb_shift); val64 &= mask; - val64 |= (CCR_2SEC_TIMEOUT << phb_shift); + val64 |= (timeout << phb_shift); writeq(cpu_to_be64(val64), target); readq(target); /* flush */ } +static void calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev) +{ + unsigned char busnum = dev->bus->number; + void __iomem *bbar = tbl->bbar; + void __iomem 
*target; + u32 val; + + /* + * CalIOC2 designers recommend setting bit 8 in 0xnDB0 to 1 + */ + target = calgary_reg(bbar, phb_offset(busnum) | PHB_SAVIOR_L2); + val = cpu_to_be32(readl(target)); + val |= 0x00800000; + writel(cpu_to_be32(val), target); +} + +static void calgary_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev) +{ + unsigned char busnum = dev->bus->number; + + /* + * Give split completion a longer timeout on bus 1 for aic94xx + * http://bugzilla.kernel.org/show_bug.cgi?id=7180 + */ + if (is_calgary(dev->device) && (busnum == 1)) + calgary_set_split_completion_timeout(tbl->bbar, busnum, + CCR_2SEC_TIMEOUT); +} + static void __init calgary_enable_translation(struct pci_dev *dev) { u32 val32; @@ -816,7 +1042,7 @@ static void __init calgary_enable_translation(struct pci_dev *dev) struct iommu_table *tbl; busnum = dev->bus->number; - tbl = dev->sysdata; + tbl = pci_iommu(dev->bus); bbar = tbl->bbar; /* enable TCE in PHB Config Register */ @@ -824,20 +1050,15 @@ static void __init calgary_enable_translation(struct pci_dev *dev) val32 = be32_to_cpu(readl(target)); val32 |= PHB_TCE_ENABLE | PHB_DAC_DISABLE | PHB_MCSR_ENABLE; - printk(KERN_INFO "Calgary: enabling translation on PHB %#x\n", busnum); + printk(KERN_INFO "Calgary: enabling translation on %s PHB %#x\n", + (dev->device == PCI_DEVICE_ID_IBM_CALGARY) ? + "Calgary" : "CalIOC2", busnum); printk(KERN_INFO "Calgary: errant DMAs will now be prevented on this " "bus.\n"); writel(cpu_to_be32(val32), target); readl(target); /* flush */ - /* - * Give split completion a longer timeout on bus 1 for aic94xx - * http://bugzilla.kernel.org/show_bug.cgi?id=7180 - */ - if (busnum == 1) - calgary_increase_split_completion_timeout(bbar, busnum); - init_timer(&tbl->watchdog_timer); tbl->watchdog_timer.function = &calgary_watchdog; tbl->watchdog_timer.data = (unsigned long)dev; @@ -853,7 +1074,7 @@ static void __init calgary_disable_translation(struct pci_dev *dev) struct iommu_table *tbl; busnum = dev->bus->number; - tbl = dev->sysdata; + tbl = pci_iommu(dev->bus); bbar = tbl->bbar; /* disable TCE in PHB Config Register */ @@ -871,13 +1092,19 @@ static void __init calgary_disable_translation(struct pci_dev *dev) static void __init calgary_init_one_nontraslated(struct pci_dev *dev) { pci_dev_get(dev); - dev->sysdata = NULL; - dev->bus->self = dev; + set_pci_iommu(dev->bus, NULL); + + /* is the device behind a bridge? 
*/ + if (dev->bus->parent) + dev->bus->parent->self = dev; + else + dev->bus->self = dev; +} static int __init calgary_init_one(struct pci_dev *dev) { void __iomem *bbar; + struct iommu_table *tbl; int ret; BUG_ON(dev->bus->number >= MAX_PHB_BUS_NUM); @@ -888,7 +1115,18 @@ static int __init calgary_init_one(struct pci_dev *dev) goto done; pci_dev_get(dev); - dev->bus->self = dev; + + if (dev->bus->parent) { + if (dev->bus->parent->self) + printk(KERN_WARNING "Calgary: IEEEE, dev %p has " + "bus->parent->self!\n", dev); + dev->bus->parent->self = dev; + } else + dev->bus->self = dev; + + tbl = pci_iommu(dev->bus); + tbl->chip_ops->handle_quirks(tbl, dev); + calgary_enable_translation(dev); return 0; @@ -924,11 +1162,18 @@ static int __init calgary_locate_bbars(void) target = calgary_reg(bbar, offset); val = be32_to_cpu(readl(target)); + start_bus = (u8)((val & 0x00FF0000) >> 16); end_bus = (u8)((val & 0x0000FF00) >> 8); - for (bus = start_bus; bus <= end_bus; bus++) { - bus_info[bus].bbar = bbar; - bus_info[bus].phbid = phb; + + if (end_bus) { + for (bus = start_bus; bus <= end_bus; bus++) { + bus_info[bus].bbar = bbar; + bus_info[bus].phbid = phb; + } + } else { + bus_info[start_bus].bbar = bbar; + bus_info[start_bus].phbid = phb; } } } @@ -948,22 +1193,24 @@ static int __init calgary_init(void) { int ret; struct pci_dev *dev = NULL; + void *tce_space; ret = calgary_locate_bbars(); if (ret) return ret; do { - dev = pci_get_device(PCI_VENDOR_ID_IBM, - PCI_DEVICE_ID_IBM_CALGARY, - dev); + dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev); if (!dev) break; + if (!is_cal_pci_dev(dev->device)) + continue; if (!translate_phb(dev)) { calgary_init_one_nontraslated(dev); continue; } - if (!bus_info[dev->bus->number].tce_space && !translate_empty_slots) + tce_space = bus_info[dev->bus->number].tce_space; + if (!tce_space && !translate_empty_slots) continue; ret = calgary_init_one(dev); @@ -976,10 +1223,11 @@ static int __init calgary_init(void) error: do { dev = pci_get_device_reverse(PCI_VENDOR_ID_IBM, - PCI_DEVICE_ID_IBM_CALGARY, - dev); + PCI_ANY_ID, dev); if (!dev) break; + if (!is_cal_pci_dev(dev->device)) + continue; if (!translate_phb(dev)) { pci_dev_put(dev); continue; @@ -1057,9 +1305,29 @@ static int __init build_detail_arrays(void) return 0; } -void __init detect_calgary(void) +static int __init calgary_bus_has_devices(int bus, unsigned short pci_dev) { + int dev; u32 val; + + if (pci_dev == PCI_DEVICE_ID_IBM_CALIOC2) { + /* + * FIXME: properly scan for devices across the + * PCI-to-PCI bridge on every CalIOC2 port. + */ + return 1; + } + + for (dev = 1; dev < 8; dev++) { + val = read_pci_config(bus, dev, 0, 0); + if (val != 0xffffffff) + break; + } + return (val != 0xffffffff); +} + +void __init detect_calgary(void) +{ int bus; void *tbl; int calgary_found = 0; @@ -1116,29 +1384,26 @@ void __init detect_calgary(void) specified_table_size = determine_tce_table_size(end_pfn * PAGE_SIZE); for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) { - int dev; struct calgary_bus_info *info = &bus_info[bus]; + unsigned short pci_device; + u32 val; + + val = read_pci_config(bus, 0, 0, 0); + pci_device = (val & 0xFFFF0000) >> 16; - if (read_pci_config(bus, 0, 0, 0) != PCI_VENDOR_DEVICE_ID_CALGARY) + if (!is_cal_pci_dev(pci_device)) continue; if (info->translation_disabled) continue; - /* - * Scan the slots of the PCI bus to see if there is a device present. - * The parent bus will be the zero-ith device, so start at 1.
- */ - for (dev = 1; dev < 8; dev++) { - val = read_pci_config(bus, dev, 0, 0); - if (val != 0xffffffff || translate_empty_slots) { - tbl = alloc_tce_table(); - if (!tbl) - goto cleanup; - info->tce_space = tbl; - calgary_found = 1; - break; - } + if (calgary_bus_has_devices(bus, pci_device) || + translate_empty_slots) { + tbl = alloc_tce_table(); + if (!tbl) + goto cleanup; + info->tce_space = tbl; + calgary_found = 1; } } @@ -1249,3 +1514,66 @@ static int __init calgary_parse_options(char *p) return 1; } __setup("calgary=", calgary_parse_options); + +static void __init calgary_fixup_one_tce_space(struct pci_dev *dev) +{ + struct iommu_table *tbl; + unsigned int npages; + int i; + + tbl = pci_iommu(dev->bus); + + for (i = 0; i < 4; i++) { + struct resource *r = &dev->resource[PCI_BRIDGE_RESOURCES + i]; + + /* Don't give out TCEs that map MEM resources */ + if (!(r->flags & IORESOURCE_MEM)) + continue; + + /* 0-based? we reserve the whole 1st MB anyway */ + if (!r->start) + continue; + + /* cover the whole region */ + npages = (r->end - r->start) >> PAGE_SHIFT; + npages++; + + iommu_range_reserve(tbl, r->start, npages); + } +} + +static int __init calgary_fixup_tce_spaces(void) +{ + struct pci_dev *dev = NULL; + void *tce_space; + + if (no_iommu || swiotlb || !calgary_detected) + return -ENODEV; + + printk(KERN_DEBUG "Calgary: fixing up tce spaces\n"); + + do { + dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev); + if (!dev) + break; + if (!is_cal_pci_dev(dev->device)) + continue; + if (!translate_phb(dev)) + continue; + + tce_space = bus_info[dev->bus->number].tce_space; + if (!tce_space) + continue; + + calgary_fixup_one_tce_space(dev); + + } while (1); + + return 0; +} + +/* + * We need to be called after pcibios_assign_resources (fs_initcall level) + * and before device_initcall. + */ +rootfs_initcall(calgary_fixup_tce_spaces); diff --git a/arch/x86_64/kernel/pci-dma.c b/arch/x86_64/kernel/pci-dma.c index 9f80aad3fe2..05d745ede56 100644 --- a/arch/x86_64/kernel/pci-dma.c +++ b/arch/x86_64/kernel/pci-dma.c @@ -8,7 +8,7 @@ #include <linux/pci.h> #include <linux/module.h> #include <asm/io.h> -#include <asm/proto.h> +#include <asm/iommu.h> #include <asm/calgary.h> int iommu_merge __read_mostly = 0; @@ -22,8 +22,7 @@ EXPORT_SYMBOL(bad_dma_address); int iommu_bio_merge __read_mostly = 0; EXPORT_SYMBOL(iommu_bio_merge); -int iommu_sac_force __read_mostly = 0; -EXPORT_SYMBOL(iommu_sac_force); +static int iommu_sac_force __read_mostly = 0; int no_iommu __read_mostly; #ifdef CONFIG_IOMMU_DEBUG @@ -322,6 +321,11 @@ static int __init pci_iommu_init(void) return 0; } +void pci_iommu_shutdown(void) +{ + gart_iommu_shutdown(); +} + #ifdef CONFIG_PCI /* Many VIA bridges seem to corrupt data for DAC.
Disable it here */ diff --git a/arch/x86_64/kernel/pci-gart.c b/arch/x86_64/kernel/pci-gart.c index ae091cdc1a4..4918c575d58 100644 --- a/arch/x86_64/kernel/pci-gart.c +++ b/arch/x86_64/kernel/pci-gart.c @@ -28,6 +28,7 @@ #include <asm/mtrr.h> #include <asm/pgtable.h> #include <asm/proto.h> +#include <asm/iommu.h> #include <asm/cacheflush.h> #include <asm/swiotlb.h> #include <asm/dma.h> @@ -235,7 +236,7 @@ static dma_addr_t gart_map_simple(struct device *dev, char *buf, } /* Map a single area into the IOMMU */ -dma_addr_t gart_map_single(struct device *dev, void *addr, size_t size, int dir) +static dma_addr_t gart_map_single(struct device *dev, void *addr, size_t size, int dir) { unsigned long phys_mem, bus; @@ -253,7 +254,7 @@ dma_addr_t gart_map_single(struct device *dev, void *addr, size_t size, int dir) /* * Free a DMA mapping. */ -void gart_unmap_single(struct device *dev, dma_addr_t dma_addr, +static void gart_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size, int direction) { unsigned long iommu_page; @@ -275,7 +276,7 @@ void gart_unmap_single(struct device *dev, dma_addr_t dma_addr, /* * Wrapper for pci_unmap_single working with scatterlists. */ -void gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) +static void gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) { int i; @@ -571,6 +572,26 @@ static const struct dma_mapping_ops gart_dma_ops = { .unmap_sg = gart_unmap_sg, }; +void gart_iommu_shutdown(void) +{ + struct pci_dev *dev; + int i; + + if (no_agp && (dma_ops != &gart_dma_ops)) + return; + + for (i = 0; i < num_k8_northbridges; i++) { + u32 ctl; + + dev = k8_northbridges[i]; + pci_read_config_dword(dev, 0x90, &ctl); + + ctl &= ~1; + + pci_write_config_dword(dev, 0x90, ctl); + } +} + void __init gart_iommu_init(void) { struct agp_kern_info info; diff --git a/arch/x86_64/kernel/pci-nommu.c b/arch/x86_64/kernel/pci-nommu.c index 6dade0c867c..2a34c6c025a 100644 --- a/arch/x86_64/kernel/pci-nommu.c +++ b/arch/x86_64/kernel/pci-nommu.c @@ -6,7 +6,7 @@ #include <linux/string.h> #include <linux/dma-mapping.h> -#include <asm/proto.h> +#include <asm/iommu.h> #include <asm/processor.h> #include <asm/dma.h> @@ -34,7 +34,7 @@ nommu_map_single(struct device *hwdev, void *ptr, size_t size, return bus; } -void nommu_unmap_single(struct device *dev, dma_addr_t addr,size_t size, +static void nommu_unmap_single(struct device *dev, dma_addr_t addr,size_t size, int direction) { } @@ -54,7 +54,7 @@ void nommu_unmap_single(struct device *dev, dma_addr_t addr,size_t size, * Device ownership issues as mentioned above for pci_map_single are * the same here. */ -int nommu_map_sg(struct device *hwdev, struct scatterlist *sg, +static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg, int nents, int direction) { int i; @@ -74,7 +74,7 @@ int nommu_map_sg(struct device *hwdev, struct scatterlist *sg, * Again, cpu read rules concerning calls here are the same as for * pci_unmap_single() above. 
*/ -void nommu_unmap_sg(struct device *dev, struct scatterlist *sg, +static void nommu_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) { } diff --git a/arch/x86_64/kernel/pci-swiotlb.c b/arch/x86_64/kernel/pci-swiotlb.c index 4b4569abc60..b2f405ea7c8 100644 --- a/arch/x86_64/kernel/pci-swiotlb.c +++ b/arch/x86_64/kernel/pci-swiotlb.c @@ -5,7 +5,7 @@ #include <linux/module.h> #include <linux/dma-mapping.h> -#include <asm/proto.h> +#include <asm/iommu.h> #include <asm/swiotlb.h> #include <asm/dma.h> diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c index 5909039f37a..e7ac629d4c4 100644 --- a/arch/x86_64/kernel/process.c +++ b/arch/x86_64/kernel/process.c @@ -207,6 +207,7 @@ void cpu_idle (void) if (__get_cpu_var(cpu_idle_state)) __get_cpu_var(cpu_idle_state) = 0; + check_pgt_cache(); rmb(); idle = pm_idle; if (!idle) @@ -278,7 +279,7 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) */ if (!pm_idle) { if (!printed) { - printk("using mwait in idle threads.\n"); + printk(KERN_INFO "using mwait in idle threads.\n"); printed = 1; } pm_idle = mwait_idle; @@ -305,6 +306,7 @@ early_param("idle", idle_setup); void __show_regs(struct pt_regs * regs) { unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs; + unsigned long d0, d1, d2, d3, d6, d7; unsigned int fsindex,gsindex; unsigned int ds,cs,es; @@ -340,15 +342,24 @@ void __show_regs(struct pt_regs * regs) rdmsrl(MSR_GS_BASE, gs); rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); - asm("movq %%cr0, %0": "=r" (cr0)); - asm("movq %%cr2, %0": "=r" (cr2)); - asm("movq %%cr3, %0": "=r" (cr3)); - asm("movq %%cr4, %0": "=r" (cr4)); + cr0 = read_cr0(); + cr2 = read_cr2(); + cr3 = read_cr3(); + cr4 = read_cr4(); printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", fs,fsindex,gs,gsindex,shadowgs); printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0); printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4); + + get_debugreg(d0, 0); + get_debugreg(d1, 1); + get_debugreg(d2, 2); + printk("DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2); + get_debugreg(d3, 3); + get_debugreg(d6, 6); + get_debugreg(d7, 7); + printk("DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7); } void show_regs(struct pt_regs *regs) diff --git a/arch/x86_64/kernel/ptrace.c b/arch/x86_64/kernel/ptrace.c index 9409117b9f1..e83cc67155a 100644 --- a/arch/x86_64/kernel/ptrace.c +++ b/arch/x86_64/kernel/ptrace.c @@ -102,16 +102,25 @@ unsigned long convert_rip_to_linear(struct task_struct *child, struct pt_regs *r u32 *desc; unsigned long base; - down(&child->mm->context.sem); - desc = child->mm->context.ldt + (seg & ~7); - base = (desc[0] >> 16) | ((desc[1] & 0xff) << 16) | (desc[1] & 0xff000000); + seg &= ~7UL; - /* 16-bit code segment? */ - if (!((desc[1] >> 22) & 1)) - addr &= 0xffff; - addr += base; + down(&child->mm->context.sem); + if (unlikely((seg >> 3) >= child->mm->context.size)) + addr = -1L; /* bogus selector, access would fault */ + else { + desc = child->mm->context.ldt + seg; + base = ((desc[0] >> 16) | + ((desc[1] & 0xff) << 16) | + (desc[1] & 0xff000000)); + + /* 16-bit code segment? */ + if (!((desc[1] >> 22) & 1)) + addr &= 0xffff; + addr += base; + } up(&child->mm->context.sem); } + return addr; } @@ -313,17 +322,9 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) switch (request) { /* when I and D space are separate, these will need to be fixed. */ case PTRACE_PEEKTEXT: /* read word at location addr. 
*/ - case PTRACE_PEEKDATA: { - unsigned long tmp; - int copied; - - copied = access_process_vm(child, addr, &tmp, sizeof(tmp), 0); - ret = -EIO; - if (copied != sizeof(tmp)) - break; - ret = put_user(tmp,(unsigned long __user *) data); + case PTRACE_PEEKDATA: + ret = generic_ptrace_peekdata(child, addr, data); break; - } /* read the word at location addr in the USER area. */ case PTRACE_PEEKUSR: { @@ -367,10 +368,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) /* when I and D space are separate, this will have to be fixed. */ case PTRACE_POKETEXT: /* write the word at location addr. */ case PTRACE_POKEDATA: - ret = 0; - if (access_process_vm(child, addr, &data, sizeof(data), 1) == sizeof(data)) - break; - ret = -EIO; + ret = generic_ptrace_pokedata(child, addr, data); break; case PTRACE_POKEUSR: /* write the word at location addr in the USER area */ diff --git a/arch/x86_64/kernel/reboot.c b/arch/x86_64/kernel/reboot.c index 7503068e788..368db2b9c5a 100644 --- a/arch/x86_64/kernel/reboot.c +++ b/arch/x86_64/kernel/reboot.c @@ -16,6 +16,7 @@ #include <asm/pgtable.h> #include <asm/tlbflush.h> #include <asm/apic.h> +#include <asm/iommu.h> /* * Power off function, if any @@ -81,6 +82,7 @@ static inline void kb_wait(void) void machine_shutdown(void) { unsigned long flags; + /* Stop the cpus and apics */ #ifdef CONFIG_SMP int reboot_cpu_id; @@ -111,6 +113,8 @@ void machine_shutdown(void) disable_IO_APIC(); local_irq_restore(flags); + + pci_iommu_shutdown(); } void machine_emergency_restart(void) diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index eb6524f3ac2..af838f6b0b7 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -575,6 +575,8 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) level = cpuid_eax(1); if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)) set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability); + if (c->x86 == 0x10) + set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability); /* Enable workaround for FXSAVE leak */ if (c->x86 >= 6) @@ -600,8 +602,14 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) if (c->extended_cpuid_level >= 0x80000008) amd_detect_cmp(c); - /* Fix cpuid4 emulation for more */ - num_cache_leaves = 3; + if (c->extended_cpuid_level >= 0x80000006 && + (cpuid_edx(0x80000006) & 0xf000)) + num_cache_leaves = 4; + else + num_cache_leaves = 3; + + if (c->x86 == 0xf || c->x86 == 0x10 || c->x86 == 0x11) + set_bit(X86_FEATURE_K8, &c->x86_capability); /* RDTSC can be speculated around */ clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability); @@ -846,6 +854,8 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c) c->x86_capability[2] = cpuid_edx(0x80860001); } + init_scattered_cpuid_features(c); + c->apicid = phys_pkg_id(0); /* @@ -931,7 +941,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce", "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov", "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx", - "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", NULL, + "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", "pbe", /* AMD-defined */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, @@ -947,10 +957,11 @@ static int show_cpuinfo(struct seq_file *m, void *v) NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* Other (Linux-defined) */ - "cxmmx", NULL, "cyrix_arr", "centaur_mcr", NULL, - "constant_tsc", NULL, NULL, - "up", NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, 
NULL, NULL, + "cxmmx", "k6_mtrr", "cyrix_arr", "centaur_mcr", + NULL, NULL, NULL, NULL, + "constant_tsc", "up", NULL, "arch_perfmon", + "pebs", "bts", NULL, "sync_rdtsc", + "rep_good", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* Intel-defined (#2) */ @@ -961,7 +972,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) /* VIA/Cyrix/Centaur-defined */ NULL, NULL, "rng", "rng_en", NULL, NULL, "ace", "ace_en", - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + "ace2", "ace2_en", "phe", "phe_en", "pmm", "pmm_en", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, @@ -972,6 +983,12 @@ static int show_cpuinfo(struct seq_file *m, void *v) "osvw", "ibs", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + + /* Auxiliary (Linux-defined) */ + "ida", NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, }; static char *x86_power_flags[] = { "ts", /* temperature sensor */ diff --git a/arch/x86_64/kernel/signal.c b/arch/x86_64/kernel/signal.c index 290f5d8037c..739175b01e0 100644 --- a/arch/x86_64/kernel/signal.c +++ b/arch/x86_64/kernel/signal.c @@ -26,6 +26,7 @@ #include <asm/i387.h> #include <asm/proto.h> #include <asm/ia32_unistd.h> +#include <asm/mce.h> /* #define DEBUG_SIG 1 */ @@ -472,6 +473,12 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) clear_thread_flag(TIF_SINGLESTEP); } +#ifdef CONFIG_X86_MCE + /* notify userspace of pending MCEs */ + if (thread_info_flags & _TIF_MCE_NOTIFY) + mce_notify_user(); +#endif /* CONFIG_X86_MCE */ + /* deal with pending signal delivery */ if (thread_info_flags & (_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK)) do_signal(regs); @@ -480,7 +487,7 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) void signal_fault(struct pt_regs *regs, void __user *frame, char *where) { struct task_struct *me = current; - if (exception_trace) + if (show_unhandled_signals && printk_ratelimit()) printk("%s[%d] bad frame in %s frame:%p rip:%lx rsp:%lx orax:%lx\n", me->comm,me->pid,where,frame,regs->rip,regs->rsp,regs->orig_rax); diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c index 2ff46859162..673a300b594 100644 --- a/arch/x86_64/kernel/smp.c +++ b/arch/x86_64/kernel/smp.c @@ -241,7 +241,7 @@ void flush_tlb_mm (struct mm_struct * mm) } if (!cpus_empty(cpu_mask)) flush_tlb_others(cpu_mask, mm, FLUSH_ALL); - + check_pgt_cache(); preempt_enable(); } EXPORT_SYMBOL(flush_tlb_mm); @@ -357,7 +357,7 @@ __smp_call_function_single(int cpu, void (*func) (void *info), void *info, } /* - * smp_call_function_single - Run a function on another CPU + * smp_call_function_single - Run a function on a specific CPU * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. * @nonatomic: Currently unused. 
@@ -374,17 +374,21 @@ int smp_call_function_single (int cpu, void (*func) (void *info), void *info, { /* prevent preemption and reschedule on another processor */ int me = get_cpu(); + + /* Can deadlock when called with interrupts disabled */ + WARN_ON(irqs_disabled()); + if (cpu == me) { + local_irq_disable(); + func(info); + local_irq_enable(); put_cpu(); return 0; } - /* Can deadlock when called with interrupts disabled */ - WARN_ON(irqs_disabled()); - - spin_lock_bh(&call_lock); + spin_lock(&call_lock); __smp_call_function_single(cpu, func, info, nonatomic, wait); - spin_unlock_bh(&call_lock); + spin_unlock(&call_lock); put_cpu(); return 0; } diff --git a/arch/x86_64/kernel/suspend.c b/arch/x86_64/kernel/suspend.c index 6a5a98f2a75..ea83a9f9196 100644 --- a/arch/x86_64/kernel/suspend.c +++ b/arch/x86_64/kernel/suspend.c @@ -55,11 +55,11 @@ void __save_processor_state(struct saved_context *ctxt) * control registers */ rdmsrl(MSR_EFER, ctxt->efer); - asm volatile ("movq %%cr0, %0" : "=r" (ctxt->cr0)); - asm volatile ("movq %%cr2, %0" : "=r" (ctxt->cr2)); - asm volatile ("movq %%cr3, %0" : "=r" (ctxt->cr3)); - asm volatile ("movq %%cr4, %0" : "=r" (ctxt->cr4)); - asm volatile ("movq %%cr8, %0" : "=r" (ctxt->cr8)); + ctxt->cr0 = read_cr0(); + ctxt->cr2 = read_cr2(); + ctxt->cr3 = read_cr3(); + ctxt->cr4 = read_cr4(); + ctxt->cr8 = read_cr8(); } void save_processor_state(void) @@ -81,11 +81,11 @@ void __restore_processor_state(struct saved_context *ctxt) * control registers */ wrmsrl(MSR_EFER, ctxt->efer); - asm volatile ("movq %0, %%cr8" :: "r" (ctxt->cr8)); - asm volatile ("movq %0, %%cr4" :: "r" (ctxt->cr4)); - asm volatile ("movq %0, %%cr3" :: "r" (ctxt->cr3)); - asm volatile ("movq %0, %%cr2" :: "r" (ctxt->cr2)); - asm volatile ("movq %0, %%cr0" :: "r" (ctxt->cr0)); + write_cr8(ctxt->cr8); + write_cr4(ctxt->cr4); + write_cr3(ctxt->cr3); + write_cr2(ctxt->cr2); + write_cr0(ctxt->cr0); /* * now restore the descriptor tables to their proper values diff --git a/arch/x86_64/kernel/tce.c b/arch/x86_64/kernel/tce.c index f61fb8e4f12..3aeae2fa2e2 100644 --- a/arch/x86_64/kernel/tce.c +++ b/arch/x86_64/kernel/tce.c @@ -136,9 +136,9 @@ int build_tce_table(struct pci_dev *dev, void __iomem *bbar) struct iommu_table *tbl; int ret; - if (dev->sysdata) { - printk(KERN_ERR "Calgary: dev %p has sysdata %p\n", - dev, dev->sysdata); + if (pci_iommu(dev->bus)) { + printk(KERN_ERR "Calgary: dev %p has sysdata->iommu %p\n", + dev, pci_iommu(dev->bus)); BUG(); } @@ -155,11 +155,7 @@ int build_tce_table(struct pci_dev *dev, void __iomem *bbar) tbl->bbar = bbar; - /* - * NUMA is already using the bus's sysdata pointer, so we use - * the bus's pci_dev's sysdata instead. - */ - dev->sysdata = tbl; + set_pci_iommu(dev->bus, tbl); return 0; diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index 4a0895bacf5..6d48a4e826d 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -33,6 +33,7 @@ #include <acpi/acpi_bus.h> #endif #include <asm/8253pit.h> +#include <asm/i8253.h> #include <asm/pgtable.h> #include <asm/vsyscall.h> #include <asm/timex.h> @@ -44,12 +45,14 @@ #include <asm/hpet.h> #include <asm/mpspec.h> #include <asm/nmi.h> +#include <asm/vgtod.h> static char *timename = NULL; DEFINE_SPINLOCK(rtc_lock); EXPORT_SYMBOL(rtc_lock); DEFINE_SPINLOCK(i8253_lock); +EXPORT_SYMBOL(i8253_lock); volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; @@ -79,8 +82,9 @@ EXPORT_SYMBOL(profile_pc); * sheet for details. 
*/ -static void set_rtc_mmss(unsigned long nowtime) +static int set_rtc_mmss(unsigned long nowtime) { + int retval = 0; int real_seconds, real_minutes, cmos_minutes; unsigned char control, freq_select; @@ -120,6 +124,7 @@ static void set_rtc_mmss(unsigned long nowtime) if (abs(real_minutes - cmos_minutes) >= 30) { printk(KERN_WARNING "time.c: can't update CMOS clock " "from %d to %d\n", cmos_minutes, real_minutes); + retval = -1; } else { BIN_TO_BCD(real_seconds); BIN_TO_BCD(real_minutes); @@ -139,12 +144,17 @@ static void set_rtc_mmss(unsigned long nowtime) CMOS_WRITE(freq_select, RTC_FREQ_SELECT); spin_unlock(&rtc_lock); + + return retval; } +int update_persistent_clock(struct timespec now) +{ + return set_rtc_mmss(now.tv_sec); +} void main_timer_handler(void) { - static unsigned long rtc_update = 0; /* * Here we are in the timer irq handler. We have irqs locally disabled (so we * don't need spin_lock_irqsave()) but we don't know if the timer_bh is running @@ -172,20 +182,6 @@ void main_timer_handler(void) if (!using_apic_timer) smp_local_timer_interrupt(); -/* - * If we have an externally synchronized Linux clock, then update CMOS clock - * accordingly every ~11 minutes. set_rtc_mmss() will be called in the jiffy - * closest to exactly 500 ms before the next second. If the update fails, we - * don't care, as it'll be updated on the next turn, and the problem (time way - * off) isn't likely to go away much sooner anyway. - */ - - if (ntp_synced() && xtime.tv_sec > rtc_update && - abs(xtime.tv_nsec - 500000000) <= tick_nsec / 2) { - set_rtc_mmss(xtime.tv_sec); - rtc_update = xtime.tv_sec + 660; - } - write_sequnlock(&xtime_lock); } @@ -199,7 +195,7 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } -static unsigned long get_cmos_time(void) +unsigned long read_persistent_clock(void) { unsigned int year, mon, day, hour, min, sec; unsigned long flags; @@ -226,7 +222,7 @@ static unsigned long get_cmos_time(void) /* * We know that x86-64 always uses BCD format, no need to check the * config register. - */ + */ BCD_TO_BIN(sec); BCD_TO_BIN(min); @@ -239,11 +235,11 @@ static unsigned long get_cmos_time(void) BCD_TO_BIN(century); year += century * 100; printk(KERN_INFO "Extended CMOS year: %d\n", century * 100); - } else { + } else { /* * x86-64 systems only exists since 2002. 
* This will work up to Dec 31, 2100 - */ + */ year += 2000; } @@ -255,45 +251,45 @@ static unsigned long get_cmos_time(void) #define TICK_COUNT 100000000 static unsigned int __init tsc_calibrate_cpu_khz(void) { - int tsc_start, tsc_now; - int i, no_ctr_free; - unsigned long evntsel3 = 0, pmc3 = 0, pmc_now = 0; - unsigned long flags; - - for (i = 0; i < 4; i++) - if (avail_to_resrv_perfctr_nmi_bit(i)) - break; - no_ctr_free = (i == 4); - if (no_ctr_free) { - i = 3; - rdmsrl(MSR_K7_EVNTSEL3, evntsel3); - wrmsrl(MSR_K7_EVNTSEL3, 0); - rdmsrl(MSR_K7_PERFCTR3, pmc3); - } else { - reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i); - reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i); - } - local_irq_save(flags); - /* start meauring cycles, incrementing from 0 */ - wrmsrl(MSR_K7_PERFCTR0 + i, 0); - wrmsrl(MSR_K7_EVNTSEL0 + i, 1 << 22 | 3 << 16 | 0x76); - rdtscl(tsc_start); - do { - rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now); - tsc_now = get_cycles_sync(); - } while ((tsc_now - tsc_start) < TICK_COUNT); - - local_irq_restore(flags); - if (no_ctr_free) { - wrmsrl(MSR_K7_EVNTSEL3, 0); - wrmsrl(MSR_K7_PERFCTR3, pmc3); - wrmsrl(MSR_K7_EVNTSEL3, evntsel3); - } else { - release_perfctr_nmi(MSR_K7_PERFCTR0 + i); - release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); - } - - return pmc_now * tsc_khz / (tsc_now - tsc_start); + int tsc_start, tsc_now; + int i, no_ctr_free; + unsigned long evntsel3 = 0, pmc3 = 0, pmc_now = 0; + unsigned long flags; + + for (i = 0; i < 4; i++) + if (avail_to_resrv_perfctr_nmi_bit(i)) + break; + no_ctr_free = (i == 4); + if (no_ctr_free) { + i = 3; + rdmsrl(MSR_K7_EVNTSEL3, evntsel3); + wrmsrl(MSR_K7_EVNTSEL3, 0); + rdmsrl(MSR_K7_PERFCTR3, pmc3); + } else { + reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i); + reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i); + } + local_irq_save(flags); + /* start meauring cycles, incrementing from 0 */ + wrmsrl(MSR_K7_PERFCTR0 + i, 0); + wrmsrl(MSR_K7_EVNTSEL0 + i, 1 << 22 | 3 << 16 | 0x76); + rdtscl(tsc_start); + do { + rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now); + tsc_now = get_cycles_sync(); + } while ((tsc_now - tsc_start) < TICK_COUNT); + + local_irq_restore(flags); + if (no_ctr_free) { + wrmsrl(MSR_K7_EVNTSEL3, 0); + wrmsrl(MSR_K7_PERFCTR3, pmc3); + wrmsrl(MSR_K7_EVNTSEL3, evntsel3); + } else { + release_perfctr_nmi(MSR_K7_PERFCTR0 + i); + release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); + } + + return pmc_now * tsc_khz / (tsc_now - tsc_start); } /* @@ -321,7 +317,7 @@ static unsigned int __init pit_calibrate_tsc(void) end = get_cycles_sync(); spin_unlock_irqrestore(&i8253_lock, flags); - + return (end - start) / 50; } @@ -366,25 +362,20 @@ static struct irqaction irq0 = { .handler = timer_interrupt, .flags = IRQF_DISABLED | IRQF_IRQPOLL, .mask = CPU_MASK_NONE, - .name = "timer" + .name = "timer" }; void __init time_init(void) { if (nohpet) hpet_address = 0; - xtime.tv_sec = get_cmos_time(); - xtime.tv_nsec = 0; - - set_normalized_timespec(&wall_to_monotonic, - -xtime.tv_sec, -xtime.tv_nsec); if (hpet_arch_init()) hpet_address = 0; if (hpet_use_timer) { /* set tick_nsec to use the proper rate for HPET */ - tick_nsec = TICK_NSEC_HPET; + tick_nsec = TICK_NSEC_HPET; tsc_khz = hpet_calibrate_tsc(); timename = "HPET"; } else { @@ -415,54 +406,21 @@ void __init time_init(void) setup_irq(0, &irq0); } - -static long clock_cmos_diff; -static unsigned long sleep_start; - /* * sysfs support for the timer. 
*/ static int timer_suspend(struct sys_device *dev, pm_message_t state) { - /* - * Estimate time zone so that set_time can update the clock - */ - long cmos_time = get_cmos_time(); - - clock_cmos_diff = -cmos_time; - clock_cmos_diff += get_seconds(); - sleep_start = cmos_time; return 0; } static int timer_resume(struct sys_device *dev) { - unsigned long flags; - unsigned long sec; - unsigned long ctime = get_cmos_time(); - long sleep_length = (ctime - sleep_start) * HZ; - - if (sleep_length < 0) { - printk(KERN_WARNING "Time skew detected in timer resume!\n"); - /* The time after the resume must not be earlier than the time - * before the suspend or some nasty things will happen - */ - sleep_length = 0; - ctime = sleep_start; - } if (hpet_address) hpet_reenable(); else i8254_timer_resume(); - - sec = ctime + clock_cmos_diff; - write_seqlock_irqsave(&xtime_lock,flags); - xtime.tv_sec = sec; - xtime.tv_nsec = 0; - jiffies += sleep_length; - write_sequnlock_irqrestore(&xtime_lock,flags); - touch_softlockup_watchdog(); return 0; } diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index aac1c0be54c..03888420775 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -34,6 +34,10 @@ #include <linux/bug.h> #include <linux/kdebug.h> +#if defined(CONFIG_EDAC) +#include <linux/edac.h> +#endif + #include <asm/system.h> #include <asm/io.h> #include <asm/atomic.h> @@ -330,6 +334,7 @@ static int print_trace_stack(void *data, char *name) static void print_trace_address(void *data, unsigned long addr) { + touch_nmi_watchdog(); printk_address(addr); } @@ -518,6 +523,7 @@ void __kprobes __die(const char * str, struct pt_regs * regs, long err) printk("\n"); notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV); show_registers(regs); + add_taint(TAINT_DIE); /* Executive summary in case the oops scrolled away */ printk(KERN_ALERT "RIP "); printk_address(regs->rip); @@ -531,7 +537,7 @@ void die(const char * str, struct pt_regs * regs, long err) unsigned long flags = oops_begin(); if (!user_mode(regs)) - report_bug(regs->rip); + report_bug(regs->rip, regs); __die(str, regs, err); oops_end(flags); @@ -578,7 +584,8 @@ static void __kprobes do_trap(int trapnr, int signr, char *str, tsk->thread.error_code = error_code; tsk->thread.trap_no = trapnr; - if (exception_trace && unhandled_signal(tsk, signr)) + if (show_unhandled_signals && unhandled_signal(tsk, signr) && + printk_ratelimit()) printk(KERN_INFO "%s[%d] trap %s rip:%lx rsp:%lx error:%lx\n", tsk->comm, tsk->pid, str, @@ -682,7 +689,8 @@ asmlinkage void __kprobes do_general_protection(struct pt_regs * regs, tsk->thread.error_code = error_code; tsk->thread.trap_no = 13; - if (exception_trace && unhandled_signal(tsk, SIGSEGV)) + if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && + printk_ratelimit()) printk(KERN_INFO "%s[%d] general protection rip:%lx rsp:%lx error:%lx\n", tsk->comm, tsk->pid, @@ -717,6 +725,13 @@ mem_parity_error(unsigned char reason, struct pt_regs * regs) reason); printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n"); +#if defined(CONFIG_EDAC) + if(edac_handler_set()) { + edac_atomic_assert_error(); + return; + } +#endif + if (panic_on_unrecovered_nmi) panic("NMI: Not continuing"); diff --git a/arch/x86_64/kernel/tsc.c b/arch/x86_64/kernel/tsc.c index 48f9a8e6aa9..9b76b03d060 100644 --- a/arch/x86_64/kernel/tsc.c +++ b/arch/x86_64/kernel/tsc.c @@ -44,7 +44,7 @@ unsigned long long sched_clock(void) static int tsc_unstable; -static inline int 
check_tsc_unstable(void) +inline int check_tsc_unstable(void) { return tsc_unstable; } @@ -61,25 +61,9 @@ static inline int check_tsc_unstable(void) * first tick after the change will be slightly wrong. */ -#include <linux/workqueue.h> - -static unsigned int cpufreq_delayed_issched = 0; -static unsigned int cpufreq_init = 0; -static struct work_struct cpufreq_delayed_get_work; - -static void handle_cpufreq_delayed_get(struct work_struct *v) -{ - unsigned int cpu; - for_each_online_cpu(cpu) { - cpufreq_get(cpu); - } - cpufreq_delayed_issched = 0; -} - -static unsigned int ref_freq = 0; -static unsigned long loops_per_jiffy_ref = 0; - -static unsigned long tsc_khz_ref = 0; +static unsigned int ref_freq; +static unsigned long loops_per_jiffy_ref; +static unsigned long tsc_khz_ref; static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data) @@ -125,10 +109,8 @@ static struct notifier_block time_cpufreq_notifier_block = { static int __init cpufreq_tsc(void) { - INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get); - if (!cpufreq_register_notifier(&time_cpufreq_notifier_block, - CPUFREQ_TRANSITION_NOTIFIER)) - cpufreq_init = 1; + cpufreq_register_notifier(&time_cpufreq_notifier_block, + CPUFREQ_TRANSITION_NOTIFIER); return 0; } @@ -153,17 +135,18 @@ __cpuinit int unsynchronized_tsc(void) #endif /* Most intel systems have synchronized TSCs except for multi node systems */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { #ifdef CONFIG_ACPI /* But TSC doesn't tick in C3 so don't use it there */ - if (acpi_gbl_FADT.header.length > 0 && acpi_gbl_FADT.C3latency < 1000) + if (acpi_gbl_FADT.header.length > 0 && + acpi_gbl_FADT.C3latency < 1000) return 1; #endif - return 0; + return 0; } - /* Assume multi socket systems are not synchronized */ - return num_present_cpus() > 1; + /* Assume multi socket systems are not synchronized */ + return num_present_cpus() > 1; } int __init notsc_setup(char *s) diff --git a/arch/x86_64/kernel/verify_cpu.S b/arch/x86_64/kernel/verify_cpu.S index e035f594819..45b6f8a975a 100644 --- a/arch/x86_64/kernel/verify_cpu.S +++ b/arch/x86_64/kernel/verify_cpu.S @@ -37,20 +37,6 @@ verify_cpu: pushl $0 # Kill any dangerous flags popfl - /* minimum CPUID flags for x86-64 as defined by AMD */ -#define M(x) (1<<(x)) -#define M2(a,b) M(a)|M(b) -#define M4(a,b,c,d) M(a)|M(b)|M(c)|M(d) - -#define SSE_MASK \ - (M2(X86_FEATURE_XMM,X86_FEATURE_XMM2)) -#define REQUIRED_MASK1 \ - (M4(X86_FEATURE_FPU,X86_FEATURE_PSE,X86_FEATURE_TSC,X86_FEATURE_MSR)|\ - M4(X86_FEATURE_PAE,X86_FEATURE_CX8,X86_FEATURE_PGE,X86_FEATURE_CMOV)|\ - M(X86_FEATURE_FXSR)) -#define REQUIRED_MASK2 \ - (M(X86_FEATURE_LM - 32)) - pushfl # standard way to check for cpuid popl %eax movl %eax,%ebx @@ -79,8 +65,8 @@ verify_cpu: verify_cpu_noamd: movl $0x1,%eax # Does the cpu have what it takes cpuid - andl $REQUIRED_MASK1,%edx - xorl $REQUIRED_MASK1,%edx + andl $REQUIRED_MASK0,%edx + xorl $REQUIRED_MASK0,%edx jnz verify_cpu_no_longmode movl $0x80000000,%eax # See if extended cpuid is implemented @@ -90,8 +76,8 @@ verify_cpu_noamd: movl $0x80000001,%eax # Does the cpu have what it takes cpuid - andl $REQUIRED_MASK2,%edx - xorl $REQUIRED_MASK2,%edx + andl $REQUIRED_MASK1,%edx + xorl $REQUIRED_MASK1,%edx jnz verify_cpu_no_longmode verify_cpu_sse_test: diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S index dbccfda8364..ba8ea97abd2 100644 --- a/arch/x86_64/kernel/vmlinux.lds.S +++ 
b/arch/x86_64/kernel/vmlinux.lds.S @@ -28,7 +28,7 @@ SECTIONS _text = .; /* Text and read-only data */ .text : AT(ADDR(.text) - LOAD_OFFSET) { /* First the code that has to be first for bootstrapping */ - *(.bootstrap.text) + *(.text.head) _stext = .; /* Then the rest */ TEXT_TEXT @@ -48,10 +48,19 @@ SECTIONS __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { *(__ex_table) } __stop___ex_table = .; - BUG_TABLE + NOTES :text :note + + BUG_TABLE :text RODATA + . = ALIGN(4); + .tracedata : AT(ADDR(.tracedata) - LOAD_OFFSET) { + __tracedata_start = .; + *(.tracedata) + __tracedata_end = .; + } + . = ALIGN(PAGE_SIZE); /* Align data segment to page size boundary */ /* Data */ .data : AT(ADDR(.data) - LOAD_OFFSET) { @@ -91,6 +100,9 @@ SECTIONS .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) { *(.vsyscall_gtod_data) } vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data); + .vsyscall_clock : AT(VLOAD(.vsyscall_clock)) + { *(.vsyscall_clock) } + vsyscall_clock = VVIRT(.vsyscall_clock); .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) @@ -131,20 +143,11 @@ SECTIONS /* might get freed after init */ . = ALIGN(4096); __smp_alt_begin = .; - __smp_alt_instructions = .; - .smp_altinstructions : AT(ADDR(.smp_altinstructions) - LOAD_OFFSET) { - *(.smp_altinstructions) - } - __smp_alt_instructions_end = .; - . = ALIGN(8); __smp_locks = .; .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { *(.smp_locks) } __smp_locks_end = .; - .smp_altinstr_replacement : AT(ADDR(.smp_altinstr_replacement) - LOAD_OFFSET) { - *(.smp_altinstr_replacement) - } . = ALIGN(4096); __smp_alt_end = .; @@ -187,6 +190,12 @@ SECTIONS .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) } .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { *(.exit.data) } +/* vdso blob that is mapped into user space */ + vdso_start = . ; + .vdso : AT(ADDR(.vdso) - LOAD_OFFSET) { *(.vdso) } + . = ALIGN(4096); + vdso_end = .; + #ifdef CONFIG_BLK_DEV_INITRD . = ALIGN(4096); __initramfs_start = .; @@ -194,10 +203,8 @@ SECTIONS __initramfs_end = .; #endif - . = ALIGN(4096); - __per_cpu_start = .; - .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { *(.data.percpu) } - __per_cpu_end = .; + PERCPU(4096) + . 
= ALIGN(4096); __init_end = .; diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c index 57660d58d50..06c34949bfd 100644 --- a/arch/x86_64/kernel/vsyscall.c +++ b/arch/x86_64/kernel/vsyscall.c @@ -42,6 +42,7 @@ #include <asm/segment.h> #include <asm/desc.h> #include <asm/topology.h> +#include <asm/vgtod.h> #define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) #define __syscall_clobber "r11","rcx","memory" @@ -57,26 +58,9 @@ * - writen by timer interrupt or systcl (/proc/sys/kernel/vsyscall64) * Try to keep this structure as small as possible to avoid cache line ping pongs */ -struct vsyscall_gtod_data_t { - seqlock_t lock; - - /* open coded 'struct timespec' */ - time_t wall_time_sec; - u32 wall_time_nsec; - - int sysctl_enabled; - struct timezone sys_tz; - struct { /* extract of a clocksource struct */ - cycle_t (*vread)(void); - cycle_t cycle_last; - cycle_t mask; - u32 mult; - u32 shift; - } clock; -}; int __vgetcpu_mode __section_vgetcpu_mode; -struct vsyscall_gtod_data_t __vsyscall_gtod_data __section_vsyscall_gtod_data = +struct vsyscall_gtod_data __vsyscall_gtod_data __section_vsyscall_gtod_data = { .lock = SEQLOCK_UNLOCKED, .sysctl_enabled = 1, @@ -96,6 +80,8 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock) vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; vsyscall_gtod_data.sys_tz = sys_tz; + vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; + vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic; write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); } diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c index 635e58d443d..327c9f2fa62 100644 --- a/arch/x86_64/mm/fault.c +++ b/arch/x86_64/mm/fault.c @@ -159,7 +159,7 @@ void dump_pagetable(unsigned long address) pmd_t *pmd; pte_t *pte; - asm("movq %%cr3,%0" : "=r" (pgd)); + pgd = (pgd_t *)read_cr3(); pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK); pgd += pgd_index(address); @@ -221,16 +221,6 @@ static int is_errata93(struct pt_regs *regs, unsigned long address) return 0; } -int unhandled_signal(struct task_struct *tsk, int sig) -{ - if (is_init(tsk)) - return 1; - if (tsk->ptrace & PT_PTRACED) - return 0; - return (tsk->sighand->action[sig-1].sa.sa_handler == SIG_IGN) || - (tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL); -} - static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs, unsigned long error_code) { @@ -301,8 +291,8 @@ static int vmalloc_fault(unsigned long address) return 0; } -int page_fault_trace = 0; -int exception_trace = 1; +static int page_fault_trace; +int show_unhandled_signals = 1; /* * This routine handles page faults. It determines the address, @@ -317,7 +307,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, struct vm_area_struct * vma; unsigned long address; const struct exception_table_entry *fixup; - int write; + int write, fault; unsigned long flags; siginfo_t info; @@ -326,7 +316,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, prefetchw(&mm->mmap_sem); /* get the address */ - __asm__("movq %%cr2,%0":"=r" (address)); + address = read_cr2(); info.si_code = SEGV_MAPERR; @@ -450,19 +440,18 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault. 
*/ - switch (handle_mm_fault(mm, vma, address, write)) { - case VM_FAULT_MINOR: - tsk->min_flt++; - break; - case VM_FAULT_MAJOR: - tsk->maj_flt++; - break; - case VM_FAULT_SIGBUS: - goto do_sigbus; - default: - goto out_of_memory; + fault = handle_mm_fault(mm, vma, address, write); + if (unlikely(fault & VM_FAULT_ERROR)) { + if (fault & VM_FAULT_OOM) + goto out_of_memory; + else if (fault & VM_FAULT_SIGBUS) + goto do_sigbus; + BUG(); } - + if (fault & VM_FAULT_MAJOR) + tsk->maj_flt++; + else + tsk->min_flt++; up_read(&mm->mmap_sem); return; @@ -495,7 +484,8 @@ bad_area_nosemaphore: (address >> 32)) return; - if (exception_trace && unhandled_signal(tsk, SIGSEGV)) { + if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && + printk_ratelimit()) { printk( "%s%s[%d]: segfault at %016lx rip %016lx rsp %016lx error %lx\n", tsk->pid > 1 ? KERN_INFO : KERN_EMERG, @@ -569,7 +559,7 @@ out_of_memory: } printk("VM: killing process %s\n", tsk->comm); if (error_code & 4) - do_exit(SIGKILL); + do_group_exit(SIGKILL); goto no_context; do_sigbus: diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c index 9a0e98accf0..38f5d636800 100644 --- a/arch/x86_64/mm/init.c +++ b/arch/x86_64/mm/init.c @@ -383,7 +383,7 @@ void __meminit init_memory_mapping(unsigned long start, unsigned long end) } if (!after_bootmem) - asm volatile("movq %%cr4,%0" : "=r" (mmu_cr4_features)); + mmu_cr4_features = read_cr4(); __flush_tlb_all(); } @@ -600,16 +600,6 @@ void mark_rodata_ro(void) { unsigned long start = (unsigned long)_stext, end; -#ifdef CONFIG_HOTPLUG_CPU - /* It must still be possible to apply SMP alternatives. */ - if (num_possible_cpus() > 1) - start = (unsigned long)_etext; -#endif - -#ifdef CONFIG_KPROBES - start = (unsigned long)__start_rodata; -#endif - end = (unsigned long)__end_rodata; start = (start + PAGE_SIZE - 1) & PAGE_MASK; end &= PAGE_MASK; @@ -697,41 +687,6 @@ int kern_addr_valid(unsigned long addr) return pfn_valid(pte_pfn(*pte)); } -#ifdef CONFIG_SYSCTL -#include <linux/sysctl.h> - -extern int exception_trace, page_fault_trace; - -static ctl_table debug_table2[] = { - { - .ctl_name = 99, - .procname = "exception-trace", - .data = &exception_trace, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - {} -}; - -static ctl_table debug_root_table2[] = { - { - .ctl_name = CTL_DEBUG, - .procname = "debug", - .mode = 0555, - .child = debug_table2 - }, - {} -}; - -static __init int x8664_sysctl_init(void) -{ - register_sysctl_table(debug_root_table2); - return 0; -} -__initcall(x8664_sysctl_init); -#endif - /* A pseudo VMA to allow ptrace access for the vsyscall page. This only covers the 64bit vsyscall page now. 32bit has a real VMA now and does not need special handling anymore. 
*/ @@ -769,8 +724,17 @@ int in_gate_area_no_task(unsigned long addr) return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END); } -void *alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size) +void * __init alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size) { return __alloc_bootmem_core(pgdat->bdata, size, SMP_CACHE_BYTES, (4UL*1024*1024*1024), 0); } + +const char *arch_vma_name(struct vm_area_struct *vma) +{ + if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso) + return "[vdso]"; + if (vma == &gate_vma) + return "[vsyscall]"; + return NULL; +} diff --git a/arch/x86_64/mm/k8topology.c b/arch/x86_64/mm/k8topology.c index f983c75825d..a96006f7ae0 100644 --- a/arch/x86_64/mm/k8topology.c +++ b/arch/x86_64/mm/k8topology.c @@ -44,12 +44,12 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end) { unsigned long prevbase; struct bootnode nodes[8]; - int nodeid, i, nb; + int nodeid, i, j, nb; unsigned char nodeids[8]; int found = 0; u32 reg; unsigned numnodes; - unsigned dualcore = 0; + unsigned num_cores; if (!early_pci_allowed()) return -1; @@ -60,6 +60,9 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end) printk(KERN_INFO "Scanning NUMA topology in Northbridge %d\n", nb); + num_cores = (cpuid_ecx(0x80000008) & 0xff) + 1; + printk(KERN_INFO "CPU has %d num_cores\n", num_cores); + reg = read_pci_config(0, nb, 0, 0x60); numnodes = ((reg >> 4) & 0xF) + 1; if (numnodes <= 1) @@ -73,8 +76,6 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end) unsigned long base,limit; u32 nodeid; - /* Undefined before E stepping, but hopefully 0 */ - dualcore |= ((read_pci_config(0, nb, 3, 0xe8) >> 12) & 3) == 1; base = read_pci_config(0, nb, 1, 0x40 + i*8); limit = read_pci_config(0, nb, 1, 0x44 + i*8); @@ -170,8 +171,8 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end) for (i = 0; i < 8; i++) { if (nodes[i].start != nodes[i].end) { nodeid = nodeids[i]; - apicid_to_node[nodeid << dualcore] = i; - apicid_to_node[(nodeid << dualcore) + dualcore] = i; + for (j = 0; j < num_cores; j++) + apicid_to_node[(nodeid * num_cores) + j] = i; setup_node_bootmem(i, nodes[i].start, nodes[i].end); } } diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c index 51548947ad3..6da23552226 100644 --- a/arch/x86_64/mm/numa.c +++ b/arch/x86_64/mm/numa.c @@ -273,9 +273,6 @@ void __init numa_init_array(void) #ifdef CONFIG_NUMA_EMU /* Numa emulation */ -#define E820_ADDR_HOLE_SIZE(start, end) \ - (e820_hole_size((start) >> PAGE_SHIFT, (end) >> PAGE_SHIFT) << \ - PAGE_SHIFT) char *cmdline __initdata; /* @@ -319,7 +316,7 @@ static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr, return -1; if (num_nodes > MAX_NUMNODES) num_nodes = MAX_NUMNODES; - size = (max_addr - *addr - E820_ADDR_HOLE_SIZE(*addr, max_addr)) / + size = (max_addr - *addr - e820_hole_size(*addr, max_addr)) / num_nodes; /* * Calculate the number of big nodes that can be allocated as a result @@ -347,7 +344,7 @@ static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr, if (i == num_nodes + node_start - 1) end = max_addr; else - while (end - *addr - E820_ADDR_HOLE_SIZE(*addr, end) < + while (end - *addr - e820_hole_size(*addr, end) < size) { end += FAKE_NODE_MIN_SIZE; if (end > max_addr) { @@ -476,18 +473,22 @@ out: /* * We need to vacate all active ranges that may have been registered by - * SRAT. + * SRAT and set acpi_numa to -1 so that srat_disabled() always returns + * true. NUMA emulation has succeeded so we will not scan ACPI nodes. 
*/ remove_all_active_ranges(); +#ifdef CONFIG_ACPI_NUMA + acpi_numa = -1; +#endif for_each_node_mask(i, node_possible_map) { e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, nodes[i].end >> PAGE_SHIFT); setup_node_bootmem(i, nodes[i].start, nodes[i].end); } + acpi_fake_nodes(nodes, num_nodes); numa_init_array(); return 0; } -#undef E820_ADDR_HOLE_SIZE #endif /* CONFIG_NUMA_EMU */ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) diff --git a/arch/x86_64/mm/pageattr.c b/arch/x86_64/mm/pageattr.c index 9148f4a4cec..7e161c698af 100644 --- a/arch/x86_64/mm/pageattr.c +++ b/arch/x86_64/mm/pageattr.c @@ -13,7 +13,7 @@ #include <asm/tlbflush.h> #include <asm/io.h> -static inline pte_t *lookup_address(unsigned long address) +pte_t *lookup_address(unsigned long address) { pgd_t *pgd = pgd_offset_k(address); pud_t *pud; @@ -74,14 +74,12 @@ static void flush_kernel_map(void *arg) struct page *pg; /* When clflush is available always use it because it is - much cheaper than WBINVD. Disable clflush for now because - the high level code is not ready yet */ - if (1 || !cpu_has_clflush) + much cheaper than WBINVD. */ + if (!cpu_has_clflush) asm volatile("wbinvd" ::: "memory"); else list_for_each_entry(pg, l, lru) { void *adr = page_address(pg); - if (cpu_has_clflush) - cache_flush_page(adr); + cache_flush_page(adr); } __flush_tlb_all(); } @@ -95,7 +93,8 @@ static LIST_HEAD(deferred_pages); /* protected by init_mm.mmap_sem */ static inline void save_page(struct page *fpage) { - list_add(&fpage->lru, &deferred_pages); + if (!test_and_set_bit(PG_arch_1, &fpage->flags)) + list_add(&fpage->lru, &deferred_pages); } /* @@ -129,9 +128,12 @@ __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot, pte_t *kpte; struct page *kpte_page; pgprot_t ref_prot2; + kpte = lookup_address(address); if (!kpte) return 0; kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK); + BUG_ON(PageLRU(kpte_page)); + BUG_ON(PageCompound(kpte_page)); if (pgprot_val(prot) != pgprot_val(ref_prot)) { if (!pte_huge(*kpte)) { set_pte(kpte, pfn_pte(pfn, prot)); @@ -159,10 +161,9 @@ __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot, /* on x86-64 the direct mapping set at boot is not using 4k pages */ BUG_ON(PageReserved(kpte_page)); - if (page_private(kpte_page) == 0) { - save_page(kpte_page); + save_page(kpte_page); + if (page_private(kpte_page) == 0) revert_page(address, ref_prot); - } return 0; } @@ -234,6 +235,10 @@ void global_flush_tlb(void) flush_map(&l); list_for_each_entry_safe(pg, next, &l, lru) { + list_del(&pg->lru); + clear_bit(PG_arch_1, &pg->flags); + if (page_private(pg) != 0) + continue; ClearPagePrivate(pg); __free_page(pg); } diff --git a/arch/x86_64/mm/srat.c b/arch/x86_64/mm/srat.c index 1e76bb0a727..acdf03e1914 100644 --- a/arch/x86_64/mm/srat.c +++ b/arch/x86_64/mm/srat.c @@ -106,9 +106,9 @@ static __init int slit_valid(struct acpi_table_slit *slit) for (j = 0; j < d; j++) { u8 val = slit->entry[d*i + j]; if (i == j) { - if (val != 10) + if (val != LOCAL_DISTANCE) return 0; - } else if (val <= 10) + } else if (val <= LOCAL_DISTANCE) return 0; } } @@ -350,7 +350,7 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) /* Sanity check to catch more bad SRATs (they are amazingly common). Make sure the PXMs cover all memory. 
*/ -static int nodes_cover_memory(void) +static int __init nodes_cover_memory(const struct bootnode *nodes) { int i; unsigned long pxmram, e820ram; @@ -394,6 +394,9 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end) { int i; + if (acpi_numa <= 0) + return -1; + /* First clean up the node list */ for (i = 0; i < MAX_NUMNODES; i++) { cutoff_node(i, start, end); @@ -403,10 +406,7 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end) } } - if (acpi_numa <= 0) - return -1; - - if (!nodes_cover_memory()) { + if (!nodes_cover_memory(nodes)) { bad_srat(); return -1; } @@ -440,6 +440,86 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end) return 0; } +#ifdef CONFIG_NUMA_EMU +static int __init find_node_by_addr(unsigned long addr) +{ + int ret = NUMA_NO_NODE; + int i; + + for_each_node_mask(i, nodes_parsed) { + /* + * Find the real node that this emulated node appears on. For + * the sake of simplicity, we only use a real node's starting + * address to determine which emulated node it appears on. + */ + if (addr >= nodes[i].start && addr < nodes[i].end) { + ret = i; + break; + } + } + return i; +} + +/* + * In NUMA emulation, we need to setup proximity domain (_PXM) to node ID + * mappings that respect the real ACPI topology but reflect our emulated + * environment. For each emulated node, we find which real node it appears on + * and create PXM to NID mappings for those fake nodes which mirror that + * locality. SLIT will now represent the correct distances between emulated + * nodes as a result of the real topology. + */ +void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes) +{ + int i, j; + int fake_node_to_pxm_map[MAX_NUMNODES] = { + [0 ... MAX_NUMNODES-1] = PXM_INVAL + }; + unsigned char fake_apicid_to_node[MAX_LOCAL_APIC] = { + [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE + }; + + printk(KERN_INFO "Faking PXM affinity for fake nodes on real " + "topology.\n"); + for (i = 0; i < num_nodes; i++) { + int nid, pxm; + + nid = find_node_by_addr(fake_nodes[i].start); + if (nid == NUMA_NO_NODE) + continue; + pxm = node_to_pxm(nid); + if (pxm == PXM_INVAL) + continue; + fake_node_to_pxm_map[i] = pxm; + /* + * For each apicid_to_node mapping that exists for this real + * node, it must now point to the fake node ID. + */ + for (j = 0; j < MAX_LOCAL_APIC; j++) + if (apicid_to_node[j] == nid) + fake_apicid_to_node[j] = i; + } + for (i = 0; i < num_nodes; i++) + __acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i); + memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node)); + + nodes_clear(nodes_parsed); + for (i = 0; i < num_nodes; i++) + if (fake_nodes[i].start != fake_nodes[i].end) + node_set(i, nodes_parsed); + WARN_ON(!nodes_cover_memory(fake_nodes)); +} + +static int null_slit_node_compare(int a, int b) +{ + return node_to_pxm(a) == node_to_pxm(b); +} +#else +static int null_slit_node_compare(int a, int b) +{ + return a == b; +} +#endif /* CONFIG_NUMA_EMU */ + void __init srat_reserve_add_area(int nodeid) { if (found_add_area && nodes_add[nodeid].end) { @@ -464,7 +544,8 @@ int __node_distance(int a, int b) int index; if (!acpi_slit) - return a == b ? 10 : 20; + return null_slit_node_compare(a, b) ? 
LOCAL_DISTANCE : + REMOTE_DISTANCE; index = acpi_slit->locality_count * node_to_pxm(a); return acpi_slit->entry[index + node_to_pxm(b)]; } diff --git a/arch/x86_64/pci/k8-bus.c b/arch/x86_64/pci/k8-bus.c index 3acf60ded2a..9cc813e2970 100644 --- a/arch/x86_64/pci/k8-bus.c +++ b/arch/x86_64/pci/k8-bus.c @@ -59,6 +59,8 @@ fill_mp_bus_to_cpumask(void) j <= SUBORDINATE_LDT_BUS_NUMBER(ldtbus); j++) { struct pci_bus *bus; + struct pci_sysdata *sd; + long node = NODE_ID(nid); /* Algorithm a bit dumb, but it shouldn't matter here */ @@ -67,7 +69,9 @@ fill_mp_bus_to_cpumask(void) continue; if (!node_online(node)) node = 0; - bus->sysdata = (void *)node; + + sd = bus->sysdata; + sd->node = node; } } } diff --git a/arch/x86_64/vdso/Makefile b/arch/x86_64/vdso/Makefile new file mode 100644 index 00000000000..8d03de029d9 --- /dev/null +++ b/arch/x86_64/vdso/Makefile @@ -0,0 +1,49 @@ +# +# x86-64 vDSO. +# + +# files to link into the vdso +# vdso-start.o has to be first +vobjs-y := vdso-start.o vdso-note.o vclock_gettime.o vgetcpu.o vvar.o + +# files to link into kernel +obj-y := vma.o vdso.o vdso-syms.o + +vobjs := $(foreach F,$(vobjs-y),$(obj)/$F) + +$(obj)/vdso.o: $(obj)/vdso.so + +targets += vdso.so vdso.lds $(vobjs-y) vdso-syms.o + +# The DSO images are built using a special linker script. +quiet_cmd_syscall = SYSCALL $@ + cmd_syscall = $(CC) -m elf_x86_64 -nostdlib $(SYSCFLAGS_$(@F)) \ + -Wl,-T,$(filter-out FORCE,$^) -o $@ + +export CPPFLAGS_vdso.lds += -P -C -U$(ARCH) + +vdso-flags = -fPIC -shared -Wl,-soname=linux-vdso.so.1 \ + $(call ld-option, -Wl$(comma)--hash-style=sysv) \ + -Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096 +SYSCFLAGS_vdso.so = $(vdso-flags) + +$(obj)/vdso.o: $(src)/vdso.S $(obj)/vdso.so + +$(obj)/vdso.so: $(src)/vdso.lds $(vobjs) FORCE + $(call if_changed,syscall) + +CFL := $(PROFILING) -mcmodel=small -fPIC -g0 -O2 -fasynchronous-unwind-tables -m64 + +$(obj)/vclock_gettime.o: CFLAGS = $(CFL) +$(obj)/vgetcpu.o: CFLAGS = $(CFL) + +# We also create a special relocatable object that should mirror the symbol +# table and layout of the linked DSO. With ld -R we can then refer to +# these symbols in the kernel code rather than hand-coded addresses. +extra-y += vdso-syms.o +$(obj)/built-in.o: $(obj)/vdso-syms.o +$(obj)/built-in.o: ld_flags += -R $(obj)/vdso-syms.o + +SYSCFLAGS_vdso-syms.o = -r -d +$(obj)/vdso-syms.o: $(src)/vdso.lds $(vobjs) FORCE + $(call if_changed,syscall) diff --git a/arch/x86_64/vdso/vclock_gettime.c b/arch/x86_64/vdso/vclock_gettime.c new file mode 100644 index 00000000000..17f6a00de71 --- /dev/null +++ b/arch/x86_64/vdso/vclock_gettime.c @@ -0,0 +1,120 @@ +/* + * Copyright 2006 Andi Kleen, SUSE Labs. + * Subject to the GNU Public License, v.2 + * + * Fast user context implementation of clock_gettime and gettimeofday. + * + * The code should have no internal unresolved relocations. + * Check with readelf after changing. + * Also alternative() doesn't work. 
+ */ + +#include <linux/kernel.h> +#include <linux/posix-timers.h> +#include <linux/time.h> +#include <linux/string.h> +#include <asm/vsyscall.h> +#include <asm/vgtod.h> +#include <asm/timex.h> +#include <asm/hpet.h> +#include <asm/unistd.h> +#include <asm/io.h> +#include <asm/vgtod.h> +#include "vextern.h" + +#define gtod vdso_vsyscall_gtod_data + +static long vdso_fallback_gettime(long clock, struct timespec *ts) +{ + long ret; + asm("syscall" : "=a" (ret) : + "0" (__NR_clock_gettime),"D" (clock), "S" (ts) : "memory"); + return ret; +} + +static inline long vgetns(void) +{ + cycles_t (*vread)(void); + vread = gtod->clock.vread; + return ((vread() - gtod->clock.cycle_last) * gtod->clock.mult) >> + gtod->clock.shift; +} + +static noinline int do_realtime(struct timespec *ts) +{ + unsigned long seq, ns; + do { + seq = read_seqbegin(&gtod->lock); + ts->tv_sec = gtod->wall_time_sec; + ts->tv_nsec = gtod->wall_time_nsec; + ns = vgetns(); + } while (unlikely(read_seqretry(&gtod->lock, seq))); + timespec_add_ns(ts, ns); + return 0; +} + +/* Copy of the version in kernel/time.c which we cannot directly access */ +static void vset_normalized_timespec(struct timespec *ts, long sec, long nsec) +{ + while (nsec >= NSEC_PER_SEC) { + nsec -= NSEC_PER_SEC; + ++sec; + } + while (nsec < 0) { + nsec += NSEC_PER_SEC; + --sec; + } + ts->tv_sec = sec; + ts->tv_nsec = nsec; +} + +static noinline int do_monotonic(struct timespec *ts) +{ + unsigned long seq, ns, secs; + do { + seq = read_seqbegin(&gtod->lock); + secs = gtod->wall_time_sec; + ns = gtod->wall_time_nsec + vgetns(); + secs += gtod->wall_to_monotonic.tv_sec; + ns += gtod->wall_to_monotonic.tv_nsec; + } while (unlikely(read_seqretry(&gtod->lock, seq))); + vset_normalized_timespec(ts, secs, ns); + return 0; +} + +int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) +{ + if (likely(gtod->sysctl_enabled && gtod->clock.vread)) + switch (clock) { + case CLOCK_REALTIME: + return do_realtime(ts); + case CLOCK_MONOTONIC: + return do_monotonic(ts); + } + return vdso_fallback_gettime(clock, ts); +} +int clock_gettime(clockid_t, struct timespec *) + __attribute__((weak, alias("__vdso_clock_gettime"))); + +int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz) +{ + long ret; + if (likely(gtod->sysctl_enabled && gtod->clock.vread)) { + BUILD_BUG_ON(offsetof(struct timeval, tv_usec) != + offsetof(struct timespec, tv_nsec) || + sizeof(*tv) != sizeof(struct timespec)); + do_realtime((struct timespec *)tv); + tv->tv_usec /= 1000; + if (unlikely(tz != NULL)) { + /* This relies on gcc inlining the memcpy. We'll notice + if it ever fails to do so. */ + memcpy(tz, &gtod->sys_tz, sizeof(struct timezone)); + } + return 0; + } + asm("syscall" : "=a" (ret) : + "0" (__NR_gettimeofday), "D" (tv), "S" (tz) : "memory"); + return ret; +} +int gettimeofday(struct timeval *, struct timezone *) + __attribute__((weak, alias("__vdso_gettimeofday"))); diff --git a/arch/x86_64/vdso/vdso-note.S b/arch/x86_64/vdso/vdso-note.S new file mode 100644 index 00000000000..79a071e4357 --- /dev/null +++ b/arch/x86_64/vdso/vdso-note.S @@ -0,0 +1,12 @@ +/* + * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text. + * Here we can supply some information useful to userland.
+ */ + +#include <linux/uts.h> +#include <linux/version.h> +#include <linux/elfnote.h> + +ELFNOTE_START(Linux, 0, "a") + .long LINUX_VERSION_CODE +ELFNOTE_END diff --git a/arch/x86_64/vdso/vdso-start.S b/arch/x86_64/vdso/vdso-start.S new file mode 100644 index 00000000000..2dc2cdb84d6 --- /dev/null +++ b/arch/x86_64/vdso/vdso-start.S @@ -0,0 +1,2 @@ + .globl vdso_kernel_start +vdso_kernel_start: diff --git a/arch/x86_64/vdso/vdso.S b/arch/x86_64/vdso/vdso.S new file mode 100644 index 00000000000..92e80c1972a --- /dev/null +++ b/arch/x86_64/vdso/vdso.S @@ -0,0 +1,2 @@ + .section ".vdso","a" + .incbin "arch/x86_64/vdso/vdso.so" diff --git a/arch/x86_64/vdso/vdso.lds.S b/arch/x86_64/vdso/vdso.lds.S new file mode 100644 index 00000000000..b9a60e665d0 --- /dev/null +++ b/arch/x86_64/vdso/vdso.lds.S @@ -0,0 +1,77 @@ +/* + * Linker script for vsyscall DSO. The vsyscall page is an ELF shared + * object prelinked to its virtual address, and with only one read-only + * segment (that fits in one page). This script controls its layout. + */ +#include <asm/asm-offsets.h> +#include "voffset.h" + +#define VDSO_PRELINK 0xffffffffff700000 + +SECTIONS +{ + . = VDSO_PRELINK + SIZEOF_HEADERS; + + .hash : { *(.hash) } :text + .gnu.hash : { *(.gnu.hash) } + .dynsym : { *(.dynsym) } + .dynstr : { *(.dynstr) } + .gnu.version : { *(.gnu.version) } + .gnu.version_d : { *(.gnu.version_d) } + .gnu.version_r : { *(.gnu.version_r) } + + /* This linker script is used both with -r and with -shared. + For the layouts to match, we need to skip more than enough + space for the dynamic symbol table et al. If this amount + is insufficient, ld -shared will barf. Just increase it here. */ + . = VDSO_PRELINK + VDSO_TEXT_OFFSET; + + .text : { *(.text) } :text + .text.ptr : { *(.text.ptr) } :text + . = VDSO_PRELINK + 0x900; + .data : { *(.data) } :text + .bss : { *(.bss) } :text + + .altinstructions : { *(.altinstructions) } :text + .altinstr_replacement : { *(.altinstr_replacement) } :text + + .note : { *(.note.*) } :text :note + .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr + .eh_frame : { KEEP (*(.eh_frame)) } :text + .dynamic : { *(.dynamic) } :text :dynamic + .useless : { + *(.got.plt) *(.got) + *(.gnu.linkonce.d.*) + *(.dynbss) + *(.gnu.linkonce.b.*) + } :text +} + +/* + * We must supply the ELF program headers explicitly to get just one + * PT_LOAD segment, and set the flags explicitly to make segments read-only. + */ +PHDRS +{ + text PT_LOAD FILEHDR PHDRS FLAGS(5); /* PF_R|PF_X */ + dynamic PT_DYNAMIC FLAGS(4); /* PF_R */ + note PT_NOTE FLAGS(4); /* PF_R */ + eh_frame_hdr 0x6474e550; /* PT_GNU_EH_FRAME, but ld doesn't match the name */ +} + +/* + * This controls what symbols we export from the DSO. + */ +VERSION +{ + LINUX_2.6 { + global: + clock_gettime; + __vdso_clock_gettime; + gettimeofday; + __vdso_gettimeofday; + getcpu; + __vdso_getcpu; + local: *; + }; +} diff --git a/arch/x86_64/vdso/vextern.h b/arch/x86_64/vdso/vextern.h new file mode 100644 index 00000000000..1683ba2ae3e --- /dev/null +++ b/arch/x86_64/vdso/vextern.h @@ -0,0 +1,16 @@ +#ifndef VEXTERN +#include <asm/vsyscall.h> +#define VEXTERN(x) \ + extern typeof(x) *vdso_ ## x __attribute__((visibility("hidden"))); +#endif + +#define VMAGIC 0xfeedbabeabcdefabUL + +/* Any kernel variables used in the vDSO must be exported in the main + kernel's vmlinux.lds.S/vsyscall.h/proper __section and + put into vextern.h and be referenced as a pointer with vdso prefix. + The main kernel later fills in the values. 
*/ + +VEXTERN(jiffies) +VEXTERN(vgetcpu_mode) +VEXTERN(vsyscall_gtod_data) diff --git a/arch/x86_64/vdso/vgetcpu.c b/arch/x86_64/vdso/vgetcpu.c new file mode 100644 index 00000000000..91f6e85d0fc --- /dev/null +++ b/arch/x86_64/vdso/vgetcpu.c @@ -0,0 +1,50 @@ +/* + * Copyright 2006 Andi Kleen, SUSE Labs. + * Subject to the GNU Public License, v.2 + * + * Fast user context implementation of getcpu() + */ + +#include <linux/kernel.h> +#include <linux/getcpu.h> +#include <linux/jiffies.h> +#include <linux/time.h> +#include <asm/vsyscall.h> +#include <asm/vgtod.h> +#include "vextern.h" + +long __vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache) +{ + unsigned int dummy, p; + unsigned long j = 0; + + /* Fast cache - only recompute value once per jiffies and avoid + relatively costly rdtscp/cpuid otherwise. + This works because the scheduler usually keeps the process + on the same CPU and this syscall doesn't guarantee its + results anyways. + We do this here because otherwise user space would do it on + its own in a likely inferior way (no access to jiffies). + If you don't like it pass NULL. */ + if (tcache && tcache->blob[0] == (j = *vdso_jiffies)) { + p = tcache->blob[1]; + } else if (*vdso_vgetcpu_mode == VGETCPU_RDTSCP) { + /* Load per CPU data from RDTSCP */ + rdtscp(dummy, dummy, p); + } else { + /* Load per CPU data from GDT */ + asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG)); + } + if (tcache) { + tcache->blob[0] = j; + tcache->blob[1] = p; + } + if (cpu) + *cpu = p & 0xfff; + if (node) + *node = p >> 12; + return 0; +} + +long getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache) + __attribute__((weak, alias("__vdso_getcpu"))); diff --git a/arch/x86_64/vdso/vma.c b/arch/x86_64/vdso/vma.c new file mode 100644 index 00000000000..d4cb83a6c06 --- /dev/null +++ b/arch/x86_64/vdso/vma.c @@ -0,0 +1,139 @@ +/* + * Set up the VMAs to tell the VM about the vDSO. + * Copyright 2007 Andi Kleen, SUSE Labs. 
+ * Subject to the GPL, v.2 + */ +#include <linux/mm.h> +#include <linux/sched.h> +#include <linux/init.h> +#include <linux/random.h> +#include <asm/vsyscall.h> +#include <asm/vgtod.h> +#include <asm/proto.h> +#include "voffset.h" + +int vdso_enabled = 1; + +#define VEXTERN(x) extern typeof(__ ## x) *vdso_ ## x; +#include "vextern.h" +#undef VEXTERN + +extern char vdso_kernel_start[], vdso_start[], vdso_end[]; +extern unsigned short vdso_sync_cpuid; + +struct page **vdso_pages; + +static inline void *var_ref(void *vbase, char *var, char *name) +{ + unsigned offset = var - &vdso_kernel_start[0] + VDSO_TEXT_OFFSET; + void *p = vbase + offset; + if (*(void **)p != (void *)VMAGIC) { + printk("VDSO: variable %s broken\n", name); + vdso_enabled = 0; + } + return p; +} + +static int __init init_vdso_vars(void) +{ + int npages = (vdso_end - vdso_start + PAGE_SIZE - 1) / PAGE_SIZE; + int i; + char *vbase; + + vdso_pages = kmalloc(sizeof(struct page *) * npages, GFP_KERNEL); + if (!vdso_pages) + goto oom; + for (i = 0; i < npages; i++) { + struct page *p; + p = alloc_page(GFP_KERNEL); + if (!p) + goto oom; + vdso_pages[i] = p; + copy_page(page_address(p), vdso_start + i*PAGE_SIZE); + } + + vbase = vmap(vdso_pages, npages, 0, PAGE_KERNEL); + if (!vbase) + goto oom; + + if (memcmp(vbase, "\177ELF", 4)) { + printk("VDSO: I'm broken; not ELF\n"); + vdso_enabled = 0; + } + +#define V(x) *(typeof(x) *) var_ref(vbase, (char *)RELOC_HIDE(&x, 0), #x) +#define VEXTERN(x) \ + V(vdso_ ## x) = &__ ## x; +#include "vextern.h" +#undef VEXTERN + return 0; + + oom: + printk("Cannot allocate vdso\n"); + vdso_enabled = 0; + return -ENOMEM; +} +__initcall(init_vdso_vars); + +struct linux_binprm; + +/* Put the vdso above the (randomized) stack with another randomized offset. + This way there is no hole in the middle of address space. + To save memory make sure it is still in the same PTE as the stack top. + This doesn't give that many random bits */ +static unsigned long vdso_addr(unsigned long start, unsigned len) +{ + unsigned long addr, end; + unsigned offset; + end = (start + PMD_SIZE - 1) & PMD_MASK; + if (end >= TASK_SIZE64) + end = TASK_SIZE64; + end -= len; + /* This loses some more bits than a modulo, but is cheaper */ + offset = get_random_int() & (PTRS_PER_PTE - 1); + addr = start + (offset << PAGE_SHIFT); + if (addr >= end) + addr = end; + return addr; +} + +/* Setup a VMA at program startup for the vsyscall page. 
+ Not called for compat tasks */ +int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack) +{ + struct mm_struct *mm = current->mm; + unsigned long addr; + int ret; + unsigned len = round_up(vdso_end - vdso_start, PAGE_SIZE); + + if (!vdso_enabled) + return 0; + + down_write(&mm->mmap_sem); + addr = vdso_addr(mm->start_stack, len); + addr = get_unmapped_area(NULL, addr, len, 0, 0); + if (IS_ERR_VALUE(addr)) { + ret = addr; + goto up_fail; + } + + ret = install_special_mapping(mm, addr, len, + VM_READ|VM_EXEC| + VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| + VM_ALWAYSDUMP, + vdso_pages); + if (ret) + goto up_fail; + + current->mm->context.vdso = (void *)addr; +up_fail: + up_write(&mm->mmap_sem); + return ret; +} + +static __init int vdso_setup(char *s) +{ + vdso_enabled = simple_strtoul(s, NULL, 0); + return 0; +} +__setup("vdso=", vdso_setup); diff --git a/arch/x86_64/vdso/voffset.h b/arch/x86_64/vdso/voffset.h new file mode 100644 index 00000000000..5304204911f --- /dev/null +++ b/arch/x86_64/vdso/voffset.h @@ -0,0 +1 @@ +#define VDSO_TEXT_OFFSET 0x500 diff --git a/arch/x86_64/vdso/vvar.c b/arch/x86_64/vdso/vvar.c new file mode 100644 index 00000000000..6fc22219a47 --- /dev/null +++ b/arch/x86_64/vdso/vvar.c @@ -0,0 +1,12 @@ +/* Define pointer to external vDSO variables. + These are part of the vDSO. The kernel fills in the real addresses + at boot time. This is done because when the vdso is linked the + kernel isn't yet and we don't know the final addresses. */ +#include <linux/kernel.h> +#include <linux/time.h> +#include <asm/vsyscall.h> +#include <asm/timex.h> +#include <asm/vgtod.h> + +#define VEXTERN(x) typeof (__ ## x) *vdso_ ## x = (void *)VMAGIC; +#include "vextern.h"
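
A note on the pageattr.c hunk earlier in this diff: it only changes how deferred kernel-mapping pages are collected (deduplicated via PG_arch_1) and flushed (per-page clflush instead of an unconditional wbinvd); the change_page_attr()/global_flush_tlb() interface itself is untouched. The sketch below is not part of the patch; it only illustrates, against the 2.6.2x-era x86-64 API as I understand it, how a caller exercises that path. example_page and example_map_uncached() are hypothetical names.

#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/errno.h>
#include <asm/cacheflush.h>
#include <asm/pgtable.h>

static struct page *example_page;	/* hypothetical driver page */

static int example_map_uncached(void)
{
	int err;

	example_page = alloc_page(GFP_KERNEL);
	if (!example_page)
		return -ENOMEM;

	/* Marks the page uncached in the kernel direct mapping,
	   splitting the large page that maps it if necessary. */
	err = change_page_attr(example_page, 1, PAGE_KERNEL_NOCACHE);
	if (err) {
		__free_page(example_page);
		example_page = NULL;
		return err;
	}

	/* Nothing is final until global_flush_tlb(): it flushes the TLBs,
	   clflushes (or wbinvds) the affected pages and frees any kpte
	   pages queued by save_page() that are no longer needed. */
	global_flush_tlb();
	return 0;
}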
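
The vclock_gettime.c additions are reached through ordinary library calls once the C library routes clock_gettime()/gettimeofday() to the vDSO; the fast path is taken only when gtod->sysctl_enabled is set and the current clocksource provides a vread routine (the TSC case), otherwise the fallback syscalls run with identical semantics. A minimal userspace sketch, not part of the patch (on older glibc, link with -lrt for clock_gettime):

#include <stdio.h>
#include <time.h>
#include <sys/time.h>

int main(void)
{
	struct timespec tr, tm;
	struct timeval tv;

	/* With the vDSO fast path these complete without entering the
	   kernel; the results are the same either way. */
	clock_gettime(CLOCK_REALTIME, &tr);
	clock_gettime(CLOCK_MONOTONIC, &tm);
	gettimeofday(&tv, NULL);

	printf("CLOCK_REALTIME  %ld.%09ld\n", (long)tr.tv_sec, tr.tv_nsec);
	printf("CLOCK_MONOTONIC %ld.%09ld\n", (long)tm.tv_sec, tm.tv_nsec);
	printf("gettimeofday    %ld.%06ld\n", (long)tv.tv_sec, (long)tv.tv_usec);
	return 0;
}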
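
Similarly, __vdso_getcpu mirrors the existing vgetcpu vsyscall: the value obtained via RDTSCP or the per-CPU GDT segment limit packs the CPU number in the low 12 bits and the node above them, and the optional getcpu_cache is recomputed at most once per jiffy. The userspace sketch below is not part of the patch and goes through the plain getcpu syscall instead (same semantics, no fast path); it assumes __NR_getcpu is exposed by the installed kernel headers.

#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

int main(void)
{
	unsigned cpu = 0, node = 0;

	/* The third argument is the optional getcpu_cache; NULL is
	   allowed and simply skips the once-per-jiffy caching. */
	if (syscall(__NR_getcpu, &cpu, &node, NULL) != 0) {
		perror("getcpu");
		return 1;
	}
	printf("cpu %u, node %u\n", cpu, node);
	return 0;
}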