path: root/arch/x86_64
Diffstat (limited to 'arch/x86_64')
-rw-r--r--  arch/x86_64/Kconfig | 20
-rw-r--r--  arch/x86_64/Makefile | 3
-rw-r--r--  arch/x86_64/boot/.gitignore | 2
-rw-r--r--  arch/x86_64/boot/Makefile | 136
-rw-r--r--  arch/x86_64/boot/bootsect.S | 98
-rw-r--r--  arch/x86_64/boot/compressed/Makefile | 11
-rw-r--r--  arch/x86_64/boot/compressed/head.S | 6
-rw-r--r--  arch/x86_64/boot/install.sh | 2
-rw-r--r--  arch/x86_64/boot/mtools.conf.in | 17
-rw-r--r--  arch/x86_64/boot/setup.S | 826
-rw-r--r--  arch/x86_64/boot/tools/build.c | 185
-rw-r--r--  arch/x86_64/defconfig | 288
-rw-r--r--  arch/x86_64/ia32/ia32_aout.c | 2
-rw-r--r--  arch/x86_64/ia32/ia32_binfmt.c | 59
-rw-r--r--  arch/x86_64/ia32/ia32entry.S | 8
-rw-r--r--  arch/x86_64/ia32/sys_ia32.c | 8
-rw-r--r--  arch/x86_64/kernel/Makefile | 2
-rw-r--r--  arch/x86_64/kernel/acpi/sleep.c | 8
-rw-r--r--  arch/x86_64/kernel/acpi/wakeup.S | 32
-rw-r--r--  arch/x86_64/kernel/aperture.c | 4
-rw-r--r--  arch/x86_64/kernel/apic.c | 77
-rw-r--r--  arch/x86_64/kernel/cpufreq/Kconfig | 6
-rw-r--r--  arch/x86_64/kernel/e820.c | 138
-rw-r--r--  arch/x86_64/kernel/early-quirks.c | 1
-rw-r--r--  arch/x86_64/kernel/early_printk.c | 5
-rw-r--r--  arch/x86_64/kernel/entry.S | 6
-rw-r--r--  arch/x86_64/kernel/head.S | 23
-rw-r--r--  arch/x86_64/kernel/hpet.c | 8
-rw-r--r--  arch/x86_64/kernel/i8259.c | 18
-rw-r--r--  arch/x86_64/kernel/init_task.c | 2
-rw-r--r--  arch/x86_64/kernel/io_apic.c | 58
-rw-r--r--  arch/x86_64/kernel/kprobes.c | 10
-rw-r--r--  arch/x86_64/kernel/mce.c | 255
-rw-r--r--  arch/x86_64/kernel/mce_amd.c | 6
-rw-r--r--  arch/x86_64/kernel/mpparse.c | 21
-rw-r--r--  arch/x86_64/kernel/nmi.c | 25
-rw-r--r--  arch/x86_64/kernel/pci-calgary.c | 570
-rw-r--r--  arch/x86_64/kernel/pci-dma.c | 10
-rw-r--r--  arch/x86_64/kernel/pci-gart.c | 27
-rw-r--r--  arch/x86_64/kernel/pci-nommu.c | 8
-rw-r--r--  arch/x86_64/kernel/pci-swiotlb.c | 2
-rw-r--r--  arch/x86_64/kernel/process.c | 21
-rw-r--r--  arch/x86_64/kernel/ptrace.c | 40
-rw-r--r--  arch/x86_64/kernel/reboot.c | 4
-rw-r--r--  arch/x86_64/kernel/setup.c | 33
-rw-r--r--  arch/x86_64/kernel/signal.c | 9
-rw-r--r--  arch/x86_64/kernel/smp.c | 18
-rw-r--r--  arch/x86_64/kernel/suspend.c | 20
-rw-r--r--  arch/x86_64/kernel/tce.c | 12
-rw-r--r--  arch/x86_64/kernel/time.c | 158
-rw-r--r--  arch/x86_64/kernel/traps.c | 21
-rw-r--r--  arch/x86_64/kernel/tsc.c | 41
-rw-r--r--  arch/x86_64/kernel/verify_cpu.S | 22
-rw-r--r--  arch/x86_64/kernel/vmlinux.lds.S | 37
-rw-r--r--  arch/x86_64/kernel/vsyscall.c | 22
-rw-r--r--  arch/x86_64/mm/fault.c | 48
-rw-r--r--  arch/x86_64/mm/init.c | 58
-rw-r--r--  arch/x86_64/mm/k8topology.c | 13
-rw-r--r--  arch/x86_64/mm/numa.c | 15
-rw-r--r--  arch/x86_64/mm/pageattr.c | 25
-rw-r--r--  arch/x86_64/mm/srat.c | 97
-rw-r--r--  arch/x86_64/pci/k8-bus.c | 6
-rw-r--r--  arch/x86_64/vdso/Makefile | 49
-rw-r--r--  arch/x86_64/vdso/vclock_gettime.c | 120
-rw-r--r--  arch/x86_64/vdso/vdso-note.S | 12
-rw-r--r--  arch/x86_64/vdso/vdso-start.S | 2
-rw-r--r--  arch/x86_64/vdso/vdso.S | 2
-rw-r--r--  arch/x86_64/vdso/vdso.lds.S | 77
-rw-r--r--  arch/x86_64/vdso/vextern.h | 16
-rw-r--r--  arch/x86_64/vdso/vgetcpu.c | 50
-rw-r--r--  arch/x86_64/vdso/vma.c | 139
-rw-r--r--  arch/x86_64/vdso/voffset.h | 1
-rw-r--r--  arch/x86_64/vdso/vvar.c | 12
73 files changed, 1896 insertions, 2297 deletions
diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig
index 5ce94430c01..45f82ae6d38 100644
--- a/arch/x86_64/Kconfig
+++ b/arch/x86_64/Kconfig
@@ -32,6 +32,10 @@ config GENERIC_TIME_VSYSCALL
bool
default y
+config GENERIC_CMOS_UPDATE
+ bool
+ default y
+
config ZONE_DMA32
bool
default y
@@ -56,6 +60,14 @@ config ZONE_DMA
bool
default y
+config QUICKLIST
+ bool
+ default y
+
+config NR_QUICK
+ int
+ default 2
+
config ISA
bool
@@ -427,6 +439,10 @@ config NR_CPUS
This is purely to save memory - each supported CPU requires
memory in the static kernel configuration.
+config PHYSICAL_ALIGN
+ hex
+ default "0x200000"
+
config HOTPLUG_CPU
bool "Support for suspend on SMP and hot-pluggable CPUs (EXPERIMENTAL)"
depends on SMP && HOTPLUG && EXPERIMENTAL
@@ -770,8 +786,8 @@ menu "Instrumentation Support"
source "arch/x86_64/oprofile/Kconfig"
config KPROBES
- bool "Kprobes (EXPERIMENTAL)"
- depends on KALLSYMS && EXPERIMENTAL && MODULES
+ bool "Kprobes"
+ depends on KALLSYMS && MODULES
help
Kprobes allows you to trap at almost any kernel address and
execute a callback function. register_kprobe() establishes
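
The Kconfig help text above names register_kprobe(), but the hunk is cut off before any caller is visible. As a hedged illustration only -- the module below is not part of this commit, and the probed symbol "do_fork" is merely an assumed example of a kallsyms-visible function -- a minimal kprobes client for a kernel of this vintage could look like the following. register_kprobe() arms a breakpoint at the resolved address and runs the pre-handler before the probed instruction executes; unregister_kprobe() disarms it again.

/*
 * Hypothetical example, not from this diff: probe do_fork() and log each hit.
 * Assumes CONFIG_KPROBES=y, CONFIG_KALLSYMS=y and CONFIG_MODULES=y.
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/kprobes.h>

/* Invoked just before the first instruction of the probed function runs. */
static int handler_pre(struct kprobe *p, struct pt_regs *regs)
{
	printk(KERN_INFO "kprobe hit at %p\n", p->addr);
	return 0;	/* 0 = continue with the original instruction as usual */
}

static struct kprobe kp = {
	.symbol_name = "do_fork",	/* assumed target symbol for illustration */
	.pre_handler = handler_pre,
};

static int __init kprobe_example_init(void)
{
	return register_kprobe(&kp);	/* plants the breakpoint; negative errno on failure */
}

static void __exit kprobe_example_exit(void)
{
	unregister_kprobe(&kp);		/* remove the probe before unloading */
}

module_init(kprobe_example_init);
module_exit(kprobe_example_exit);
MODULE_LICENSE("GPL");
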
diff --git a/arch/x86_64/Makefile b/arch/x86_64/Makefile
index 29617ae3926..128561d3e87 100644
--- a/arch/x86_64/Makefile
+++ b/arch/x86_64/Makefile
@@ -76,7 +76,8 @@ head-y := arch/x86_64/kernel/head.o arch/x86_64/kernel/head64.o arch/x86_64/kern
libs-y += arch/x86_64/lib/
core-y += arch/x86_64/kernel/ \
arch/x86_64/mm/ \
- arch/x86_64/crypto/
+ arch/x86_64/crypto/ \
+ arch/x86_64/vdso/
core-$(CONFIG_IA32_EMULATION) += arch/x86_64/ia32/
drivers-$(CONFIG_PCI) += arch/x86_64/pci/
drivers-$(CONFIG_OPROFILE) += arch/x86_64/oprofile/
diff --git a/arch/x86_64/boot/.gitignore b/arch/x86_64/boot/.gitignore
index 495f20c085d..18465143cfa 100644
--- a/arch/x86_64/boot/.gitignore
+++ b/arch/x86_64/boot/.gitignore
@@ -1,3 +1,5 @@
bootsect
bzImage
setup
+setup.bin
+setup.elf
diff --git a/arch/x86_64/boot/Makefile b/arch/x86_64/boot/Makefile
index ee6f6505f95..67096389de1 100644
--- a/arch/x86_64/boot/Makefile
+++ b/arch/x86_64/boot/Makefile
@@ -1,135 +1,9 @@
#
# arch/x86_64/boot/Makefile
#
-# This file is subject to the terms and conditions of the GNU General Public
-# License. See the file "COPYING" in the main directory of this archive
-# for more details.
-#
-# Copyright (C) 1994 by Linus Torvalds
-#
-
-# ROOT_DEV specifies the default root-device when making the image.
-# This can be either FLOPPY, CURRENT, /dev/xxxx or empty, in which case
-# the default of FLOPPY is used by 'build'.
-
-ROOT_DEV := CURRENT
-
-# If you want to preset the SVGA mode, uncomment the next line and
-# set SVGA_MODE to whatever number you want.
-# Set it to -DSVGA_MODE=NORMAL_VGA if you just want the EGA/VGA mode.
-# The number is the same as you would ordinarily press at bootup.
-
-SVGA_MODE := -DSVGA_MODE=NORMAL_VGA
-
-# If you want the RAM disk device, define this to be the size in blocks.
-
-#RAMDISK := -DRAMDISK=512
-
-targets := vmlinux.bin bootsect bootsect.o \
- setup setup.o bzImage mtools.conf
-
-EXTRA_CFLAGS := -m32
-
-hostprogs-y := tools/build
-HOST_EXTRACFLAGS += $(LINUXINCLUDE)
-subdir- := compressed/ #Let make clean descend in compressed/
-# ---------------------------------------------------------------------------
-
-$(obj)/bzImage: IMAGE_OFFSET := 0x100000
-$(obj)/bzImage: EXTRA_AFLAGS := $(SVGA_MODE) $(RAMDISK) -D__BIG_KERNEL__
-$(obj)/bzImage: BUILDFLAGS := -b
-
-quiet_cmd_image = BUILD $@
-cmd_image = $(obj)/tools/build $(BUILDFLAGS) $(obj)/bootsect $(obj)/setup \
- $(obj)/vmlinux.bin $(ROOT_DEV) > $@
-
-$(obj)/bzImage: $(obj)/bootsect $(obj)/setup \
- $(obj)/vmlinux.bin $(obj)/tools/build FORCE
- $(call if_changed,image)
- @echo 'Kernel: $@ is ready' ' (#'`cat .version`')'
-
-$(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE
- $(call if_changed,objcopy)
-
-LDFLAGS_bootsect := -Ttext 0x0 -s --oformat binary
-LDFLAGS_setup := -Ttext 0x0 -s --oformat binary -e begtext
-
-$(obj)/setup $(obj)/bootsect: %: %.o FORCE
- $(call if_changed,ld)
-
-$(obj)/compressed/vmlinux: FORCE
- $(Q)$(MAKE) $(build)=$(obj)/compressed IMAGE_OFFSET=$(IMAGE_OFFSET) $@
-
-# Set this if you want to pass append arguments to the zdisk/fdimage/isoimage kernel
-FDARGS =
-# Set this if you want an initrd included with the zdisk/fdimage/isoimage kernel
-FDINITRD =
-
-image_cmdline = default linux $(FDARGS) $(if $(FDINITRD),initrd=initrd.img,)
-
-$(obj)/mtools.conf: $(src)/mtools.conf.in
- sed -e 's|@OBJ@|$(obj)|g' < $< > $@
-
-# This requires write access to /dev/fd0
-zdisk: $(BOOTIMAGE) $(obj)/mtools.conf
- MTOOLSRC=$(obj)/mtools.conf mformat a: ; sync
- syslinux /dev/fd0 ; sync
- echo '$(image_cmdline)' | \
- MTOOLSRC=$(obj)/mtools.conf mcopy - a:syslinux.cfg
- if [ -f '$(FDINITRD)' ] ; then \
- MTOOLSRC=$(obj)/mtools.conf mcopy '$(FDINITRD)' a:initrd.img ; \
- fi
- MTOOLSRC=$(obj)/mtools.conf mcopy $(BOOTIMAGE) a:linux ; sync
-
-# These require being root or having syslinux 2.02 or higher installed
-fdimage fdimage144: $(BOOTIMAGE) $(obj)/mtools.conf
- dd if=/dev/zero of=$(obj)/fdimage bs=1024 count=1440
- MTOOLSRC=$(obj)/mtools.conf mformat v: ; sync
- syslinux $(obj)/fdimage ; sync
- echo '$(image_cmdline)' | \
- MTOOLSRC=$(obj)/mtools.conf mcopy - v:syslinux.cfg
- if [ -f '$(FDINITRD)' ] ; then \
- MTOOLSRC=$(obj)/mtools.conf mcopy '$(FDINITRD)' v:initrd.img ; \
- fi
- MTOOLSRC=$(obj)/mtools.conf mcopy $(BOOTIMAGE) v:linux ; sync
-
-fdimage288: $(BOOTIMAGE) $(obj)/mtools.conf
- dd if=/dev/zero of=$(obj)/fdimage bs=1024 count=2880
- MTOOLSRC=$(obj)/mtools.conf mformat w: ; sync
- syslinux $(obj)/fdimage ; sync
- echo '$(image_cmdline)' | \
- MTOOLSRC=$(obj)/mtools.conf mcopy - w:syslinux.cfg
- if [ -f '$(FDINITRD)' ] ; then \
- MTOOLSRC=$(obj)/mtools.conf mcopy '$(FDINITRD)' w:initrd.img ; \
- fi
- MTOOLSRC=$(obj)/mtools.conf mcopy $(BOOTIMAGE) w:linux ; sync
-
-isoimage: $(BOOTIMAGE)
- -rm -rf $(obj)/isoimage
- mkdir $(obj)/isoimage
- for i in lib lib64 share end ; do \
- if [ -f /usr/$$i/syslinux/isolinux.bin ] ; then \
- cp /usr/$$i/syslinux/isolinux.bin $(obj)/isoimage ; \
- break ; \
- fi ; \
- if [ $$i = end ] ; then exit 1 ; fi ; \
- done
- cp $(BOOTIMAGE) $(obj)/isoimage/linux
- echo '$(image_cmdline)' > $(obj)/isoimage/isolinux.cfg
- if [ -f '$(FDINITRD)' ] ; then \
- cp '$(FDINITRD)' $(obj)/isoimage/initrd.img ; \
- fi
- mkisofs -J -r -o $(obj)/image.iso -b isolinux.bin -c boot.cat \
- -no-emul-boot -boot-load-size 4 -boot-info-table \
- $(obj)/isoimage
- rm -rf $(obj)/isoimage
-
-zlilo: $(BOOTIMAGE)
- if [ -f $(INSTALL_PATH)/vmlinuz ]; then mv $(INSTALL_PATH)/vmlinuz $(INSTALL_PATH)/vmlinuz.old; fi
- if [ -f $(INSTALL_PATH)/System.map ]; then mv $(INSTALL_PATH)/System.map $(INSTALL_PATH)/System.old; fi
- cat $(BOOTIMAGE) > $(INSTALL_PATH)/vmlinuz
- cp System.map $(INSTALL_PATH)/
- if [ -x /sbin/lilo ]; then /sbin/lilo; else /etc/lilo/install; fi
+# The actual boot code is shared with i386 including the Makefile.
+# So tell kbuild that we fetch the code from i386 and include the
+# Makefile from i386 too.
-install:
- sh $(srctree)/$(src)/install.sh $(KERNELRELEASE) $(BOOTIMAGE) System.map "$(INSTALL_PATH)"
+src := arch/i386/boot
+include $(src)/Makefile
diff --git a/arch/x86_64/boot/bootsect.S b/arch/x86_64/boot/bootsect.S
deleted file mode 100644
index 011b7a4993d..00000000000
--- a/arch/x86_64/boot/bootsect.S
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * bootsect.S Copyright (C) 1991, 1992 Linus Torvalds
- *
- * modified by Drew Eckhardt
- * modified by Bruce Evans (bde)
- * modified by Chris Noe (May 1999) (as86 -> gas)
- * gutted by H. Peter Anvin (Jan 2003)
- *
- * BIG FAT NOTE: We're in real mode using 64k segments. Therefore segment
- * addresses must be multiplied by 16 to obtain their respective linear
- * addresses. To avoid confusion, linear addresses are written using leading
- * hex while segment addresses are written as segment:offset.
- *
- */
-
-#include <asm/boot.h>
-
-SETUPSECTS = 4 /* default nr of setup-sectors */
-BOOTSEG = 0x07C0 /* original address of boot-sector */
-INITSEG = DEF_INITSEG /* we move boot here - out of the way */
-SETUPSEG = DEF_SETUPSEG /* setup starts here */
-SYSSEG = DEF_SYSSEG /* system loaded at 0x10000 (65536) */
-SYSSIZE = DEF_SYSSIZE /* system size: # of 16-byte clicks */
- /* to be loaded */
-ROOT_DEV = 0 /* ROOT_DEV is now written by "build" */
-SWAP_DEV = 0 /* SWAP_DEV is now written by "build" */
-
-#ifndef SVGA_MODE
-#define SVGA_MODE ASK_VGA
-#endif
-
-#ifndef RAMDISK
-#define RAMDISK 0
-#endif
-
-#ifndef ROOT_RDONLY
-#define ROOT_RDONLY 1
-#endif
-
-.code16
-.text
-
-.global _start
-_start:
-
- # Normalize the start address
- jmpl $BOOTSEG, $start2
-
-start2:
- movw %cs, %ax
- movw %ax, %ds
- movw %ax, %es
- movw %ax, %ss
- movw $0x7c00, %sp
- sti
- cld
-
- movw $bugger_off_msg, %si
-
-msg_loop:
- lodsb
- andb %al, %al
- jz die
- movb $0xe, %ah
- movw $7, %bx
- int $0x10
- jmp msg_loop
-
-die:
- # Allow the user to press a key, then reboot
- xorw %ax, %ax
- int $0x16
- int $0x19
-
- # int 0x19 should never return. In case it does anyway,
- # invoke the BIOS reset code...
- ljmp $0xf000,$0xfff0
-
-
-bugger_off_msg:
- .ascii "Direct booting from floppy is no longer supported.\r\n"
- .ascii "Please use a boot loader program instead.\r\n"
- .ascii "\n"
- .ascii "Remove disk and press any key to reboot . . .\r\n"
- .byte 0
-
-
- # Kernel attributes; used by setup
-
- .org 497
-setup_sects: .byte SETUPSECTS
-root_flags: .word ROOT_RDONLY
-syssize: .word SYSSIZE
-swap_dev: .word SWAP_DEV
-ram_size: .word RAMDISK
-vid_mode: .word SVGA_MODE
-root_dev: .word ROOT_DEV
-boot_flag: .word 0xAA55
diff --git a/arch/x86_64/boot/compressed/Makefile b/arch/x86_64/boot/compressed/Makefile
index 705a3e33d7e..877c0bdbbc6 100644
--- a/arch/x86_64/boot/compressed/Makefile
+++ b/arch/x86_64/boot/compressed/Makefile
@@ -3,15 +3,14 @@
#
# create a compressed vmlinux image from the original vmlinux
#
-# Note all the files here are compiled/linked as 32bit executables.
-#
targets := vmlinux vmlinux.bin vmlinux.bin.gz head.o misc.o piggy.o
-EXTRA_AFLAGS := -traditional
-# cannot use EXTRA_CFLAGS because base CFLAGS contains -mkernel which conflicts with
-# -m32
-CFLAGS := -m64 -D__KERNEL__ -Iinclude -O2 -fno-strict-aliasing -fPIC -mcmodel=small -fno-builtin
+CFLAGS := -m64 -D__KERNEL__ $(LINUXINCLUDE) -O2 \
+ -fno-strict-aliasing -fPIC -mcmodel=small \
+ $(call cc-option, -ffreestanding) \
+ $(call cc-option, -fno-stack-protector)
+AFLAGS := $(CFLAGS) -D__ASSEMBLY__
LDFLAGS := -m elf_x86_64
LDFLAGS_vmlinux := -T
diff --git a/arch/x86_64/boot/compressed/head.S b/arch/x86_64/boot/compressed/head.S
index f9d5692a010..1312bfaff30 100644
--- a/arch/x86_64/boot/compressed/head.S
+++ b/arch/x86_64/boot/compressed/head.S
@@ -46,10 +46,10 @@ startup_32:
* at and where we were actually loaded at. This can only be done
* with a short local call on x86. Nothing else will tell us what
* address we are running at. The reserved chunk of the real-mode
- * data at 0x34-0x3f are used as the stack for this calculation.
- * Only 4 bytes are needed.
+ * data at 0x1e4 (defined as a scratch field) are used as the stack
+ * for this calculation. Only 4 bytes are needed.
*/
- leal 0x40(%esi), %esp
+ leal (0x1e4+4)(%esi), %esp
call 1f
1: popl %ebp
subl $1b, %ebp
diff --git a/arch/x86_64/boot/install.sh b/arch/x86_64/boot/install.sh
deleted file mode 100644
index baaa2369bdb..00000000000
--- a/arch/x86_64/boot/install.sh
+++ /dev/null
@@ -1,2 +0,0 @@
-#!/bin/sh
-. $srctree/arch/i386/boot/install.sh
diff --git a/arch/x86_64/boot/mtools.conf.in b/arch/x86_64/boot/mtools.conf.in
deleted file mode 100644
index efd6d2490c1..00000000000
--- a/arch/x86_64/boot/mtools.conf.in
+++ /dev/null
@@ -1,17 +0,0 @@
-#
-# mtools configuration file for "make (b)zdisk"
-#
-
-# Actual floppy drive
-drive a:
- file="/dev/fd0"
-
-# 1.44 MB floppy disk image
-drive v:
- file="@OBJ@/fdimage" cylinders=80 heads=2 sectors=18 filter
-
-# 2.88 MB floppy disk image (mostly for virtual uses)
-drive w:
- file="@OBJ@/fdimage" cylinders=80 heads=2 sectors=36 filter
-
-
diff --git a/arch/x86_64/boot/setup.S b/arch/x86_64/boot/setup.S
deleted file mode 100644
index e9e33f94969..00000000000
--- a/arch/x86_64/boot/setup.S
+++ /dev/null
@@ -1,826 +0,0 @@
-/*
- * setup.S Copyright (C) 1991, 1992 Linus Torvalds
- *
- * setup.s is responsible for getting the system data from the BIOS,
- * and putting them into the appropriate places in system memory.
- * both setup.s and system has been loaded by the bootblock.
- *
- * This code asks the bios for memory/disk/other parameters, and
- * puts them in a "safe" place: 0x90000-0x901FF, ie where the
- * boot-block used to be. It is then up to the protected mode
- * system to read them from there before the area is overwritten
- * for buffer-blocks.
- *
- * Move PS/2 aux init code to psaux.c
- * (troyer@saifr00.cfsat.Honeywell.COM) 03Oct92
- *
- * some changes and additional features by Christoph Niemann,
- * March 1993/June 1994 (Christoph.Niemann@linux.org)
- *
- * add APM BIOS checking by Stephen Rothwell, May 1994
- * (sfr@canb.auug.org.au)
- *
- * High load stuff, initrd support and position independency
- * by Hans Lermen & Werner Almesberger, February 1996
- * <lermen@elserv.ffm.fgan.de>, <almesber@lrc.epfl.ch>
- *
- * Video handling moved to video.S by Martin Mares, March 1996
- * <mj@k332.feld.cvut.cz>
- *
- * Extended memory detection scheme retwiddled by orc@pell.chi.il.us (david
- * parsons) to avoid loadlin confusion, July 1997
- *
- * Transcribed from Intel (as86) -> AT&T (gas) by Chris Noe, May 1999.
- * <stiker@northlink.com>
- *
- * Fix to work around buggy BIOSes which don't use carry bit correctly
- * and/or report extended memory in CX/DX for e801h memory size detection
- * call. As a result the kernel got wrong figures. The int15/e801h docs
- * from Ralf Brown interrupt list seem to indicate AX/BX should be used
- * anyway. So to avoid breaking many machines (presumably there was a reason
- * to orginally use CX/DX instead of AX/BX), we do a kludge to see
- * if CX/DX have been changed in the e801 call and if so use AX/BX .
- * Michael Miller, April 2001 <michaelm@mjmm.org>
- *
- * Added long mode checking and SSE force. March 2003, Andi Kleen.
- */
-
-#include <asm/segment.h>
-#include <linux/utsrelease.h>
-#include <linux/compile.h>
-#include <asm/boot.h>
-#include <asm/e820.h>
-#include <asm/page.h>
-#include <asm/setup.h>
-
-/* Signature words to ensure LILO loaded us right */
-#define SIG1 0xAA55
-#define SIG2 0x5A5A
-
-INITSEG = DEF_INITSEG # 0x9000, we move boot here, out of the way
-SYSSEG = DEF_SYSSEG # 0x1000, system loaded at 0x10000 (65536).
-SETUPSEG = DEF_SETUPSEG # 0x9020, this is the current segment
- # ... and the former contents of CS
-
-DELTA_INITSEG = SETUPSEG - INITSEG # 0x0020
-
-.code16
-.globl begtext, begdata, begbss, endtext, enddata, endbss
-
-.text
-begtext:
-.data
-begdata:
-.bss
-begbss:
-.text
-
-start:
- jmp trampoline
-
-# This is the setup header, and it must start at %cs:2 (old 0x9020:2)
-
- .ascii "HdrS" # header signature
- .word 0x0206 # header version number (>= 0x0105)
- # or else old loadlin-1.5 will fail)
-realmode_swtch: .word 0, 0 # default_switch, SETUPSEG
-start_sys_seg: .word SYSSEG
- .word kernel_version # pointing to kernel version string
- # above section of header is compatible
- # with loadlin-1.5 (header v1.5). Don't
- # change it.
-
-type_of_loader: .byte 0 # = 0, old one (LILO, Loadlin,
- # Bootlin, SYSLX, bootsect...)
- # See Documentation/i386/boot.txt for
- # assigned ids
-
-# flags, unused bits must be zero (RFU) bit within loadflags
-loadflags:
-LOADED_HIGH = 1 # If set, the kernel is loaded high
-CAN_USE_HEAP = 0x80 # If set, the loader also has set
- # heap_end_ptr to tell how much
- # space behind setup.S can be used for
- # heap purposes.
- # Only the loader knows what is free
-#ifndef __BIG_KERNEL__
- .byte 0
-#else
- .byte LOADED_HIGH
-#endif
-
-setup_move_size: .word 0x8000 # size to move, when setup is not
- # loaded at 0x90000. We will move setup
- # to 0x90000 then just before jumping
- # into the kernel. However, only the
- # loader knows how much data behind
- # us also needs to be loaded.
-
-code32_start: # here loaders can put a different
- # start address for 32-bit code.
-#ifndef __BIG_KERNEL__
- .long 0x1000 # 0x1000 = default for zImage
-#else
- .long 0x100000 # 0x100000 = default for big kernel
-#endif
-
-ramdisk_image: .long 0 # address of loaded ramdisk image
- # Here the loader puts the 32-bit
- # address where it loaded the image.
- # This only will be read by the kernel.
-
-ramdisk_size: .long 0 # its size in bytes
-
-bootsect_kludge:
- .long 0 # obsolete
-
-heap_end_ptr: .word modelist+1024 # (Header version 0x0201 or later)
- # space from here (exclusive) down to
- # end of setup code can be used by setup
- # for local heap purposes.
-
-pad1: .word 0
-cmd_line_ptr: .long 0 # (Header version 0x0202 or later)
- # If nonzero, a 32-bit pointer
- # to the kernel command line.
- # The command line should be
- # located between the start of
- # setup and the end of low
- # memory (0xa0000), or it may
- # get overwritten before it
- # gets read. If this field is
- # used, there is no longer
- # anything magical about the
- # 0x90000 segment; the setup
- # can be located anywhere in
- # low memory 0x10000 or higher.
-
-ramdisk_max: .long 0xffffffff
-kernel_alignment: .long 0x200000 # physical addr alignment required for
- # protected mode relocatable kernel
-#ifdef CONFIG_RELOCATABLE
-relocatable_kernel: .byte 1
-#else
-relocatable_kernel: .byte 0
-#endif
-pad2: .byte 0
-pad3: .word 0
-
-cmdline_size: .long COMMAND_LINE_SIZE-1 #length of the command line,
- #added with boot protocol
- #version 2.06
-
-trampoline: call start_of_setup
- .align 16
- # The offset at this point is 0x240
- .space (0xeff-0x240+1) # E820 & EDD space (ending at 0xeff)
-# End of setup header #####################################################
-
-start_of_setup:
-# Bootlin depends on this being done early
- movw $0x01500, %ax
- movb $0x81, %dl
- int $0x13
-
-#ifdef SAFE_RESET_DISK_CONTROLLER
-# Reset the disk controller.
- movw $0x0000, %ax
- movb $0x80, %dl
- int $0x13
-#endif
-
-# Set %ds = %cs, we know that SETUPSEG = %cs at this point
- movw %cs, %ax # aka SETUPSEG
- movw %ax, %ds
-# Check signature at end of setup
- cmpw $SIG1, setup_sig1
- jne bad_sig
-
- cmpw $SIG2, setup_sig2
- jne bad_sig
-
- jmp good_sig1
-
-# Routine to print asciiz string at ds:si
-prtstr:
- lodsb
- andb %al, %al
- jz fin
-
- call prtchr
- jmp prtstr
-
-fin: ret
-
-# Space printing
-prtsp2: call prtspc # Print double space
-prtspc: movb $0x20, %al # Print single space (note: fall-thru)
-
-prtchr:
- pushw %ax
- pushw %cx
- movw $0007,%bx
- movw $0x01, %cx
- movb $0x0e, %ah
- int $0x10
- popw %cx
- popw %ax
- ret
-
-beep: movb $0x07, %al
- jmp prtchr
-
-no_sig_mess: .string "No setup signature found ..."
-
-good_sig1:
- jmp good_sig
-
-# We now have to find the rest of the setup code/data
-bad_sig:
- movw %cs, %ax # SETUPSEG
- subw $DELTA_INITSEG, %ax # INITSEG
- movw %ax, %ds
- xorb %bh, %bh
- movb (497), %bl # get setup sect from bootsect
- subw $4, %bx # LILO loads 4 sectors of setup
- shlw $8, %bx # convert to words (1sect=2^8 words)
- movw %bx, %cx
- shrw $3, %bx # convert to segment
- addw $SYSSEG, %bx
- movw %bx, %cs:start_sys_seg
-# Move rest of setup code/data to here
- movw $2048, %di # four sectors loaded by LILO
- subw %si, %si
- movw %cs, %ax # aka SETUPSEG
- movw %ax, %es
- movw $SYSSEG, %ax
- movw %ax, %ds
- rep
- movsw
- movw %cs, %ax # aka SETUPSEG
- movw %ax, %ds
- cmpw $SIG1, setup_sig1
- jne no_sig
-
- cmpw $SIG2, setup_sig2
- jne no_sig
-
- jmp good_sig
-
-no_sig:
- lea no_sig_mess, %si
- call prtstr
-
-no_sig_loop:
- jmp no_sig_loop
-
-good_sig:
- movw %cs, %ax # aka SETUPSEG
- subw $DELTA_INITSEG, %ax # aka INITSEG
- movw %ax, %ds
-# Check if an old loader tries to load a big-kernel
- testb $LOADED_HIGH, %cs:loadflags # Do we have a big kernel?
- jz loader_ok # No, no danger for old loaders.
-
- cmpb $0, %cs:type_of_loader # Do we have a loader that
- # can deal with us?
- jnz loader_ok # Yes, continue.
-
- pushw %cs # No, we have an old loader,
- popw %ds # die.
- lea loader_panic_mess, %si
- call prtstr
-
- jmp no_sig_loop
-
-loader_panic_mess: .string "Wrong loader, giving up..."
-
-loader_ok:
- /* check for long mode. */
- /* we have to do this before the VESA setup, otherwise the user
- can't see the error message. */
-
- pushw %ds
- movw %cs,%ax
- movw %ax,%ds
-
- call verify_cpu
- testl %eax,%eax
- jz sse_ok
-
-no_longmode:
- call beep
- lea long_mode_panic,%si
- call prtstr
-no_longmode_loop:
- jmp no_longmode_loop
-long_mode_panic:
- .string "Your CPU does not support long mode. Use a 32bit distribution."
- .byte 0
-
-#include "../kernel/verify_cpu.S"
-sse_ok:
- popw %ds
-
-# tell BIOS we want to go to long mode
- movl $0xec00,%eax # declare target operating mode
- movl $2,%ebx # long mode
- int $0x15
-
-# Get memory size (extended mem, kB)
-
- xorl %eax, %eax
- movl %eax, (0x1e0)
-#ifndef STANDARD_MEMORY_BIOS_CALL
- movb %al, (E820NR)
-# Try three different memory detection schemes. First, try
-# e820h, which lets us assemble a memory map, then try e801h,
-# which returns a 32-bit memory size, and finally 88h, which
-# returns 0-64m
-
-# method E820H:
-# the memory map from hell. e820h returns memory classified into
-# a whole bunch of different types, and allows memory holes and
-# everything. We scan through this memory map and build a list
-# of the first 32 memory areas, which we return at [E820MAP].
-# This is documented at http://www.acpi.info/, in the ACPI 2.0 specification.
-
-#define SMAP 0x534d4150
-
-meme820:
- xorl %ebx, %ebx # continuation counter
- movw $E820MAP, %di # point into the whitelist
- # so we can have the bios
- # directly write into it.
-
-jmpe820:
- movl $0x0000e820, %eax # e820, upper word zeroed
- movl $SMAP, %edx # ascii 'SMAP'
- movl $20, %ecx # size of the e820rec
- pushw %ds # data record.
- popw %es
- int $0x15 # make the call
- jc bail820 # fall to e801 if it fails
-
- cmpl $SMAP, %eax # check the return is `SMAP'
- jne bail820 # fall to e801 if it fails
-
-# cmpl $1, 16(%di) # is this usable memory?
-# jne again820
-
- # If this is usable memory, we save it by simply advancing %di by
- # sizeof(e820rec).
- #
-good820:
- movb (E820NR), %al # up to 128 entries
- cmpb $E820MAX, %al
- jae bail820
-
- incb (E820NR)
- movw %di, %ax
- addw $20, %ax
- movw %ax, %di
-again820:
- cmpl $0, %ebx # check to see if
- jne jmpe820 # %ebx is set to EOF
-bail820:
-
-
-# method E801H:
-# memory size is in 1k chunksizes, to avoid confusing loadlin.
-# we store the 0xe801 memory size in a completely different place,
-# because it will most likely be longer than 16 bits.
-# (use 1e0 because that's what Larry Augustine uses in his
-# alternative new memory detection scheme, and it's sensible
-# to write everything into the same place.)
-
-meme801:
- stc # fix to work around buggy
- xorw %cx,%cx # BIOSes which don't clear/set
- xorw %dx,%dx # carry on pass/error of
- # e801h memory size call
- # or merely pass cx,dx though
- # without changing them.
- movw $0xe801, %ax
- int $0x15
- jc mem88
-
- cmpw $0x0, %cx # Kludge to handle BIOSes
- jne e801usecxdx # which report their extended
- cmpw $0x0, %dx # memory in AX/BX rather than
- jne e801usecxdx # CX/DX. The spec I have read
- movw %ax, %cx # seems to indicate AX/BX
- movw %bx, %dx # are more reasonable anyway...
-
-e801usecxdx:
- andl $0xffff, %edx # clear sign extend
- shll $6, %edx # and go from 64k to 1k chunks
- movl %edx, (0x1e0) # store extended memory size
- andl $0xffff, %ecx # clear sign extend
- addl %ecx, (0x1e0) # and add lower memory into
- # total size.
-
-# Ye Olde Traditional Methode. Returns the memory size (up to 16mb or
-# 64mb, depending on the bios) in ax.
-mem88:
-
-#endif
- movb $0x88, %ah
- int $0x15
- movw %ax, (2)
-
-# Set the keyboard repeat rate to the max
- movw $0x0305, %ax
- xorw %bx, %bx
- int $0x16
-
-# Check for video adapter and its parameters and allow the
-# user to browse video modes.
- call video # NOTE: we need %ds pointing
- # to bootsector
-
-# Get hd0 data...
- xorw %ax, %ax
- movw %ax, %ds
- ldsw (4 * 0x41), %si
- movw %cs, %ax # aka SETUPSEG
- subw $DELTA_INITSEG, %ax # aka INITSEG
- pushw %ax
- movw %ax, %es
- movw $0x0080, %di
- movw $0x10, %cx
- pushw %cx
- cld
- rep
- movsb
-# Get hd1 data...
- xorw %ax, %ax
- movw %ax, %ds
- ldsw (4 * 0x46), %si
- popw %cx
- popw %es
- movw $0x0090, %di
- rep
- movsb
-# Check that there IS a hd1 :-)
- movw $0x01500, %ax
- movb $0x81, %dl
- int $0x13
- jc no_disk1
-
- cmpb $3, %ah
- je is_disk1
-
-no_disk1:
- movw %cs, %ax # aka SETUPSEG
- subw $DELTA_INITSEG, %ax # aka INITSEG
- movw %ax, %es
- movw $0x0090, %di
- movw $0x10, %cx
- xorw %ax, %ax
- cld
- rep
- stosb
-is_disk1:
-
-# Check for PS/2 pointing device
- movw %cs, %ax # aka SETUPSEG
- subw $DELTA_INITSEG, %ax # aka INITSEG
- movw %ax, %ds
- movb $0, (0x1ff) # default is no pointing device
- int $0x11 # int 0x11: equipment list
- testb $0x04, %al # check if mouse installed
- jz no_psmouse
-
- movb $0xAA, (0x1ff) # device present
-no_psmouse:
-
-#include "../../i386/boot/edd.S"
-
-# Now we want to move to protected mode ...
- cmpw $0, %cs:realmode_swtch
- jz rmodeswtch_normal
-
- lcall *%cs:realmode_swtch
-
- jmp rmodeswtch_end
-
-rmodeswtch_normal:
- pushw %cs
- call default_switch
-
-rmodeswtch_end:
-# we get the code32 start address and modify the below 'jmpi'
-# (loader may have changed it)
- movl %cs:code32_start, %eax
- movl %eax, %cs:code32
-
-# Now we move the system to its rightful place ... but we check if we have a
-# big-kernel. In that case we *must* not move it ...
- testb $LOADED_HIGH, %cs:loadflags
- jz do_move0 # .. then we have a normal low
- # loaded zImage
- # .. or else we have a high
- # loaded bzImage
- jmp end_move # ... and we skip moving
-
-do_move0:
- movw $0x100, %ax # start of destination segment
- movw %cs, %bp # aka SETUPSEG
- subw $DELTA_INITSEG, %bp # aka INITSEG
- movw %cs:start_sys_seg, %bx # start of source segment
- cld
-do_move:
- movw %ax, %es # destination segment
- incb %ah # instead of add ax,#0x100
- movw %bx, %ds # source segment
- addw $0x100, %bx
- subw %di, %di
- subw %si, %si
- movw $0x800, %cx
- rep
- movsw
- cmpw %bp, %bx # assume start_sys_seg > 0x200,
- # so we will perhaps read one
- # page more than needed, but
- # never overwrite INITSEG
- # because destination is a
- # minimum one page below source
- jb do_move
-
-end_move:
-# then we load the segment descriptors
- movw %cs, %ax # aka SETUPSEG
- movw %ax, %ds
-
-# Check whether we need to be downward compatible with version <=201
- cmpl $0, cmd_line_ptr
- jne end_move_self # loader uses version >=202 features
- cmpb $0x20, type_of_loader
- je end_move_self # bootsect loader, we know of it
-
-# Boot loader doesnt support boot protocol version 2.02.
-# If we have our code not at 0x90000, we need to move it there now.
-# We also then need to move the params behind it (commandline)
-# Because we would overwrite the code on the current IP, we move
-# it in two steps, jumping high after the first one.
- movw %cs, %ax
- cmpw $SETUPSEG, %ax
- je end_move_self
-
- cli # make sure we really have
- # interrupts disabled !
- # because after this the stack
- # should not be used
- subw $DELTA_INITSEG, %ax # aka INITSEG
- movw %ss, %dx
- cmpw %ax, %dx
- jb move_self_1
-
- addw $INITSEG, %dx
- subw %ax, %dx # this will go into %ss after
- # the move
-move_self_1:
- movw %ax, %ds
- movw $INITSEG, %ax # real INITSEG
- movw %ax, %es
- movw %cs:setup_move_size, %cx
- std # we have to move up, so we use
- # direction down because the
- # areas may overlap
- movw %cx, %di
- decw %di
- movw %di, %si
- subw $move_self_here+0x200, %cx
- rep
- movsb
- ljmp $SETUPSEG, $move_self_here
-
-move_self_here:
- movw $move_self_here+0x200, %cx
- rep
- movsb
- movw $SETUPSEG, %ax
- movw %ax, %ds
- movw %dx, %ss
-end_move_self: # now we are at the right place
- lidt idt_48 # load idt with 0,0
- xorl %eax, %eax # Compute gdt_base
- movw %ds, %ax # (Convert %ds:gdt to a linear ptr)
- shll $4, %eax
- addl $gdt, %eax
- movl %eax, (gdt_48+2)
- lgdt gdt_48 # load gdt with whatever is
- # appropriate
-
-# that was painless, now we enable a20
- call empty_8042
-
- movb $0xD1, %al # command write
- outb %al, $0x64
- call empty_8042
-
- movb $0xDF, %al # A20 on
- outb %al, $0x60
- call empty_8042
-
-#
-# You must preserve the other bits here. Otherwise embarrasing things
-# like laptops powering off on boot happen. Corrected version by Kira
-# Brown from Linux 2.2
-#
- inb $0x92, %al #
- orb $02, %al # "fast A20" version
- outb %al, $0x92 # some chips have only this
-
-# wait until a20 really *is* enabled; it can take a fair amount of
-# time on certain systems; Toshiba Tecras are known to have this
-# problem. The memory location used here (0x200) is the int 0x80
-# vector, which should be safe to use.
-
- xorw %ax, %ax # segment 0x0000
- movw %ax, %fs
- decw %ax # segment 0xffff (HMA)
- movw %ax, %gs
-a20_wait:
- incw %ax # unused memory location <0xfff0
- movw %ax, %fs:(0x200) # we use the "int 0x80" vector
- cmpw %gs:(0x210), %ax # and its corresponding HMA addr
- je a20_wait # loop until no longer aliased
-
-# make sure any possible coprocessor is properly reset..
- xorw %ax, %ax
- outb %al, $0xf0
- call delay
-
- outb %al, $0xf1
- call delay
-
-# well, that went ok, I hope. Now we mask all interrupts - the rest
-# is done in init_IRQ().
- movb $0xFF, %al # mask all interrupts for now
- outb %al, $0xA1
- call delay
-
- movb $0xFB, %al # mask all irq's but irq2 which
- outb %al, $0x21 # is cascaded
-
-# Well, that certainly wasn't fun :-(. Hopefully it works, and we don't
-# need no steenking BIOS anyway (except for the initial loading :-).
-# The BIOS-routine wants lots of unnecessary data, and it's less
-# "interesting" anyway. This is how REAL programmers do it.
-#
-# Well, now's the time to actually move into protected mode. To make
-# things as simple as possible, we do no register set-up or anything,
-# we let the gnu-compiled 32-bit programs do that. We just jump to
-# absolute address 0x1000 (or the loader supplied one),
-# in 32-bit protected mode.
-#
-# Note that the short jump isn't strictly needed, although there are
-# reasons why it might be a good idea. It won't hurt in any case.
- movw $1, %ax # protected mode (PE) bit
- lmsw %ax # This is it!
- jmp flush_instr
-
-flush_instr:
- xorw %bx, %bx # Flag to indicate a boot
- xorl %esi, %esi # Pointer to real-mode code
- movw %cs, %si
- subw $DELTA_INITSEG, %si
- shll $4, %esi # Convert to 32-bit pointer
-# NOTE: For high loaded big kernels we need a
-# jmpi 0x100000,__KERNEL_CS
-#
-# but we yet haven't reloaded the CS register, so the default size
-# of the target offset still is 16 bit.
-# However, using an operand prefix (0x66), the CPU will properly
-# take our 48 bit far pointer. (INTeL 80386 Programmer's Reference
-# Manual, Mixing 16-bit and 32-bit code, page 16-6)
-
- .byte 0x66, 0xea # prefix + jmpi-opcode
-code32: .long 0x1000 # will be set to 0x100000
- # for big kernels
- .word __KERNEL_CS
-
-# Here's a bunch of information about your current kernel..
-kernel_version: .ascii UTS_RELEASE
- .ascii " ("
- .ascii LINUX_COMPILE_BY
- .ascii "@"
- .ascii LINUX_COMPILE_HOST
- .ascii ") "
- .ascii UTS_VERSION
- .byte 0
-
-# This is the default real mode switch routine.
-# to be called just before protected mode transition
-default_switch:
- cli # no interrupts allowed !
- movb $0x80, %al # disable NMI for bootup
- # sequence
- outb %al, $0x70
- lret
-
-
-# This routine checks that the keyboard command queue is empty
-# (after emptying the output buffers)
-#
-# Some machines have delusions that the keyboard buffer is always full
-# with no keyboard attached...
-#
-# If there is no keyboard controller, we will usually get 0xff
-# to all the reads. With each IO taking a microsecond and
-# a timeout of 100,000 iterations, this can take about half a
-# second ("delay" == outb to port 0x80). That should be ok,
-# and should also be plenty of time for a real keyboard controller
-# to empty.
-#
-
-empty_8042:
- pushl %ecx
- movl $100000, %ecx
-
-empty_8042_loop:
- decl %ecx
- jz empty_8042_end_loop
-
- call delay
-
- inb $0x64, %al # 8042 status port
- testb $1, %al # output buffer?
- jz no_output
-
- call delay
- inb $0x60, %al # read it
- jmp empty_8042_loop
-
-no_output:
- testb $2, %al # is input buffer full?
- jnz empty_8042_loop # yes - loop
-empty_8042_end_loop:
- popl %ecx
- ret
-
-# Read the cmos clock. Return the seconds in al
-gettime:
- pushw %cx
- movb $0x02, %ah
- int $0x1a
- movb %dh, %al # %dh contains the seconds
- andb $0x0f, %al
- movb %dh, %ah
- movb $0x04, %cl
- shrb %cl, %ah
- aad
- popw %cx
- ret
-
-# Delay is needed after doing I/O
-delay:
- outb %al,$0x80
- ret
-
-# Descriptor tables
-gdt:
- .word 0, 0, 0, 0 # dummy
-
- .word 0, 0, 0, 0 # unused
-
- .word 0xFFFF # 4Gb - (0x100000*0x1000 = 4Gb)
- .word 0 # base address = 0
- .word 0x9A00 # code read/exec
- .word 0x00CF # granularity = 4096, 386
- # (+5th nibble of limit)
-
- .word 0xFFFF # 4Gb - (0x100000*0x1000 = 4Gb)
- .word 0 # base address = 0
- .word 0x9200 # data read/write
- .word 0x00CF # granularity = 4096, 386
- # (+5th nibble of limit)
-gdt_end:
-idt_48:
- .word 0 # idt limit = 0
- .word 0, 0 # idt base = 0L
-gdt_48:
- .word gdt_end-gdt-1 # gdt limit
- .word 0, 0 # gdt base (filled in later)
-
-# Include video setup & detection code
-
-#include "../../i386/boot/video.S"
-
-# Setup signature -- must be last
-setup_sig1: .word SIG1
-setup_sig2: .word SIG2
-
-# After this point, there is some free space which is used by the video mode
-# handling code to store the temporary mode table (not used by the kernel).
-
-modelist:
-
-.text
-endtext:
-.data
-enddata:
-.bss
-endbss:
diff --git a/arch/x86_64/boot/tools/build.c b/arch/x86_64/boot/tools/build.c
deleted file mode 100644
index eae86691709..00000000000
--- a/arch/x86_64/boot/tools/build.c
+++ /dev/null
@@ -1,185 +0,0 @@
-/*
- * Copyright (C) 1991, 1992 Linus Torvalds
- * Copyright (C) 1997 Martin Mares
- */
-
-/*
- * This file builds a disk-image from three different files:
- *
- * - bootsect: compatibility mbr which prints an error message if
- * someone tries to boot the kernel directly.
- * - setup: 8086 machine code, sets up system parm
- * - system: 80386 code for actual system
- *
- * It does some checking that all files are of the correct type, and
- * just writes the result to stdout, removing headers and padding to
- * the right amount. It also writes some system data to stderr.
- */
-
-/*
- * Changes by tytso to allow root device specification
- * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
- * Cross compiling fixes by Gertjan van Wingerde, July 1996
- * Rewritten by Martin Mares, April 1997
- */
-
-#include <stdio.h>
-#include <string.h>
-#include <stdlib.h>
-#include <stdarg.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/sysmacros.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <asm/boot.h>
-
-typedef unsigned char byte;
-typedef unsigned short word;
-typedef unsigned long u32;
-
-#define DEFAULT_MAJOR_ROOT 0
-#define DEFAULT_MINOR_ROOT 0
-
-/* Minimal number of setup sectors (see also bootsect.S) */
-#define SETUP_SECTS 4
-
-byte buf[1024];
-int fd;
-int is_big_kernel;
-
-void die(const char * str, ...)
-{
- va_list args;
- va_start(args, str);
- vfprintf(stderr, str, args);
- fputc('\n', stderr);
- exit(1);
-}
-
-void file_open(const char *name)
-{
- if ((fd = open(name, O_RDONLY, 0)) < 0)
- die("Unable to open `%s': %m", name);
-}
-
-void usage(void)
-{
- die("Usage: build [-b] bootsect setup system [rootdev] [> image]");
-}
-
-int main(int argc, char ** argv)
-{
- unsigned int i, c, sz, setup_sectors;
- u32 sys_size;
- byte major_root, minor_root;
- struct stat sb;
-
- if (argc > 2 && !strcmp(argv[1], "-b"))
- {
- is_big_kernel = 1;
- argc--, argv++;
- }
- if ((argc < 4) || (argc > 5))
- usage();
- if (argc > 4) {
- if (!strcmp(argv[4], "CURRENT")) {
- if (stat("/", &sb)) {
- perror("/");
- die("Couldn't stat /");
- }
- major_root = major(sb.st_dev);
- minor_root = minor(sb.st_dev);
- } else if (strcmp(argv[4], "FLOPPY")) {
- if (stat(argv[4], &sb)) {
- perror(argv[4]);
- die("Couldn't stat root device.");
- }
- major_root = major(sb.st_rdev);
- minor_root = minor(sb.st_rdev);
- } else {
- major_root = 0;
- minor_root = 0;
- }
- } else {
- major_root = DEFAULT_MAJOR_ROOT;
- minor_root = DEFAULT_MINOR_ROOT;
- }
- fprintf(stderr, "Root device is (%d, %d)\n", major_root, minor_root);
-
- file_open(argv[1]);
- i = read(fd, buf, sizeof(buf));
- fprintf(stderr,"Boot sector %d bytes.\n",i);
- if (i != 512)
- die("Boot block must be exactly 512 bytes");
- if (buf[510] != 0x55 || buf[511] != 0xaa)
- die("Boot block hasn't got boot flag (0xAA55)");
- buf[508] = minor_root;
- buf[509] = major_root;
- if (write(1, buf, 512) != 512)
- die("Write call failed");
- close (fd);
-
- file_open(argv[2]); /* Copy the setup code */
- for (i=0 ; (c=read(fd, buf, sizeof(buf)))>0 ; i+=c )
- if (write(1, buf, c) != c)
- die("Write call failed");
- if (c != 0)
- die("read-error on `setup'");
- close (fd);
-
- setup_sectors = (i + 511) / 512; /* Pad unused space with zeros */
- /* for compatibility with ancient versions of LILO. */
- if (setup_sectors < SETUP_SECTS)
- setup_sectors = SETUP_SECTS;
- fprintf(stderr, "Setup is %d bytes.\n", i);
- memset(buf, 0, sizeof(buf));
- while (i < setup_sectors * 512) {
- c = setup_sectors * 512 - i;
- if (c > sizeof(buf))
- c = sizeof(buf);
- if (write(1, buf, c) != c)
- die("Write call failed");
- i += c;
- }
-
- file_open(argv[3]);
- if (fstat (fd, &sb))
- die("Unable to stat `%s': %m", argv[3]);
- sz = sb.st_size;
- fprintf (stderr, "System is %d kB\n", sz/1024);
- sys_size = (sz + 15) / 16;
- if (!is_big_kernel && sys_size > DEF_SYSSIZE)
- die("System is too big. Try using bzImage or modules.");
- while (sz > 0) {
- int l, n;
-
- l = (sz > sizeof(buf)) ? sizeof(buf) : sz;
- if ((n=read(fd, buf, l)) != l) {
- if (n < 0)
- die("Error reading %s: %m", argv[3]);
- else
- die("%s: Unexpected EOF", argv[3]);
- }
- if (write(1, buf, l) != l)
- die("Write failed");
- sz -= l;
- }
- close(fd);
-
- if (lseek(1, 497, SEEK_SET) != 497) /* Write sizes to the bootsector */
- die("Output: seek failed");
- buf[0] = setup_sectors;
- if (write(1, buf, 1) != 1)
- die("Write of setup sector count failed");
- if (lseek(1, 500, SEEK_SET) != 500)
- die("Output: seek failed");
- buf[0] = (sys_size & 0xff);
- buf[1] = ((sys_size >> 8) & 0xff);
- buf[2] = ((sys_size >> 16) & 0xff);
- buf[3] = ((sys_size >> 24) & 0xff);
- if (write(1, buf, 4) != 4)
- die("Write of image length failed");
-
- return 0; /* Everything is OK */
-}
diff --git a/arch/x86_64/defconfig b/arch/x86_64/defconfig
index 40178e5c310..b7c4cd04bfc 100644
--- a/arch/x86_64/defconfig
+++ b/arch/x86_64/defconfig
@@ -1,19 +1,22 @@
#
# Automatically generated make config: don't edit
-# Linux kernel version: 2.6.22-rc2
-# Mon May 21 13:23:40 2007
+# Linux kernel version: 2.6.22-git14
+# Fri Jul 20 09:53:15 2007
#
CONFIG_X86_64=y
CONFIG_64BIT=y
CONFIG_X86=y
CONFIG_GENERIC_TIME=y
CONFIG_GENERIC_TIME_VSYSCALL=y
+CONFIG_GENERIC_CMOS_UPDATE=y
CONFIG_ZONE_DMA32=y
CONFIG_LOCKDEP_SUPPORT=y
CONFIG_STACKTRACE_SUPPORT=y
CONFIG_SEMAPHORE_SLEEPERS=y
CONFIG_MMU=y
CONFIG_ZONE_DMA=y
+CONFIG_QUICKLIST=y
+CONFIG_NR_QUICK=2
CONFIG_RWSEM_GENERIC_SPINLOCK=y
CONFIG_GENERIC_HWEIGHT=y
CONFIG_GENERIC_CALIBRATE_DELAY=y
@@ -44,19 +47,18 @@ CONFIG_LOCALVERSION=""
CONFIG_LOCALVERSION_AUTO=y
CONFIG_SWAP=y
CONFIG_SYSVIPC=y
-# CONFIG_IPC_NS is not set
CONFIG_SYSVIPC_SYSCTL=y
CONFIG_POSIX_MQUEUE=y
# CONFIG_BSD_PROCESS_ACCT is not set
# CONFIG_TASKSTATS is not set
-# CONFIG_UTS_NS is not set
+# CONFIG_USER_NS is not set
# CONFIG_AUDIT is not set
CONFIG_IKCONFIG=y
CONFIG_IKCONFIG_PROC=y
CONFIG_LOG_BUF_SHIFT=18
# CONFIG_CPUSETS is not set
CONFIG_SYSFS_DEPRECATED=y
-# CONFIG_RELAY is not set
+CONFIG_RELAY=y
CONFIG_BLK_DEV_INITRD=y
CONFIG_INITRAMFS_SOURCE=""
CONFIG_CC_OPTIMIZE_FOR_SIZE=y
@@ -86,10 +88,6 @@ CONFIG_SLAB=y
CONFIG_RT_MUTEXES=y
# CONFIG_TINY_SHMEM is not set
CONFIG_BASE_SMALL=0
-
-#
-# Loadable module support
-#
CONFIG_MODULES=y
CONFIG_MODULE_UNLOAD=y
CONFIG_MODULE_FORCE_UNLOAD=y
@@ -97,12 +95,9 @@ CONFIG_MODULE_FORCE_UNLOAD=y
# CONFIG_MODULE_SRCVERSION_ALL is not set
# CONFIG_KMOD is not set
CONFIG_STOP_MACHINE=y
-
-#
-# Block layer
-#
CONFIG_BLOCK=y
# CONFIG_BLK_DEV_IO_TRACE is not set
+# CONFIG_BLK_DEV_BSG is not set
#
# IO Schedulers
@@ -165,9 +160,12 @@ CONFIG_SPLIT_PTLOCK_CPUS=4
CONFIG_MIGRATION=y
CONFIG_RESOURCES_64BIT=y
CONFIG_ZONE_DMA_FLAG=1
+CONFIG_BOUNCE=y
+CONFIG_VIRT_TO_BUS=y
CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID=y
CONFIG_OUT_OF_LINE_PFN_TO_PAGE=y
CONFIG_NR_CPUS=32
+CONFIG_PHYSICAL_ALIGN=0x200000
CONFIG_HOTPLUG_CPU=y
CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y
CONFIG_HPET_TIMER=y
@@ -180,7 +178,7 @@ CONFIG_X86_MCE_INTEL=y
CONFIG_X86_MCE_AMD=y
# CONFIG_KEXEC is not set
# CONFIG_CRASH_DUMP is not set
-CONFIG_RELOCATABLE=y
+# CONFIG_RELOCATABLE is not set
CONFIG_PHYSICAL_START=0x200000
CONFIG_SECCOMP=y
# CONFIG_CC_STACKPROTECTOR is not set
@@ -201,7 +199,6 @@ CONFIG_GENERIC_PENDING_IRQ=y
CONFIG_PM=y
# CONFIG_PM_LEGACY is not set
# CONFIG_PM_DEBUG is not set
-# CONFIG_PM_SYSFS_DEPRECATED is not set
CONFIG_SOFTWARE_SUSPEND=y
CONFIG_PM_STD_PARTITION=""
CONFIG_SUSPEND_SMP=y
@@ -248,7 +245,7 @@ CONFIG_CPU_FREQ_GOV_PERFORMANCE=y
# CONFIG_CPU_FREQ_GOV_POWERSAVE is not set
CONFIG_CPU_FREQ_GOV_USERSPACE=y
CONFIG_CPU_FREQ_GOV_ONDEMAND=y
-# CONFIG_CPU_FREQ_GOV_CONSERVATIVE is not set
+CONFIG_CPU_FREQ_GOV_CONSERVATIVE=y
#
# CPUFreq processor drivers
@@ -351,20 +348,8 @@ CONFIG_IPV6_SIT=y
# CONFIG_IPV6_MULTIPLE_TABLES is not set
# CONFIG_NETWORK_SECMARK is not set
# CONFIG_NETFILTER is not set
-
-#
-# DCCP Configuration (EXPERIMENTAL)
-#
# CONFIG_IP_DCCP is not set
-
-#
-# SCTP Configuration (EXPERIMENTAL)
-#
# CONFIG_IP_SCTP is not set
-
-#
-# TIPC Configuration (EXPERIMENTAL)
-#
# CONFIG_TIPC is not set
# CONFIG_ATM is not set
# CONFIG_BRIDGE is not set
@@ -401,6 +386,7 @@ CONFIG_IPV6_SIT=y
# CONFIG_MAC80211 is not set
# CONFIG_IEEE80211 is not set
# CONFIG_RFKILL is not set
+# CONFIG_NET_9P is not set
#
# Device Drivers
@@ -415,21 +401,9 @@ CONFIG_FW_LOADER=y
# CONFIG_DEBUG_DRIVER is not set
# CONFIG_DEBUG_DEVRES is not set
# CONFIG_SYS_HYPERVISOR is not set
-
-#
-# Connector - unified userspace <-> kernelspace linker
-#
# CONFIG_CONNECTOR is not set
# CONFIG_MTD is not set
-
-#
-# Parallel port support
-#
# CONFIG_PARPORT is not set
-
-#
-# Plug and Play support
-#
CONFIG_PNP=y
# CONFIG_PNP_DEBUG is not set
@@ -437,10 +411,7 @@ CONFIG_PNP=y
# Protocols
#
CONFIG_PNPACPI=y
-
-#
-# Block devices
-#
+CONFIG_BLK_DEV=y
CONFIG_BLK_DEV_FD=y
# CONFIG_BLK_CPQ_DA is not set
# CONFIG_BLK_CPQ_CISS_DA is not set
@@ -458,17 +429,14 @@ CONFIG_BLK_DEV_RAM_SIZE=4096
CONFIG_BLK_DEV_RAM_BLOCKSIZE=1024
# CONFIG_CDROM_PKTCDVD is not set
# CONFIG_ATA_OVER_ETH is not set
-
-#
-# Misc devices
-#
+CONFIG_MISC_DEVICES=y
# CONFIG_IBM_ASM is not set
# CONFIG_PHANTOM is not set
+# CONFIG_EEPROM_93CX6 is not set
# CONFIG_SGI_IOC4 is not set
# CONFIG_TIFM_CORE is not set
# CONFIG_SONY_LAPTOP is not set
# CONFIG_THINKPAD_ACPI is not set
-# CONFIG_BLINK is not set
CONFIG_IDE=y
CONFIG_BLK_DEV_IDE=y
@@ -539,6 +507,7 @@ CONFIG_BLK_DEV_IDEDMA=y
#
# CONFIG_RAID_ATTRS is not set
CONFIG_SCSI=y
+CONFIG_SCSI_DMA=y
# CONFIG_SCSI_TGT is not set
CONFIG_SCSI_NETLINK=y
# CONFIG_SCSI_PROC_FS is not set
@@ -590,11 +559,9 @@ CONFIG_AIC79XX_DEBUG_MASK=0
# CONFIG_AIC79XX_REG_PRETTY_PRINT is not set
# CONFIG_SCSI_AIC94XX is not set
# CONFIG_SCSI_ARCMSR is not set
-CONFIG_MEGARAID_NEWGEN=y
-CONFIG_MEGARAID_MM=y
-CONFIG_MEGARAID_MAILBOX=y
+# CONFIG_MEGARAID_NEWGEN is not set
# CONFIG_MEGARAID_LEGACY is not set
-CONFIG_MEGARAID_SAS=y
+# CONFIG_MEGARAID_SAS is not set
# CONFIG_SCSI_HPTIOP is not set
# CONFIG_SCSI_BUSLOGIC is not set
# CONFIG_SCSI_DMX3191D is not set
@@ -614,7 +581,6 @@ CONFIG_MEGARAID_SAS=y
# CONFIG_SCSI_DC395x is not set
# CONFIG_SCSI_DC390T is not set
# CONFIG_SCSI_DEBUG is not set
-# CONFIG_SCSI_ESP_CORE is not set
# CONFIG_SCSI_SRP is not set
CONFIG_ATA=y
# CONFIG_ATA_NONSTANDARD is not set
@@ -671,10 +637,6 @@ CONFIG_SATA_VIA=y
# CONFIG_PATA_SIS is not set
# CONFIG_PATA_VIA is not set
# CONFIG_PATA_WINBOND is not set
-
-#
-# Multi-device support (RAID and LVM)
-#
CONFIG_MD=y
# CONFIG_BLK_DEV_MD is not set
CONFIG_BLK_DEV_DM=y
@@ -692,7 +654,7 @@ CONFIG_BLK_DEV_DM=y
CONFIG_FUSION=y
CONFIG_FUSION_SPI=y
# CONFIG_FUSION_FC is not set
-CONFIG_FUSION_SAS=y
+# CONFIG_FUSION_SAS is not set
CONFIG_FUSION_MAX_SGE=128
# CONFIG_FUSION_CTL is not set
@@ -710,7 +672,10 @@ CONFIG_IEEE1394=y
#
# Controllers
#
-# CONFIG_IEEE1394_PCILYNX is not set
+
+#
+# Texas Instruments PCILynx requires I2C
+#
CONFIG_IEEE1394_OHCI1394=y
#
@@ -722,32 +687,19 @@ CONFIG_IEEE1394_OHCI1394=y
# CONFIG_IEEE1394_ETH1394 is not set
# CONFIG_IEEE1394_DV1394 is not set
CONFIG_IEEE1394_RAWIO=y
-
-#
-# I2O device support
-#
# CONFIG_I2O is not set
-# CONFIG_MACINTOSH_DRIVERS is not set
-
-#
-# Network device support
-#
+CONFIG_MACINTOSH_DRIVERS=y
+# CONFIG_MAC_EMUMOUSEBTN is not set
CONFIG_NETDEVICES=y
+CONFIG_NETDEVICES_MULTIQUEUE=y
# CONFIG_DUMMY is not set
# CONFIG_BONDING is not set
+# CONFIG_MACVLAN is not set
# CONFIG_EQUALIZER is not set
CONFIG_TUN=y
# CONFIG_NET_SB1000 is not set
-
-#
-# ARCnet devices
-#
# CONFIG_ARCNET is not set
# CONFIG_PHYLIB is not set
-
-#
-# Ethernet (10 or 100Mbit)
-#
CONFIG_NET_ETHERNET=y
CONFIG_MII=y
# CONFIG_HAPPYMEAL is not set
@@ -756,10 +708,6 @@ CONFIG_MII=y
CONFIG_NET_VENDOR_3COM=y
CONFIG_VORTEX=y
# CONFIG_TYPHOON is not set
-
-#
-# Tulip family network device support
-#
CONFIG_NET_TULIP=y
# CONFIG_DE2104X is not set
CONFIG_TULIP=y
@@ -773,7 +721,8 @@ CONFIG_TULIP=y
# CONFIG_HP100 is not set
CONFIG_NET_PCI=y
# CONFIG_PCNET32 is not set
-# CONFIG_AMD8111_ETH is not set
+CONFIG_AMD8111_ETH=y
+# CONFIG_AMD8111E_NAPI is not set
# CONFIG_ADAPTEC_STARFIRE is not set
CONFIG_B44=y
CONFIG_FORCEDETH=y
@@ -808,7 +757,6 @@ CONFIG_E1000=y
# CONFIG_SIS190 is not set
# CONFIG_SKGE is not set
# CONFIG_SKY2 is not set
-# CONFIG_SK98LIN is not set
# CONFIG_VIA_VELOCITY is not set
CONFIG_TIGON3=y
CONFIG_BNX2=y
@@ -823,10 +771,6 @@ CONFIG_S2IO=m
# CONFIG_MYRI10GE is not set
# CONFIG_NETXEN_NIC is not set
# CONFIG_MLX4_CORE is not set
-
-#
-# Token Ring devices
-#
# CONFIG_TR is not set
#
@@ -855,15 +799,7 @@ CONFIG_NETCONSOLE=y
CONFIG_NETPOLL=y
# CONFIG_NETPOLL_TRAP is not set
CONFIG_NET_POLL_CONTROLLER=y
-
-#
-# ISDN subsystem
-#
# CONFIG_ISDN is not set
-
-#
-# Telephony Support
-#
# CONFIG_PHONE is not set
#
@@ -871,6 +807,7 @@ CONFIG_NET_POLL_CONTROLLER=y
#
CONFIG_INPUT=y
# CONFIG_INPUT_FF_MEMLESS is not set
+# CONFIG_INPUT_POLLDEV is not set
#
# Userland interfaces
@@ -936,6 +873,7 @@ CONFIG_HW_CONSOLE=y
#
CONFIG_SERIAL_8250=y
CONFIG_SERIAL_8250_CONSOLE=y
+CONFIG_FIX_EARLYCON_MEM=y
CONFIG_SERIAL_8250_PCI=y
CONFIG_SERIAL_8250_PNP=y
CONFIG_SERIAL_8250_NR_UARTS=4
@@ -951,16 +889,11 @@ CONFIG_SERIAL_CORE_CONSOLE=y
CONFIG_UNIX98_PTYS=y
CONFIG_LEGACY_PTYS=y
CONFIG_LEGACY_PTY_COUNT=256
-
-#
-# IPMI
-#
# CONFIG_IPMI_HANDLER is not set
# CONFIG_WATCHDOG is not set
CONFIG_HW_RANDOM=y
CONFIG_HW_RANDOM_INTEL=y
CONFIG_HW_RANDOM_AMD=y
-# CONFIG_HW_RANDOM_GEODE is not set
# CONFIG_NVRAM is not set
CONFIG_RTC=y
# CONFIG_R3964 is not set
@@ -979,127 +912,19 @@ CONFIG_HPET=y
# CONFIG_HPET_RTC_IRQ is not set
CONFIG_HPET_MMAP=y
# CONFIG_HANGCHECK_TIMER is not set
-
-#
-# TPM devices
-#
# CONFIG_TCG_TPM is not set
# CONFIG_TELCLOCK is not set
CONFIG_DEVPORT=y
-CONFIG_I2C=m
-CONFIG_I2C_BOARDINFO=y
-CONFIG_I2C_CHARDEV=m
-
-#
-# I2C Algorithms
-#
-# CONFIG_I2C_ALGOBIT is not set
-# CONFIG_I2C_ALGOPCF is not set
-# CONFIG_I2C_ALGOPCA is not set
-
-#
-# I2C Hardware Bus support
-#
-# CONFIG_I2C_ALI1535 is not set
-# CONFIG_I2C_ALI1563 is not set
-# CONFIG_I2C_ALI15X3 is not set
-# CONFIG_I2C_AMD756 is not set
-# CONFIG_I2C_AMD8111 is not set
-# CONFIG_I2C_I801 is not set
-# CONFIG_I2C_I810 is not set
-# CONFIG_I2C_PIIX4 is not set
-# CONFIG_I2C_NFORCE2 is not set
-# CONFIG_I2C_OCORES is not set
-# CONFIG_I2C_PARPORT_LIGHT is not set
-# CONFIG_I2C_PROSAVAGE is not set
-# CONFIG_I2C_SAVAGE4 is not set
-# CONFIG_I2C_SIMTEC is not set
-# CONFIG_I2C_SIS5595 is not set
-# CONFIG_I2C_SIS630 is not set
-# CONFIG_I2C_SIS96X is not set
-# CONFIG_I2C_STUB is not set
-# CONFIG_I2C_TINY_USB is not set
-# CONFIG_I2C_VIA is not set
-# CONFIG_I2C_VIAPRO is not set
-# CONFIG_I2C_VOODOO3 is not set
-
-#
-# Miscellaneous I2C Chip support
-#
-# CONFIG_SENSORS_DS1337 is not set
-# CONFIG_SENSORS_DS1374 is not set
-# CONFIG_SENSORS_EEPROM is not set
-# CONFIG_SENSORS_PCF8574 is not set
-# CONFIG_SENSORS_PCA9539 is not set
-# CONFIG_SENSORS_PCF8591 is not set
-# CONFIG_SENSORS_MAX6875 is not set
-# CONFIG_I2C_DEBUG_CORE is not set
-# CONFIG_I2C_DEBUG_ALGO is not set
-# CONFIG_I2C_DEBUG_BUS is not set
-# CONFIG_I2C_DEBUG_CHIP is not set
+# CONFIG_I2C is not set
#
# SPI support
#
# CONFIG_SPI is not set
# CONFIG_SPI_MASTER is not set
-
-#
-# Dallas's 1-wire bus
-#
# CONFIG_W1 is not set
-CONFIG_HWMON=y
-# CONFIG_HWMON_VID is not set
-# CONFIG_SENSORS_ABITUGURU is not set
-# CONFIG_SENSORS_AD7418 is not set
-# CONFIG_SENSORS_ADM1021 is not set
-# CONFIG_SENSORS_ADM1025 is not set
-# CONFIG_SENSORS_ADM1026 is not set
-# CONFIG_SENSORS_ADM1029 is not set
-# CONFIG_SENSORS_ADM1031 is not set
-# CONFIG_SENSORS_ADM9240 is not set
-# CONFIG_SENSORS_K8TEMP is not set
-# CONFIG_SENSORS_ASB100 is not set
-# CONFIG_SENSORS_ATXP1 is not set
-# CONFIG_SENSORS_DS1621 is not set
-# CONFIG_SENSORS_F71805F is not set
-# CONFIG_SENSORS_FSCHER is not set
-# CONFIG_SENSORS_FSCPOS is not set
-# CONFIG_SENSORS_GL518SM is not set
-# CONFIG_SENSORS_GL520SM is not set
-CONFIG_SENSORS_CORETEMP=y
-# CONFIG_SENSORS_IT87 is not set
-# CONFIG_SENSORS_LM63 is not set
-# CONFIG_SENSORS_LM75 is not set
-# CONFIG_SENSORS_LM77 is not set
-# CONFIG_SENSORS_LM78 is not set
-# CONFIG_SENSORS_LM80 is not set
-# CONFIG_SENSORS_LM83 is not set
-# CONFIG_SENSORS_LM85 is not set
-# CONFIG_SENSORS_LM87 is not set
-# CONFIG_SENSORS_LM90 is not set
-# CONFIG_SENSORS_LM92 is not set
-# CONFIG_SENSORS_MAX1619 is not set
-# CONFIG_SENSORS_MAX6650 is not set
-# CONFIG_SENSORS_PC87360 is not set
-# CONFIG_SENSORS_PC87427 is not set
-# CONFIG_SENSORS_SIS5595 is not set
-# CONFIG_SENSORS_SMSC47M1 is not set
-# CONFIG_SENSORS_SMSC47M192 is not set
-CONFIG_SENSORS_SMSC47B397=m
-# CONFIG_SENSORS_VIA686A is not set
-# CONFIG_SENSORS_VT1211 is not set
-# CONFIG_SENSORS_VT8231 is not set
-# CONFIG_SENSORS_W83781D is not set
-# CONFIG_SENSORS_W83791D is not set
-# CONFIG_SENSORS_W83792D is not set
-# CONFIG_SENSORS_W83793 is not set
-# CONFIG_SENSORS_W83L785TS is not set
-# CONFIG_SENSORS_W83627HF is not set
-# CONFIG_SENSORS_W83627EHF is not set
-# CONFIG_SENSORS_HDAPS is not set
-# CONFIG_SENSORS_APPLESMC is not set
-# CONFIG_HWMON_DEBUG_CHIP is not set
+# CONFIG_POWER_SUPPLY is not set
+# CONFIG_HWMON is not set
#
# Multifunction device drivers
@@ -1149,15 +974,11 @@ CONFIG_SOUND=y
# Open Sound System
#
CONFIG_SOUND_PRIME=y
-# CONFIG_OSS_OBSOLETE is not set
# CONFIG_SOUND_TRIDENT is not set
# CONFIG_SOUND_MSNDCLAS is not set
# CONFIG_SOUND_MSNDPIN is not set
# CONFIG_SOUND_OSS is not set
-
-#
-# HID Devices
-#
+CONFIG_HID_SUPPORT=y
CONFIG_HID=y
# CONFIG_HID_DEBUG is not set
@@ -1168,10 +989,7 @@ CONFIG_USB_HID=y
# CONFIG_USB_HIDINPUT_POWERBOOK is not set
# CONFIG_HID_FF is not set
# CONFIG_USB_HIDDEV is not set
-
-#
-# USB support
-#
+CONFIG_USB_SUPPORT=y
CONFIG_USB_ARCH_HAS_HCD=y
CONFIG_USB_ARCH_HAS_OHCI=y
CONFIG_USB_ARCH_HAS_EHCI=y
@@ -1185,6 +1003,7 @@ CONFIG_USB_DEVICEFS=y
# CONFIG_USB_DEVICE_CLASS is not set
# CONFIG_USB_DYNAMIC_MINORS is not set
# CONFIG_USB_SUSPEND is not set
+# CONFIG_USB_PERSIST is not set
# CONFIG_USB_OTG is not set
#
@@ -1194,7 +1013,6 @@ CONFIG_USB_EHCI_HCD=y
# CONFIG_USB_EHCI_SPLIT_ISO is not set
# CONFIG_USB_EHCI_ROOT_HUB_TT is not set
# CONFIG_USB_EHCI_TT_NEWSCHED is not set
-# CONFIG_USB_EHCI_BIG_ENDIAN_MMIO is not set
# CONFIG_USB_ISP116X_HCD is not set
CONFIG_USB_OHCI_HCD=y
# CONFIG_USB_OHCI_BIG_ENDIAN_DESC is not set
@@ -1202,6 +1020,7 @@ CONFIG_USB_OHCI_HCD=y
CONFIG_USB_OHCI_LITTLE_ENDIAN=y
CONFIG_USB_UHCI_HCD=y
# CONFIG_USB_SL811_HCD is not set
+# CONFIG_USB_R8A66597_HCD is not set
#
# USB Device Class drivers
@@ -1292,15 +1111,7 @@ CONFIG_USB_MON=y
#
# LED Triggers
#
-
-#
-# InfiniBand support
-#
# CONFIG_INFINIBAND is not set
-
-#
-# EDAC - error detection and reporting (RAS) (EXPERIMENTAL)
-#
# CONFIG_EDAC is not set
#
@@ -1320,11 +1131,13 @@ CONFIG_USB_MON=y
#
# DMA Devices
#
+CONFIG_VIRTUALIZATION=y
+# CONFIG_KVM is not set
#
-# Virtualization
+# Userspace I/O
#
-# CONFIG_KVM is not set
+# CONFIG_UIO is not set
#
# Firmware Drivers
@@ -1332,6 +1145,7 @@ CONFIG_USB_MON=y
# CONFIG_EDD is not set
# CONFIG_DELL_RBU is not set
# CONFIG_DCDBAS is not set
+CONFIG_DMIID=y
#
# File systems
@@ -1447,7 +1261,6 @@ CONFIG_SUNRPC=y
# CONFIG_NCP_FS is not set
# CONFIG_CODA_FS is not set
# CONFIG_AFS_FS is not set
-# CONFIG_9P_FS is not set
#
# Partition Types
@@ -1524,8 +1337,9 @@ CONFIG_DEBUG_FS=y
CONFIG_DEBUG_KERNEL=y
# CONFIG_DEBUG_SHIRQ is not set
CONFIG_DETECT_SOFTLOCKUP=y
+# CONFIG_SCHED_DEBUG is not set
# CONFIG_SCHEDSTATS is not set
-# CONFIG_TIMER_STATS is not set
+CONFIG_TIMER_STATS=y
# CONFIG_DEBUG_SLAB is not set
# CONFIG_DEBUG_RT_MUTEXES is not set
# CONFIG_RT_MUTEX_TESTER is not set
@@ -1533,6 +1347,7 @@ CONFIG_DETECT_SOFTLOCKUP=y
# CONFIG_DEBUG_MUTEXES is not set
# CONFIG_DEBUG_LOCK_ALLOC is not set
# CONFIG_PROVE_LOCKING is not set
+# CONFIG_LOCK_STAT is not set
# CONFIG_DEBUG_SPINLOCK_SLEEP is not set
# CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set
# CONFIG_DEBUG_KOBJECT is not set
@@ -1541,8 +1356,6 @@ CONFIG_DEBUG_BUGVERBOSE=y
# CONFIG_DEBUG_VM is not set
# CONFIG_DEBUG_LIST is not set
# CONFIG_FRAME_POINTER is not set
-CONFIG_UNWIND_INFO=y
-CONFIG_STACK_UNWIND=y
# CONFIG_FORCED_INLINING is not set
# CONFIG_RCU_TORTURE_TEST is not set
# CONFIG_LKDTM is not set
@@ -1557,10 +1370,6 @@ CONFIG_DEBUG_STACKOVERFLOW=y
#
# CONFIG_KEYS is not set
# CONFIG_SECURITY is not set
-
-#
-# Cryptographic options
-#
# CONFIG_CRYPTO is not set
#
@@ -1571,6 +1380,7 @@ CONFIG_BITREVERSE=y
# CONFIG_CRC16 is not set
# CONFIG_CRC_ITU_T is not set
CONFIG_CRC32=y
+# CONFIG_CRC7 is not set
# CONFIG_LIBCRC32C is not set
CONFIG_ZLIB_INFLATE=y
CONFIG_PLIST=y
diff --git a/arch/x86_64/ia32/ia32_aout.c b/arch/x86_64/ia32/ia32_aout.c
index fe83edb93c1..08781370256 100644
--- a/arch/x86_64/ia32/ia32_aout.c
+++ b/arch/x86_64/ia32/ia32_aout.c
@@ -404,7 +404,7 @@ beyond_if:
set_brk(current->mm->start_brk, current->mm->brk);
- retval = ia32_setup_arg_pages(bprm, IA32_STACK_TOP, EXSTACK_DEFAULT);
+ retval = setup_arg_pages(bprm, IA32_STACK_TOP, EXSTACK_DEFAULT);
if (retval < 0) {
/* Someone check-me: is this error path enough? */
send_sig(SIGKILL, current, 0);
diff --git a/arch/x86_64/ia32/ia32_binfmt.c b/arch/x86_64/ia32/ia32_binfmt.c
index 185399baaf6..b70f3e7cf06 100644
--- a/arch/x86_64/ia32/ia32_binfmt.c
+++ b/arch/x86_64/ia32/ia32_binfmt.c
@@ -38,6 +38,7 @@
int sysctl_vsyscall32 = 1;
+#undef ARCH_DLINFO
#define ARCH_DLINFO do { \
if (sysctl_vsyscall32) { \
NEW_AUX_ENT(AT_SYSINFO, (u32)(u64)VSYSCALL32_VSYSCALL); \
@@ -232,9 +233,6 @@ do { \
#define load_elf_binary load_elf32_binary
#define ELF_PLAT_INIT(r, load_addr) elf32_init(r)
-#define setup_arg_pages(bprm, stack_top, exec_stack) \
- ia32_setup_arg_pages(bprm, stack_top, exec_stack)
-int ia32_setup_arg_pages(struct linux_binprm *bprm, unsigned long stack_top, int executable_stack);
#undef start_thread
#define start_thread(regs,new_rip,new_rsp) do { \
@@ -286,61 +284,6 @@ static void elf32_init(struct pt_regs *regs)
me->thread.es = __USER_DS;
}
-int ia32_setup_arg_pages(struct linux_binprm *bprm, unsigned long stack_top,
- int executable_stack)
-{
- unsigned long stack_base;
- struct vm_area_struct *mpnt;
- struct mm_struct *mm = current->mm;
- int i, ret;
-
- stack_base = stack_top - MAX_ARG_PAGES * PAGE_SIZE;
- mm->arg_start = bprm->p + stack_base;
-
- bprm->p += stack_base;
- if (bprm->loader)
- bprm->loader += stack_base;
- bprm->exec += stack_base;
-
- mpnt = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
- if (!mpnt)
- return -ENOMEM;
-
- down_write(&mm->mmap_sem);
- {
- mpnt->vm_mm = mm;
- mpnt->vm_start = PAGE_MASK & (unsigned long) bprm->p;
- mpnt->vm_end = stack_top;
- if (executable_stack == EXSTACK_ENABLE_X)
- mpnt->vm_flags = VM_STACK_FLAGS | VM_EXEC;
- else if (executable_stack == EXSTACK_DISABLE_X)
- mpnt->vm_flags = VM_STACK_FLAGS & ~VM_EXEC;
- else
- mpnt->vm_flags = VM_STACK_FLAGS;
- mpnt->vm_page_prot = (mpnt->vm_flags & VM_EXEC) ?
- PAGE_COPY_EXEC : PAGE_COPY;
- if ((ret = insert_vm_struct(mm, mpnt))) {
- up_write(&mm->mmap_sem);
- kmem_cache_free(vm_area_cachep, mpnt);
- return ret;
- }
- mm->stack_vm = mm->total_vm = vma_pages(mpnt);
- }
-
- for (i = 0 ; i < MAX_ARG_PAGES ; i++) {
- struct page *page = bprm->page[i];
- if (page) {
- bprm->page[i] = NULL;
- install_arg_page(mpnt, page, stack_base);
- }
- stack_base += PAGE_SIZE;
- }
- up_write(&mm->mmap_sem);
-
- return 0;
-}
-EXPORT_SYMBOL(ia32_setup_arg_pages);
-
#ifdef CONFIG_SYSCTL
/* Register vsyscall32 into the ABI table */
#include <linux/sysctl.h>
diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S
index 47565c3345d..938278697e2 100644
--- a/arch/x86_64/ia32/ia32entry.S
+++ b/arch/x86_64/ia32/ia32entry.S
@@ -104,7 +104,7 @@ ENTRY(ia32_sysenter_target)
pushq %rax
CFI_ADJUST_CFA_OFFSET 8
cld
- SAVE_ARGS 0,0,0
+ SAVE_ARGS 0,0,1
/* no need to do an access_ok check here because rbp has been
32bit zero extended */
1: movl (%rbp),%r9d
@@ -294,7 +294,7 @@ ia32_badarg:
*/
ENTRY(ia32_syscall)
- CFI_STARTPROC simple
+ CFI_STARTPROC32 simple
CFI_SIGNAL_FRAME
CFI_DEF_CFA rsp,SS+8-RIP
/*CFI_REL_OFFSET ss,SS-RIP*/
@@ -330,6 +330,7 @@ ia32_sysret:
ia32_tracesys:
SAVE_REST
+ CLEAR_RREGS
movq $-ENOSYS,RAX(%rsp) /* really needed? */
movq %rsp,%rdi /* &pt_regs -> arg1 */
call syscall_trace_enter
@@ -526,7 +527,7 @@ ia32_sys_call_table:
.quad sys_init_module
.quad sys_delete_module
.quad quiet_ni_syscall /* 130 get_kernel_syms */
- .quad sys_quotactl
+ .quad sys32_quotactl
.quad sys_getpgid
.quad sys_fchdir
.quad quiet_ni_syscall /* bdflush */
@@ -719,4 +720,5 @@ ia32_sys_call_table:
.quad compat_sys_signalfd
.quad compat_sys_timerfd
.quad sys_eventfd
+ .quad sys32_fallocate
ia32_syscall_end:
diff --git a/arch/x86_64/ia32/sys_ia32.c b/arch/x86_64/ia32/sys_ia32.c
index 99a78a3cce7..bee96d61443 100644
--- a/arch/x86_64/ia32/sys_ia32.c
+++ b/arch/x86_64/ia32/sys_ia32.c
@@ -879,3 +879,11 @@ asmlinkage long sys32_fadvise64(int fd, unsigned offset_lo, unsigned offset_hi,
return sys_fadvise64_64(fd, ((u64)offset_hi << 32) | offset_lo,
len, advice);
}
+
+asmlinkage long sys32_fallocate(int fd, int mode, unsigned offset_lo,
+ unsigned offset_hi, unsigned len_lo,
+ unsigned len_hi)
+{
+ return sys_fallocate(fd, mode, ((u64)offset_hi << 32) | offset_lo,
+ ((u64)len_hi << 32) | len_lo);
+}
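
The compat wrapper above receives the 64-bit offset and length split into 32-bit halves (the ia32 call convention only carries 32-bit arguments) and reassembles each with a shift-and-or before calling sys_fallocate(). A minimal user-space sketch of that reassembly, using made-up sample halves instead of a real syscall:

    #include <stdio.h>
    #include <stdint.h>

    /* Rebuild a 64-bit value from its low and high 32-bit halves,
     * the same way sys32_fallocate combines offset_lo/offset_hi. */
    static uint64_t join32(uint32_t lo, uint32_t hi)
    {
            return ((uint64_t)hi << 32) | lo;
    }

    int main(void)
    {
            /* hypothetical sample halves, for illustration only */
            uint32_t offset_lo = 0x00001000, offset_hi = 0x00000002;

            printf("offset = 0x%llx\n",
                   (unsigned long long)join32(offset_lo, offset_hi));
            return 0;
    }
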
diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile
index de1de8a2fd8..47f1dc30bf5 100644
--- a/arch/x86_64/kernel/Makefile
+++ b/arch/x86_64/kernel/Makefile
@@ -44,6 +44,7 @@ obj-$(CONFIG_PCI) += early-quirks.o
obj-y += topology.o
obj-y += intel_cacheinfo.o
+obj-y += addon_cpuid_features.o
obj-y += pcspeaker.o
CFLAGS_vsyscall.o := $(PROFILING) -g0
@@ -55,6 +56,7 @@ cpuid-$(subst m,y,$(CONFIG_X86_CPUID)) += ../../i386/kernel/cpuid.o
topology-y += ../../i386/kernel/topology.o
microcode-$(subst m,y,$(CONFIG_MICROCODE)) += ../../i386/kernel/microcode.o
intel_cacheinfo-y += ../../i386/kernel/cpu/intel_cacheinfo.o
+addon_cpuid_features-y += ../../i386/kernel/cpu/addon_cpuid_features.o
quirks-y += ../../i386/kernel/quirks.o
i8237-y += ../../i386/kernel/i8237.o
msr-$(subst m,y,$(CONFIG_X86_MSR)) += ../../i386/kernel/msr.o
diff --git a/arch/x86_64/kernel/acpi/sleep.c b/arch/x86_64/kernel/acpi/sleep.c
index 195b7034a14..4277f2b27e6 100644
--- a/arch/x86_64/kernel/acpi/sleep.c
+++ b/arch/x86_64/kernel/acpi/sleep.c
@@ -55,7 +55,7 @@
/* address in low memory of the wakeup routine. */
unsigned long acpi_wakeup_address = 0;
-unsigned long acpi_video_flags;
+unsigned long acpi_realmode_flags;
extern char wakeup_start, wakeup_end;
extern unsigned long acpi_copy_wakeup_routine(unsigned long);
@@ -103,9 +103,11 @@ static int __init acpi_sleep_setup(char *str)
{
while ((str != NULL) && (*str != '\0')) {
if (strncmp(str, "s3_bios", 7) == 0)
- acpi_video_flags = 1;
+ acpi_realmode_flags |= 1;
if (strncmp(str, "s3_mode", 7) == 0)
- acpi_video_flags |= 2;
+ acpi_realmode_flags |= 2;
+ if (strncmp(str, "s3_beep", 7) == 0)
+ acpi_realmode_flags |= 4;
str = strchr(str, ',');
if (str != NULL)
str += strspn(str, ", \t");
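
acpi_sleep_setup() turns the comma-separated acpi_sleep= options into a bitmask: bit 0 for s3_bios, bit 1 for s3_mode, and the new bit 2 for s3_beep, which wakeup.S tests below. A rough user-space sketch of the same parsing loop (the option names and bit values come from the hunk above; the sample string is invented):

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
            unsigned long flags = 0;
            char buf[] = "s3_bios,s3_beep";   /* example command-line value */
            char *str = buf;

            while (str && *str) {
                    if (strncmp(str, "s3_bios", 7) == 0)
                            flags |= 1;
                    if (strncmp(str, "s3_mode", 7) == 0)
                            flags |= 2;
                    if (strncmp(str, "s3_beep", 7) == 0)
                            flags |= 4;
                    str = strchr(str, ',');
                    if (str)
                            str += strspn(str, ", \t");
            }
            printf("acpi_realmode_flags = %lu\n", flags);   /* prints 5 */
            return 0;
    }
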
diff --git a/arch/x86_64/kernel/acpi/wakeup.S b/arch/x86_64/kernel/acpi/wakeup.S
index 8550a6ffa27..13f1480cbec 100644
--- a/arch/x86_64/kernel/acpi/wakeup.S
+++ b/arch/x86_64/kernel/acpi/wakeup.S
@@ -16,6 +16,21 @@
# cs = 0x1234, eip = 0x05
#
+#define BEEP \
+ inb $97, %al; \
+ outb %al, $0x80; \
+ movb $3, %al; \
+ outb %al, $97; \
+ outb %al, $0x80; \
+ movb $-74, %al; \
+ outb %al, $67; \
+ outb %al, $0x80; \
+ movb $-119, %al; \
+ outb %al, $66; \
+ outb %al, $0x80; \
+ movb $15, %al; \
+ outb %al, $66;
+
ALIGN
.align 16
@@ -33,6 +48,13 @@ wakeup_code:
movw %cs, %ax
movw %ax, %ds # Make ds:0 point to wakeup_start
movw %ax, %ss
+
+ # Data segment must be set up before we can see whether to beep.
+ testl $4, realmode_flags - wakeup_code
+ jz 1f
+ BEEP
+1:
+
# Private stack is needed for ASUS board
mov $(wakeup_stack - wakeup_code), %sp
@@ -48,7 +70,7 @@ wakeup_code:
testl %eax, %eax
jnz no_longmode
- testl $1, video_flags - wakeup_code
+ testl $1, realmode_flags - wakeup_code
jz 1f
lcall $0xc000,$3
movw %cs, %ax
@@ -56,7 +78,7 @@ wakeup_code:
movw %ax, %ss
1:
- testl $2, video_flags - wakeup_code
+ testl $2, realmode_flags - wakeup_code
jz 1f
mov video_mode - wakeup_code, %ax
call mode_seta
@@ -230,7 +252,7 @@ gdt_48a:
real_magic: .quad 0
video_mode: .quad 0
-video_flags: .quad 0
+realmode_flags: .quad 0
.code16
bogus_real_magic:
@@ -346,8 +368,8 @@ ENTRY(acpi_copy_wakeup_routine)
movl saved_video_mode, %edx
movl %edx, video_mode - wakeup_start (,%rdi)
- movl acpi_video_flags, %edx
- movl %edx, video_flags - wakeup_start (,%rdi)
+ movl acpi_realmode_flags, %edx
+ movl %edx, realmode_flags - wakeup_start (,%rdi)
movq $0x12345678, real_magic - wakeup_start (,%rdi)
movq $0x123456789abcdef0, %rdx
movq %rdx, saved_magic
diff --git a/arch/x86_64/kernel/aperture.c b/arch/x86_64/kernel/aperture.c
index a3d450d6c15..8f681cae7bf 100644
--- a/arch/x86_64/kernel/aperture.c
+++ b/arch/x86_64/kernel/aperture.c
@@ -20,7 +20,7 @@
#include <linux/ioport.h>
#include <asm/e820.h>
#include <asm/io.h>
-#include <asm/proto.h>
+#include <asm/iommu.h>
#include <asm/pci-direct.h>
#include <asm/dma.h>
#include <asm/k8.h>
@@ -214,7 +214,7 @@ void __init iommu_hole_init(void)
if (iommu_aperture_disabled || !fix_aperture || !early_pci_allowed())
return;
- printk("Checking aperture...\n");
+ printk(KERN_INFO "Checking aperture...\n");
fix = 0;
for (num = 24; num < 32; num++) {
diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c
index 1b0e07bb872..900ff38d68d 100644
--- a/arch/x86_64/kernel/apic.c
+++ b/arch/x86_64/kernel/apic.c
@@ -92,8 +92,9 @@ unsigned int safe_apic_wait_icr_idle(void)
void enable_NMI_through_LVT0 (void * dummy)
{
unsigned int v;
-
- v = APIC_DM_NMI; /* unmask and set to NMI */
+
+ /* unmask and set to NMI */
+ v = APIC_DM_NMI;
apic_write(APIC_LVT0, v);
}
@@ -120,7 +121,7 @@ void ack_bad_irq(unsigned int irq)
* holds up an irq slot - in excessive cases (when multiple
* unexpected vectors occur) that might lock up the APIC
* completely.
- * But don't ack when the APIC is disabled. -AK
+ * But don't ack when the APIC is disabled. -AK
*/
if (!disable_apic)
ack_APIC_irq();
@@ -616,7 +617,7 @@ early_param("apic", apic_set_verbosity);
* Detect and enable local APICs on non-SMP boards.
* Original code written by Keir Fraser.
* On AMD64 we trust the BIOS - if it says no APIC it is likely
- * not correctly set up (usually the APIC timer won't work etc.)
+ * not correctly set up (usually the APIC timer won't work etc.)
*/
static int __init detect_init_APIC (void)
@@ -789,13 +790,13 @@ static void setup_APIC_timer(unsigned int clocks)
local_irq_save(flags);
/* wait for irq slice */
- if (hpet_address && hpet_use_timer) {
- int trigger = hpet_readl(HPET_T0_CMP);
- while (hpet_readl(HPET_COUNTER) >= trigger)
- /* do nothing */ ;
- while (hpet_readl(HPET_COUNTER) < trigger)
- /* do nothing */ ;
- } else {
+ if (hpet_address && hpet_use_timer) {
+ int trigger = hpet_readl(HPET_T0_CMP);
+ while (hpet_readl(HPET_COUNTER) >= trigger)
+ /* do nothing */ ;
+ while (hpet_readl(HPET_COUNTER) < trigger)
+ /* do nothing */ ;
+ } else {
int c1, c2;
outb_p(0x00, 0x43);
c2 = inb_p(0x40);
@@ -881,10 +882,10 @@ static unsigned int calibration_result;
void __init setup_boot_APIC_clock (void)
{
- if (disable_apic_timer) {
- printk(KERN_INFO "Disabling APIC timer\n");
- return;
- }
+ if (disable_apic_timer) {
+ printk(KERN_INFO "Disabling APIC timer\n");
+ return;
+ }
printk(KERN_INFO "Using local APIC timer interrupts.\n");
using_apic_timer = 1;
@@ -990,8 +991,8 @@ int setup_profiling_timer(unsigned int multiplier)
return -EINVAL;
}
-void setup_APIC_extened_lvt(unsigned char lvt_off, unsigned char vector,
- unsigned char msg_type, unsigned char mask)
+void setup_APIC_extended_lvt(unsigned char lvt_off, unsigned char vector,
+ unsigned char msg_type, unsigned char mask)
{
unsigned long reg = (lvt_off << 4) + K8_APIC_EXT_LVT_BASE;
unsigned int v = (mask << 16) | (msg_type << 8) | vector;
@@ -1128,20 +1129,6 @@ asmlinkage void smp_spurious_interrupt(void)
if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f)))
ack_APIC_irq();
-#if 0
- static unsigned long last_warning;
- static unsigned long skipped;
-
- /* see sw-dev-man vol 3, chapter 7.4.13.5 */
- if (time_before(last_warning+30*HZ,jiffies)) {
- printk(KERN_INFO "spurious APIC interrupt on CPU#%d, %ld skipped.\n",
- smp_processor_id(), skipped);
- last_warning = jiffies;
- skipped = 0;
- } else {
- skipped++;
- }
-#endif
irq_exit();
}
@@ -1173,11 +1160,11 @@ asmlinkage void smp_error_interrupt(void)
7: Illegal register address
*/
printk (KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n",
- smp_processor_id(), v , v1);
+ smp_processor_id(), v , v1);
irq_exit();
}
-int disable_apic;
+int disable_apic;
/*
* This initializes the IO-APIC and APIC hardware if this is
@@ -1185,11 +1172,11 @@ int disable_apic;
*/
int __init APIC_init_uniprocessor (void)
{
- if (disable_apic) {
+ if (disable_apic) {
printk(KERN_INFO "Apic disabled\n");
- return -1;
+ return -1;
}
- if (!cpu_has_apic) {
+ if (!cpu_has_apic) {
disable_apic = 1;
printk(KERN_INFO "Apic disabled by BIOS\n");
return -1;
@@ -1211,8 +1198,8 @@ int __init APIC_init_uniprocessor (void)
return 0;
}
-static __init int setup_disableapic(char *str)
-{
+static __init int setup_disableapic(char *str)
+{
disable_apic = 1;
clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
return 0;
@@ -1220,10 +1207,10 @@ static __init int setup_disableapic(char *str)
early_param("disableapic", setup_disableapic);
/* same as disableapic, for compatibility */
-static __init int setup_nolapic(char *str)
-{
+static __init int setup_nolapic(char *str)
+{
return setup_disableapic(str);
-}
+}
early_param("nolapic", setup_nolapic);
static int __init parse_lapic_timer_c2_ok(char *arg)
@@ -1233,13 +1220,13 @@ static int __init parse_lapic_timer_c2_ok(char *arg)
}
early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok);
-static __init int setup_noapictimer(char *str)
-{
+static __init int setup_noapictimer(char *str)
+{
if (str[0] != ' ' && str[0] != 0)
return 0;
disable_apic_timer = 1;
return 1;
-}
+}
static __init int setup_apicmaintimer(char *str)
{
@@ -1264,5 +1251,5 @@ static __init int setup_apicpmtimer(char *s)
}
__setup("apicpmtimer", setup_apicpmtimer);
-__setup("noapictimer", setup_noapictimer);
+__setup("noapictimer", setup_noapictimer);
diff --git a/arch/x86_64/kernel/cpufreq/Kconfig b/arch/x86_64/kernel/cpufreq/Kconfig
index c0749d2479f..a3fd51926cb 100644
--- a/arch/x86_64/kernel/cpufreq/Kconfig
+++ b/arch/x86_64/kernel/cpufreq/Kconfig
@@ -48,10 +48,6 @@ config X86_SPEEDSTEP_CENTRINO
If in doubt, say N.
-config X86_SPEEDSTEP_CENTRINO_ACPI
- bool
- depends on X86_SPEEDSTEP_CENTRINO
-
config X86_ACPI_CPUFREQ
tristate "ACPI Processor P-States driver"
select CPU_FREQ_TABLE
@@ -73,7 +69,7 @@ comment "shared options"
config X86_ACPI_CPUFREQ_PROC_INTF
bool "/proc/acpi/processor/../performance interface (deprecated)"
depends on PROC_FS
- depends on X86_ACPI_CPUFREQ || X86_SPEEDSTEP_CENTRINO_ACPI || X86_POWERNOW_K8_ACPI
+ depends on X86_ACPI_CPUFREQ || X86_POWERNOW_K8_ACPI
help
This enables the deprecated /proc/acpi/processor/../performance
interface. While it is helpful for debugging, the generic,
diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c
index 13c6c37610e..0f4d5e209e9 100644
--- a/arch/x86_64/kernel/e820.c
+++ b/arch/x86_64/kernel/e820.c
@@ -194,37 +194,6 @@ unsigned long __init e820_end_of_ram(void)
}
/*
- * Find the hole size in the range.
- */
-unsigned long __init e820_hole_size(unsigned long start, unsigned long end)
-{
- unsigned long ram = 0;
- int i;
-
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
- unsigned long last, addr;
-
- if (ei->type != E820_RAM ||
- ei->addr+ei->size <= start ||
- ei->addr >= end)
- continue;
-
- addr = round_up(ei->addr, PAGE_SIZE);
- if (addr < start)
- addr = start;
-
- last = round_down(ei->addr + ei->size, PAGE_SIZE);
- if (last >= end)
- last = end;
-
- if (last > addr)
- ram += last - addr;
- }
- return ((end - start) - ram);
-}
-
-/*
* Mark e820 reserved areas as busy for the resource manager.
*/
void __init e820_reserve_resources(void)
@@ -289,47 +258,61 @@ void __init e820_mark_nosave_regions(void)
}
}
-/* Walk the e820 map and register active regions within a node */
-void __init
-e820_register_active_regions(int nid, unsigned long start_pfn,
- unsigned long end_pfn)
+/*
+ * Finds an active region in the address range from start_pfn to end_pfn and
+ * returns its range in ei_startpfn and ei_endpfn for the e820 entry.
+ */
+static int __init e820_find_active_region(const struct e820entry *ei,
+ unsigned long start_pfn,
+ unsigned long end_pfn,
+ unsigned long *ei_startpfn,
+ unsigned long *ei_endpfn)
{
- int i;
- unsigned long ei_startpfn, ei_endpfn;
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
- ei_startpfn = round_up(ei->addr, PAGE_SIZE) >> PAGE_SHIFT;
- ei_endpfn = round_down(ei->addr + ei->size, PAGE_SIZE)
- >> PAGE_SHIFT;
+ *ei_startpfn = round_up(ei->addr, PAGE_SIZE) >> PAGE_SHIFT;
+ *ei_endpfn = round_down(ei->addr + ei->size, PAGE_SIZE) >> PAGE_SHIFT;
- /* Skip map entries smaller than a page */
- if (ei_startpfn >= ei_endpfn)
- continue;
+ /* Skip map entries smaller than a page */
+ if (*ei_startpfn >= *ei_endpfn)
+ return 0;
- /* Check if end_pfn_map should be updated */
- if (ei->type != E820_RAM && ei_endpfn > end_pfn_map)
- end_pfn_map = ei_endpfn;
+ /* Check if end_pfn_map should be updated */
+ if (ei->type != E820_RAM && *ei_endpfn > end_pfn_map)
+ end_pfn_map = *ei_endpfn;
- /* Skip if map is outside the node */
- if (ei->type != E820_RAM ||
- ei_endpfn <= start_pfn ||
- ei_startpfn >= end_pfn)
- continue;
+ /* Skip if map is outside the node */
+ if (ei->type != E820_RAM || *ei_endpfn <= start_pfn ||
+ *ei_startpfn >= end_pfn)
+ return 0;
- /* Check for overlaps */
- if (ei_startpfn < start_pfn)
- ei_startpfn = start_pfn;
- if (ei_endpfn > end_pfn)
- ei_endpfn = end_pfn;
+ /* Check for overlaps */
+ if (*ei_startpfn < start_pfn)
+ *ei_startpfn = start_pfn;
+ if (*ei_endpfn > end_pfn)
+ *ei_endpfn = end_pfn;
- /* Obey end_user_pfn to save on memmap */
- if (ei_startpfn >= end_user_pfn)
- continue;
- if (ei_endpfn > end_user_pfn)
- ei_endpfn = end_user_pfn;
+ /* Obey end_user_pfn to save on memmap */
+ if (*ei_startpfn >= end_user_pfn)
+ return 0;
+ if (*ei_endpfn > end_user_pfn)
+ *ei_endpfn = end_user_pfn;
- add_active_range(nid, ei_startpfn, ei_endpfn);
- }
+ return 1;
+}
+
+/* Walk the e820 map and register active regions within a node */
+void __init
+e820_register_active_regions(int nid, unsigned long start_pfn,
+ unsigned long end_pfn)
+{
+ unsigned long ei_startpfn;
+ unsigned long ei_endpfn;
+ int i;
+
+ for (i = 0; i < e820.nr_map; i++)
+ if (e820_find_active_region(&e820.map[i],
+ start_pfn, end_pfn,
+ &ei_startpfn, &ei_endpfn))
+ add_active_range(nid, ei_startpfn, ei_endpfn);
}
/*
@@ -350,12 +333,35 @@ void __init add_memory_region(unsigned long start, unsigned long size, int type)
e820.nr_map++;
}
+/*
+ * Find the hole size (in bytes) in the memory range.
+ * @start: starting address of the memory range to scan
+ * @end: ending address of the memory range to scan
+ */
+unsigned long __init e820_hole_size(unsigned long start, unsigned long end)
+{
+ unsigned long start_pfn = start >> PAGE_SHIFT;
+ unsigned long end_pfn = end >> PAGE_SHIFT;
+ unsigned long ei_startpfn;
+ unsigned long ei_endpfn;
+ unsigned long ram = 0;
+ int i;
+
+ for (i = 0; i < e820.nr_map; i++) {
+ if (e820_find_active_region(&e820.map[i],
+ start_pfn, end_pfn,
+ &ei_startpfn, &ei_endpfn))
+ ram += ei_endpfn - ei_startpfn;
+ }
+ return end - start - (ram << PAGE_SHIFT);
+}
+
void __init e820_print_map(char *who)
{
int i;
for (i = 0; i < e820.nr_map; i++) {
- printk(" %s: %016Lx - %016Lx ", who,
+ printk(KERN_INFO " %s: %016Lx - %016Lx ", who,
(unsigned long long) e820.map[i].addr,
(unsigned long long) (e820.map[i].addr + e820.map[i].size));
switch (e820.map[i].type) {
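
The rewritten e820_hole_size() counts RAM in whole pages: every map entry that intersects [start, end) contributes ei_endpfn - ei_startpfn pages, and the hole is whatever is left of the byte range. A standalone sketch of that accounting over a toy two-entry map (the entries and range are illustrative, and the clipping below is a simplified stand-in for e820_find_active_region()):

    #include <stdio.h>
    #include <stdint.h>

    #define PAGE_SHIFT 12

    struct entry { uint64_t addr, size; };  /* toy stand-in for struct e820entry */

    int main(void)
    {
            /* illustrative map: RAM at [0, 0x9f000) and [0x100000, 0x8000000) */
            struct entry map[] = {
                    { 0x0,      0x9f000   },
                    { 0x100000, 0x7f00000 },
            };
            uint64_t start = 0, end = 0x8000000;    /* range being scanned */
            uint64_t start_pfn = start >> PAGE_SHIFT, end_pfn = end >> PAGE_SHIFT;
            uint64_t ram = 0;

            for (unsigned i = 0; i < sizeof(map) / sizeof(map[0]); i++) {
                    uint64_t s = map[i].addr >> PAGE_SHIFT;
                    uint64_t e = (map[i].addr + map[i].size) >> PAGE_SHIFT;

                    if (s < start_pfn) s = start_pfn;       /* clip to the range */
                    if (e > end_pfn)   e = end_pfn;
                    if (e > s)
                            ram += e - s;                   /* pages of RAM */
            }
            printf("hole = %llu bytes\n",
                   (unsigned long long)(end - start - (ram << PAGE_SHIFT)));
            return 0;
    }
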
diff --git a/arch/x86_64/kernel/early-quirks.c b/arch/x86_64/kernel/early-quirks.c
index 990d9c218a5..13aa4fd728f 100644
--- a/arch/x86_64/kernel/early-quirks.c
+++ b/arch/x86_64/kernel/early-quirks.c
@@ -14,6 +14,7 @@
#include <linux/pci_ids.h>
#include <asm/pci-direct.h>
#include <asm/proto.h>
+#include <asm/iommu.h>
#include <asm/dma.h>
static void __init via_bugs(void)
diff --git a/arch/x86_64/kernel/early_printk.c b/arch/x86_64/kernel/early_printk.c
index 296d2b0c5d8..fd9aff3f389 100644
--- a/arch/x86_64/kernel/early_printk.c
+++ b/arch/x86_64/kernel/early_printk.c
@@ -6,6 +6,7 @@
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/fcntl.h>
+#include <xen/hvc-console.h>
/* Simple VGA output */
@@ -242,6 +243,10 @@ static int __init setup_early_printk(char *buf)
simnow_init(buf + 6);
early_console = &simnow_console;
keep_early = 1;
+#ifdef CONFIG_HVC_XEN
+ } else if (!strncmp(buf, "xen", 3)) {
+ early_console = &xenboot_console;
+#endif
}
if (keep_early)
diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S
index a67f87bf401..830cfc6ee8c 100644
--- a/arch/x86_64/kernel/entry.S
+++ b/arch/x86_64/kernel/entry.S
@@ -282,7 +282,7 @@ sysret_careful:
sysret_signal:
TRACE_IRQS_ON
sti
- testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx
+ testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx
jz 1f
/* Really a signal */
@@ -375,7 +375,7 @@ int_very_careful:
jmp int_restore_rest
int_signal:
- testl $(_TIF_NOTIFY_RESUME|_TIF_SIGPENDING|_TIF_SINGLESTEP),%edx
+ testl $(_TIF_NOTIFY_RESUME|_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx
jz 1f
movq %rsp,%rdi # &ptregs -> arg1
xorl %esi,%esi # oldset -> arg2
@@ -599,7 +599,7 @@ retint_careful:
jmp retint_check
retint_signal:
- testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx
+ testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx
jz retint_swapgs
TRACE_IRQS_ON
sti
diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S
index 1fab487dee8..e89abcdbdde 100644
--- a/arch/x86_64/kernel/head.S
+++ b/arch/x86_64/kernel/head.S
@@ -25,7 +25,7 @@
*/
.text
- .section .bootstrap.text
+ .section .text.head
.code64
.globl startup_64
startup_64:
@@ -73,7 +73,11 @@ startup_64:
addq %rbp, init_level4_pgt + (511*8)(%rip)
addq %rbp, level3_ident_pgt + 0(%rip)
+
addq %rbp, level3_kernel_pgt + (510*8)(%rip)
+ addq %rbp, level3_kernel_pgt + (511*8)(%rip)
+
+ addq %rbp, level2_fixmap_pgt + (506*8)(%rip)
/* Add an Identity mapping if I am above 1G */
leaq _text(%rip), %rdi
@@ -239,10 +243,16 @@ ENTRY(secondary_startup_64)
lretq
/* SMP bootup changes these two */
+#ifndef CONFIG_HOTPLUG_CPU
+ .pushsection .init.data
+#endif
.align 8
.globl initial_code
initial_code:
.quad x86_64_start_kernel
+#ifndef CONFIG_HOTPLUG_CPU
+ .popsection
+#endif
.globl init_rsp
init_rsp:
.quad init_thread_union+THREAD_SIZE-8
@@ -314,7 +324,16 @@ NEXT_PAGE(level3_kernel_pgt)
.fill 510,8,0
/* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */
.quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE
- .fill 1,8,0
+ .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
+
+NEXT_PAGE(level2_fixmap_pgt)
+ .fill 506,8,0
+ .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
+ /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */
+ .fill 5,8,0
+
+NEXT_PAGE(level1_fixmap_pgt)
+ .fill 512,8,0
NEXT_PAGE(level2_ident_pgt)
/* Since I easily can, map the first 1G.
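
For reference, the new level2_fixmap_pgt adds up to a full page table: 506 empty slots, one entry pointing at level1_fixmap_pgt, and 5 more empty slots, giving 506 + 1 + 5 = 512 entries. Each level-2 entry covers 2MB, so the "4 + 1 entries" in the comment is the 8MB vsyscall area (8MB / 2MB = 4) plus one 2MB hole, matching the .fill 5,8,0 above.
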
diff --git a/arch/x86_64/kernel/hpet.c b/arch/x86_64/kernel/hpet.c
index b8286968662..e2d1b912e15 100644
--- a/arch/x86_64/kernel/hpet.c
+++ b/arch/x86_64/kernel/hpet.c
@@ -133,7 +133,7 @@ struct clocksource clocksource_hpet = {
.vread = vread_hpet,
};
-int hpet_arch_init(void)
+int __init hpet_arch_init(void)
{
unsigned int id;
u64 tmp;
@@ -190,7 +190,7 @@ int hpet_reenable(void)
*/
#define TICK_COUNT 100000000
-#define TICK_MIN 5000
+#define SMI_THRESHOLD 50000
#define MAX_TRIES 5
/*
@@ -205,7 +205,7 @@ static void __init read_hpet_tsc(int *hpet, int *tsc)
tsc1 = get_cycles_sync();
hpet1 = hpet_readl(HPET_COUNTER);
tsc2 = get_cycles_sync();
- if (tsc2 - tsc1 > TICK_MIN)
+ if ((tsc2 - tsc1) < SMI_THRESHOLD)
break;
}
*hpet = hpet1;
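
The TICK_MIN check is inverted into an SMI_THRESHOLD: the HPET read is bracketed by two TSC reads and only accepted when the bracket is narrow, i.e. when no SMI or other long interruption landed in between. A user-space sketch of the same bracketing idea, with clock_gettime() standing in for the TSC and HPET and an illustrative threshold:

    #include <stdio.h>
    #include <time.h>
    #include <stdint.h>

    #define MAX_TRIES 5
    #define THRESHOLD 50000         /* ns; plays the SMI_THRESHOLD role here */

    static uint64_t now_ns(clockid_t id)
    {
            struct timespec ts;
            clock_gettime(id, &ts);
            return (uint64_t)ts.tv_sec * 1000000000ull + ts.tv_nsec;
    }

    int main(void)
    {
            uint64_t t1 = 0, mid = 0, t2 = 0;

            /* Retry until the two bracketing reads are close enough that the
             * middle sample can be trusted to correspond to both of them. */
            for (int i = 0; i < MAX_TRIES; i++) {
                    t1 = now_ns(CLOCK_MONOTONIC);
                    mid = now_ns(CLOCK_MONOTONIC_RAW);  /* the "HPET" read */
                    t2 = now_ns(CLOCK_MONOTONIC);
                    if (t2 - t1 < THRESHOLD)
                            break;
            }
            printf("sample %llu taken inside a %llu ns bracket\n",
                   (unsigned long long)mid, (unsigned long long)(t2 - t1));
            return 0;
    }
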
@@ -439,7 +439,7 @@ int hpet_rtc_dropped_irq(void)
return 1;
}
-irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs)
+irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id)
{
struct rtc_time curr_time;
unsigned long rtc_int_flag = 0;
diff --git a/arch/x86_64/kernel/i8259.c b/arch/x86_64/kernel/i8259.c
index 4b326655b20..948cae64609 100644
--- a/arch/x86_64/kernel/i8259.c
+++ b/arch/x86_64/kernel/i8259.c
@@ -444,24 +444,6 @@ void __init init_ISA_irqs (void)
}
}
-void apic_timer_interrupt(void);
-void spurious_interrupt(void);
-void error_interrupt(void);
-void reschedule_interrupt(void);
-void call_function_interrupt(void);
-void irq_move_cleanup_interrupt(void);
-void invalidate_interrupt0(void);
-void invalidate_interrupt1(void);
-void invalidate_interrupt2(void);
-void invalidate_interrupt3(void);
-void invalidate_interrupt4(void);
-void invalidate_interrupt5(void);
-void invalidate_interrupt6(void);
-void invalidate_interrupt7(void);
-void thermal_interrupt(void);
-void threshold_interrupt(void);
-void i8254_timer_resume(void);
-
static void setup_timer_hardware(void)
{
outb_p(0x34,0x43); /* binary, mode 2, LSB/MSB, ch 0 */
diff --git a/arch/x86_64/kernel/init_task.c b/arch/x86_64/kernel/init_task.c
index 3dc5854ba21..4ff33d4f855 100644
--- a/arch/x86_64/kernel/init_task.c
+++ b/arch/x86_64/kernel/init_task.c
@@ -44,7 +44,7 @@ EXPORT_SYMBOL(init_task);
* section. Since TSS's are completely CPU-local, we want them
* on exact cacheline boundaries, to eliminate cacheline ping-pong.
*/
-DEFINE_PER_CPU(struct tss_struct, init_tss) ____cacheline_internodealigned_in_smp = INIT_TSS;
+DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS;
/* Copies of the original ist values from the tss are only accessed during
* debugging, no special alignment required.
diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index 1c6c6f72457..050141c0602 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -152,6 +152,32 @@ static inline void io_apic_modify(unsigned int apic, unsigned int value)
writel(value, &io_apic->data);
}
+static int io_apic_level_ack_pending(unsigned int irq)
+{
+ struct irq_pin_list *entry;
+ unsigned long flags;
+ int pending = 0;
+
+ spin_lock_irqsave(&ioapic_lock, flags);
+ entry = irq_2_pin + irq;
+ for (;;) {
+ unsigned int reg;
+ int pin;
+
+ pin = entry->pin;
+ if (pin == -1)
+ break;
+ reg = io_apic_read(entry->apic, 0x10 + pin*2);
+ /* Is the remote IRR bit set? */
+ pending |= (reg >> 14) & 1;
+ if (!entry->next)
+ break;
+ entry = irq_2_pin + entry->next;
+ }
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+ return pending;
+}
+
/*
* Synchronize the IO-APIC and the CPU by doing
* a dummy read from the IO-APIC
@@ -1418,9 +1444,37 @@ static void ack_apic_level(unsigned int irq)
ack_APIC_irq();
/* Now we can move and renable the irq */
- move_masked_irq(irq);
- if (unlikely(do_unmask_irq))
+ if (unlikely(do_unmask_irq)) {
+ /* Only migrate the irq if the ack has been received.
+ *
+ * On rare occasions the broadcast level triggered ack gets
+ * delayed going to ioapics, and if we reprogram the
+ * vector while Remote IRR is still set the irq will never
+ * fire again.
+ *
+ * To prevent this scenario we read the Remote IRR bit
+ * of the ioapic. This has two effects.
+ * - On any sane system the read of the ioapic will
+ * flush writes (and acks) going to the ioapic from
+ * this cpu.
+ * - We get to see if the ACK has actually been delivered.
+ *
+ * Based on failed experiments of reprogramming the
+ * ioapic entry from outside of irq context starting
+ * with masking the ioapic entry and then polling until
+ * Remote IRR was clear before reprogramming the
+ * ioapic I don't trust the Remote IRR bit to be
+ * completey accurate.
+ * completely accurate.
+ *
+ * However there appears to be no other way to plug
+ * this race, so if the Remote IRR bit is not
+ * accurate and is causing problems then it is a hardware bug
+ * and you can go talk to the chipset vendor about it.
+ */
+ if (!io_apic_level_ack_pending(irq))
+ move_masked_irq(irq);
unmask_IO_APIC_irq(irq);
+ }
}
static struct irq_chip ioapic_chip __read_mostly = {
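
io_apic_level_ack_pending() above walks the irq's pin list and reads the low dword of each redirection entry (register 0x10 + pin*2); Remote IRR is bit 14 of that dword, so the irq is only migrated once every pin shows the ack as delivered. A tiny sketch of the bit extraction over made-up register snapshots:

    #include <stdio.h>
    #include <stdint.h>

    /* Remote IRR is bit 14 of the low dword of an I/O APIC redirection
     * entry, as tested by io_apic_level_ack_pending(). */
    static int remote_irr_set(uint32_t reg)
    {
            return (reg >> 14) & 1;
    }

    int main(void)
    {
            /* invented register snapshots for two pins */
            uint32_t pins[] = { 0x0000c059 /* IRR set */, 0x00000059 /* clear */ };
            int pending = 0;

            for (unsigned i = 0; i < sizeof(pins) / sizeof(pins[0]); i++)
                    pending |= remote_irr_set(pins[i]);

            printf("ack still pending: %d\n", pending);
            return 0;
    }
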
diff --git a/arch/x86_64/kernel/kprobes.c b/arch/x86_64/kernel/kprobes.c
index d4a0d0ac993..a30e004682e 100644
--- a/arch/x86_64/kernel/kprobes.c
+++ b/arch/x86_64/kernel/kprobes.c
@@ -39,9 +39,9 @@
#include <linux/module.h>
#include <linux/kdebug.h>
-#include <asm/cacheflush.h>
#include <asm/pgtable.h>
#include <asm/uaccess.h>
+#include <asm/alternative.h>
void jprobe_return_end(void);
static void __kprobes arch_copy_kprobe(struct kprobe *p);
@@ -209,16 +209,12 @@ static void __kprobes arch_copy_kprobe(struct kprobe *p)
void __kprobes arch_arm_kprobe(struct kprobe *p)
{
- *p->addr = BREAKPOINT_INSTRUCTION;
- flush_icache_range((unsigned long) p->addr,
- (unsigned long) p->addr + sizeof(kprobe_opcode_t));
+ text_poke(p->addr, ((unsigned char []){BREAKPOINT_INSTRUCTION}), 1);
}
void __kprobes arch_disarm_kprobe(struct kprobe *p)
{
- *p->addr = p->opcode;
- flush_icache_range((unsigned long) p->addr,
- (unsigned long) p->addr + sizeof(kprobe_opcode_t));
+ text_poke(p->addr, &p->opcode, 1);
}
void __kprobes arch_remove_kprobe(struct kprobe *p)
diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c
index aa1d1599179..a66d607f5b9 100644
--- a/arch/x86_64/kernel/mce.c
+++ b/arch/x86_64/kernel/mce.c
@@ -18,6 +18,8 @@
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/percpu.h>
+#include <linux/poll.h>
+#include <linux/thread_info.h>
#include <linux/ctype.h>
#include <linux/kmod.h>
#include <linux/kdebug.h>
@@ -26,6 +28,7 @@
#include <asm/mce.h>
#include <asm/uaccess.h>
#include <asm/smp.h>
+#include <asm/idle.h>
#define MISC_MCELOG_MINOR 227
#define NR_BANKS 6
@@ -34,13 +37,17 @@ atomic_t mce_entry;
static int mce_dont_init;
-/* 0: always panic, 1: panic if deadlock possible, 2: try to avoid panic,
- 3: never panic or exit (for testing only) */
+/*
+ * Tolerant levels:
+ * 0: always panic on uncorrected errors, log corrected errors
+ * 1: panic or SIGBUS on uncorrected errors, log corrected errors
+ * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors
+ * 3: never panic or SIGBUS, log all errors (for testing only)
+ */
static int tolerant = 1;
static int banks;
static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL };
-static unsigned long console_logged;
-static int notify_user;
+static unsigned long notify_user;
static int rip_msr;
static int mce_bootlog = 1;
static atomic_t mce_events;
@@ -48,6 +55,8 @@ static atomic_t mce_events;
static char trigger[128];
static char *trigger_argv[2] = { trigger, NULL };
+static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
+
/*
* Lockless MCE logging infrastructure.
* This avoids deadlocks on printk locks without having to break locks. Also
@@ -94,8 +103,7 @@ void mce_log(struct mce *mce)
mcelog.entry[entry].finished = 1;
wmb();
- if (!test_and_set_bit(0, &console_logged))
- notify_user = 1;
+ set_bit(0, &notify_user);
}
static void print_mce(struct mce *m)
@@ -128,6 +136,7 @@ static void print_mce(struct mce *m)
static void mce_panic(char *msg, struct mce *backup, unsigned long start)
{
int i;
+
oops_begin();
for (i = 0; i < MCE_LOG_LEN; i++) {
unsigned long tsc = mcelog.entry[i].tsc;
@@ -139,10 +148,7 @@ static void mce_panic(char *msg, struct mce *backup, unsigned long start)
}
if (backup)
print_mce(backup);
- if (tolerant >= 3)
- printk("Fake panic: %s\n", msg);
- else
- panic(msg);
+ panic(msg);
}
static int mce_available(struct cpuinfo_x86 *c)
@@ -167,17 +173,6 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
}
}
-static void do_mce_trigger(void)
-{
- static atomic_t mce_logged;
- int events = atomic_read(&mce_events);
- if (events != atomic_read(&mce_logged) && trigger[0]) {
- /* Small race window, but should be harmless. */
- atomic_set(&mce_logged, events);
- call_usermodehelper(trigger, trigger_argv, NULL, -1);
- }
-}
-
/*
* The actual machine check handler
*/
@@ -185,11 +180,19 @@ static void do_mce_trigger(void)
void do_machine_check(struct pt_regs * regs, long error_code)
{
struct mce m, panicm;
- int nowayout = (tolerant < 1);
- int kill_it = 0;
u64 mcestart = 0;
int i;
int panicm_found = 0;
+ /*
+ * If no_way_out gets set, there is no safe way to recover from this
+ * MCE. If tolerant is cranked up, we'll try anyway.
+ */
+ int no_way_out = 0;
+ /*
+ * If kill_it gets set, there might be a way to recover from this
+ * error.
+ */
+ int kill_it = 0;
atomic_inc(&mce_entry);
@@ -201,8 +204,9 @@ void do_machine_check(struct pt_regs * regs, long error_code)
memset(&m, 0, sizeof(struct mce));
m.cpu = smp_processor_id();
rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
+ /* if the restart IP is not valid, we're done for */
if (!(m.mcgstatus & MCG_STATUS_RIPV))
- kill_it = 1;
+ no_way_out = 1;
rdtscll(mcestart);
barrier();
@@ -221,10 +225,18 @@ void do_machine_check(struct pt_regs * regs, long error_code)
continue;
if (m.status & MCI_STATUS_EN) {
- /* In theory _OVER could be a nowayout too, but
- assume any overflowed errors were no fatal. */
- nowayout |= !!(m.status & MCI_STATUS_PCC);
- kill_it |= !!(m.status & MCI_STATUS_UC);
+ /* if PCC was set, there's no way out */
+ no_way_out |= !!(m.status & MCI_STATUS_PCC);
+ /*
+ * If this error was uncorrectable and there was
+ * an overflow, we're in trouble. If no overflow,
+ * we might get away with just killing a task.
+ */
+ if (m.status & MCI_STATUS_UC) {
+ if (tolerant < 1 || m.status & MCI_STATUS_OVER)
+ no_way_out = 1;
+ kill_it = 1;
+ }
}
if (m.status & MCI_STATUS_MISCV)
@@ -235,7 +247,6 @@ void do_machine_check(struct pt_regs * regs, long error_code)
mce_get_rip(&m, regs);
if (error_code >= 0)
rdtscll(m.tsc);
- wrmsrl(MSR_IA32_MC0_STATUS + i*4, 0);
if (error_code != -2)
mce_log(&m);
@@ -251,45 +262,59 @@ void do_machine_check(struct pt_regs * regs, long error_code)
}
/* Never do anything final in the polling timer */
- if (!regs) {
- /* Normal interrupt context here. Call trigger for any new
- events. */
- do_mce_trigger();
+ if (!regs)
goto out;
- }
/* If we didn't find an uncorrectable error, pick
the last one (shouldn't happen, just being safe). */
if (!panicm_found)
panicm = m;
- if (nowayout)
+
+ /*
+ * If we have decided that we just CAN'T continue, and the user
+ * has not set tolerant to an insane level, give up and die.
+ */
+ if (no_way_out && tolerant < 3)
mce_panic("Machine check", &panicm, mcestart);
- if (kill_it) {
+
+ /*
+ * If the error seems to be unrecoverable, something should be
+ * done. Try to kill as little as possible. If we can kill just
+ * one task, do that. If the user has set the tolerance very
+ * high, don't try to do anything at all.
+ */
+ if (kill_it && tolerant < 3) {
int user_space = 0;
- if (m.mcgstatus & MCG_STATUS_RIPV)
+ /*
+ * If the EIPV bit is set, it means the saved IP is the
+ * instruction which caused the MCE.
+ */
+ if (m.mcgstatus & MCG_STATUS_EIPV)
user_space = panicm.rip && (panicm.cs & 3);
-
- /* When the machine was in user space and the CPU didn't get
- confused it's normally not necessary to panic, unless you
- are paranoid (tolerant == 0)
-
- RED-PEN could be more tolerant for MCEs in idle,
- but most likely they occur at boot anyways, where
- it is best to just halt the machine. */
- if ((!user_space && (panic_on_oops || tolerant < 2)) ||
- (unsigned)current->pid <= 1)
- mce_panic("Uncorrected machine check", &panicm, mcestart);
-
- /* do_exit takes an awful lot of locks and has as
- slight risk of deadlocking. If you don't want that
- don't set tolerant >= 2 */
- if (tolerant < 3)
+
+ /*
+ * If we know that the error was in user space, send a
+ * SIGBUS. Otherwise, panic if tolerance is low.
+ *
+ * do_exit() takes an awful lot of locks and has a slight
+ * risk of deadlocking.
+ */
+ if (user_space) {
do_exit(SIGBUS);
+ } else if (panic_on_oops || tolerant < 2) {
+ mce_panic("Uncorrected machine check",
+ &panicm, mcestart);
+ }
}
+ /* notify userspace ASAP */
+ set_thread_flag(TIF_MCE_NOTIFY);
+
out:
- /* Last thing done in the machine check exception to clear state. */
+ /* the last thing we do is clear state */
+ for (i = 0; i < banks; i++)
+ wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
wrmsrl(MSR_IA32_MCG_STATUS, 0);
out2:
atomic_dec(&mce_entry);
@@ -344,37 +369,69 @@ static void mcheck_timer(struct work_struct *work)
on_each_cpu(mcheck_check_cpu, NULL, 1, 1);
/*
- * It's ok to read stale data here for notify_user and
- * console_logged as we'll simply get the updated versions
- * on the next mcheck_timer execution and atomic operations
- * on console_logged act as synchronization for notify_user
- * writes.
+ * Alert userspace if needed. If we logged an MCE, reduce the
+ * polling interval, otherwise increase the polling interval.
*/
- if (notify_user && console_logged) {
+ if (mce_notify_user()) {
+ next_interval = max(next_interval/2, HZ/100);
+ } else {
+ next_interval = min(next_interval*2,
+ (int)round_jiffies_relative(check_interval*HZ));
+ }
+
+ schedule_delayed_work(&mcheck_work, next_interval);
+}
+
+/*
+ * This is only called from process context. This is where we do
+ * anything we need to alert userspace about new MCEs. This is called
+ * directly from the poller and also from entry.S and idle, thanks to
+ * TIF_MCE_NOTIFY.
+ */
+int mce_notify_user(void)
+{
+ clear_thread_flag(TIF_MCE_NOTIFY);
+ if (test_and_clear_bit(0, &notify_user)) {
static unsigned long last_print;
unsigned long now = jiffies;
- /* if we logged an MCE, reduce the polling interval */
- next_interval = max(next_interval/2, HZ/100);
- notify_user = 0;
- clear_bit(0, &console_logged);
+ wake_up_interruptible(&mce_wait);
+ if (trigger[0])
+ call_usermodehelper(trigger, trigger_argv, NULL,
+ UMH_NO_WAIT);
+
if (time_after_eq(now, last_print + (check_interval*HZ))) {
last_print = now;
printk(KERN_INFO "Machine check events logged\n");
}
- } else {
- next_interval = min(next_interval*2, check_interval*HZ);
+
+ return 1;
}
+ return 0;
+}
- schedule_delayed_work(&mcheck_work, next_interval);
+/* see if the idle task needs to notify userspace */
+static int
+mce_idle_callback(struct notifier_block *nfb, unsigned long action, void *junk)
+{
+ /* IDLE_END should be safe - interrupts are back on */
+ if (action == IDLE_END && test_thread_flag(TIF_MCE_NOTIFY))
+ mce_notify_user();
+
+ return NOTIFY_OK;
}
+static struct notifier_block mce_idle_notifier = {
+ .notifier_call = mce_idle_callback,
+};
static __init int periodic_mcheck_init(void)
{
next_interval = check_interval * HZ;
if (next_interval)
- schedule_delayed_work(&mcheck_work, next_interval);
+ schedule_delayed_work(&mcheck_work,
+ round_jiffies_relative(next_interval));
+ idle_notifier_register(&mce_idle_notifier);
return 0;
}
__initcall(periodic_mcheck_init);
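
mcheck_timer() now adapts its own period: the interval is halved (down to HZ/100) whenever mce_notify_user() reported new events, and doubled (up to the check_interval ceiling) while the machine is quiet, with round_jiffies_relative() batching the wakeup onto a whole jiffy. A small sketch of that halve/double clamp; both constants below are illustrative, and the jiffy rounding is omitted:

    #include <stdio.h>

    #define HZ             250              /* illustrative */
    #define CHECK_INTERVAL (5 * 60)         /* seconds; illustrative ceiling */

    static int next_interval = CHECK_INTERVAL * HZ;

    /* Shrink the poll period after an event, grow it while idle. */
    static void adapt(int logged_event)
    {
            if (logged_event)
                    next_interval = next_interval / 2 > HZ / 100 ?
                                    next_interval / 2 : HZ / 100;
            else
                    next_interval = next_interval * 2 < CHECK_INTERVAL * HZ ?
                                    next_interval * 2 : CHECK_INTERVAL * HZ;
    }

    int main(void)
    {
            for (int i = 0; i < 4; i++) {   /* a burst of logged events... */
                    adapt(1);
                    printf("after event: %d jiffies\n", next_interval);
            }
            adapt(0);                       /* ...then quiet again */
            printf("after quiet: %d jiffies\n", next_interval);
            return 0;
    }
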
@@ -465,6 +522,40 @@ void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
* Character device to read and clear the MCE log.
*/
+static DEFINE_SPINLOCK(mce_state_lock);
+static int open_count; /* #times opened */
+static int open_exclu; /* already open exclusive? */
+
+static int mce_open(struct inode *inode, struct file *file)
+{
+ spin_lock(&mce_state_lock);
+
+ if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
+ spin_unlock(&mce_state_lock);
+ return -EBUSY;
+ }
+
+ if (file->f_flags & O_EXCL)
+ open_exclu = 1;
+ open_count++;
+
+ spin_unlock(&mce_state_lock);
+
+ return nonseekable_open(inode, file);
+}
+
+static int mce_release(struct inode *inode, struct file *file)
+{
+ spin_lock(&mce_state_lock);
+
+ open_count--;
+ open_exclu = 0;
+
+ spin_unlock(&mce_state_lock);
+
+ return 0;
+}
+
static void collect_tscs(void *data)
{
unsigned long *cpu_tsc = (unsigned long *)data;
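
The new mce_open()/mce_release() pair lets /dev/mcelog be opened exclusively: an O_EXCL opener is refused if the device is already open, and any opener is refused while an exclusive holder exists, all tracked by two counters under a lock. A user-space analogue of that bookkeeping (the names mirror the driver; the pthread mutex replaces the spinlock purely for illustration):

    #include <stdio.h>
    #include <pthread.h>
    #include <errno.h>

    static pthread_mutex_t state_lock = PTHREAD_MUTEX_INITIALIZER;
    static int open_count;  /* how many times the device is open */
    static int open_exclu;  /* is the current holder exclusive? */

    static int dev_open(int want_excl)
    {
            int err = 0;

            pthread_mutex_lock(&state_lock);
            if (open_exclu || (open_count && want_excl)) {
                    err = -EBUSY;           /* someone else is in the way */
            } else {
                    if (want_excl)
                            open_exclu = 1;
                    open_count++;
            }
            pthread_mutex_unlock(&state_lock);
            return err;
    }

    static void dev_release(void)
    {
            pthread_mutex_lock(&state_lock);
            open_count--;
            open_exclu = 0;
            pthread_mutex_unlock(&state_lock);
    }

    int main(void)
    {
            printf("plain open:     %d\n", dev_open(0));    /* 0 */
            printf("exclusive open: %d\n", dev_open(1));    /* -EBUSY */
            dev_release();
            printf("exclusive open: %d\n", dev_open(1));    /* 0 */
            return 0;
    }
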
@@ -532,6 +623,14 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff
return err ? -EFAULT : buf - ubuf;
}
+static unsigned int mce_poll(struct file *file, poll_table *wait)
+{
+ poll_wait(file, &mce_wait, wait);
+ if (rcu_dereference(mcelog.next))
+ return POLLIN | POLLRDNORM;
+ return 0;
+}
+
static int mce_ioctl(struct inode *i, struct file *f,unsigned int cmd, unsigned long arg)
{
int __user *p = (int __user *)arg;
@@ -555,7 +654,10 @@ static int mce_ioctl(struct inode *i, struct file *f,unsigned int cmd, unsigned
}
static const struct file_operations mce_chrdev_ops = {
+ .open = mce_open,
+ .release = mce_release,
.read = mce_read,
+ .poll = mce_poll,
.ioctl = mce_ioctl,
};
@@ -565,6 +667,20 @@ static struct miscdevice mce_log_device = {
&mce_chrdev_ops,
};
+static unsigned long old_cr4 __initdata;
+
+void __init stop_mce(void)
+{
+ old_cr4 = read_cr4();
+ clear_in_cr4(X86_CR4_MCE);
+}
+
+void __init restart_mce(void)
+{
+ if (old_cr4 & X86_CR4_MCE)
+ set_in_cr4(X86_CR4_MCE);
+}
+
/*
* Old style boot options parsing. Only for compatibility.
*/
@@ -620,7 +736,8 @@ static void mce_restart(void)
on_each_cpu(mce_init, NULL, 1, 1);
next_interval = check_interval * HZ;
if (next_interval)
- schedule_delayed_work(&mcheck_work, next_interval);
+ schedule_delayed_work(&mcheck_work,
+ round_jiffies_relative(next_interval));
}
static struct sysdev_class mce_sysclass = {
diff --git a/arch/x86_64/kernel/mce_amd.c b/arch/x86_64/kernel/mce_amd.c
index 03356e64f9c..2f8a7f18b0f 100644
--- a/arch/x86_64/kernel/mce_amd.c
+++ b/arch/x86_64/kernel/mce_amd.c
@@ -157,9 +157,9 @@ void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c)
high |= K8_APIC_EXT_LVT_ENTRY_THRESHOLD << 20;
wrmsr(address, low, high);
- setup_APIC_extened_lvt(K8_APIC_EXT_LVT_ENTRY_THRESHOLD,
- THRESHOLD_APIC_VECTOR,
- K8_APIC_EXT_INT_MSG_FIX, 0);
+ setup_APIC_extended_lvt(K8_APIC_EXT_LVT_ENTRY_THRESHOLD,
+ THRESHOLD_APIC_VECTOR,
+ K8_APIC_EXT_INT_MSG_FIX, 0);
threshold_defaults.address = address;
threshold_restart_bank(&threshold_defaults, 0, 0);
diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c
index 61ae57eb9e4..8bf0ca03ac8 100644
--- a/arch/x86_64/kernel/mpparse.c
+++ b/arch/x86_64/kernel/mpparse.c
@@ -32,7 +32,6 @@
/* Have we found an MP table */
int smp_found_config;
-unsigned int __initdata maxcpus = NR_CPUS;
/*
* Various Linux-internal data structures created from the
@@ -649,6 +648,20 @@ static int mp_find_ioapic(int gsi)
return -1;
}
+static u8 uniq_ioapic_id(u8 id)
+{
+ int i;
+ DECLARE_BITMAP(used, 256);
+ bitmap_zero(used, 256);
+ for (i = 0; i < nr_ioapics; i++) {
+ struct mpc_config_ioapic *ia = &mp_ioapics[i];
+ __set_bit(ia->mpc_apicid, used);
+ }
+ if (!test_bit(id, used))
+ return id;
+ return find_first_zero_bit(used, 256);
+}
+
void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base)
{
int idx = 0;
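
uniq_ioapic_id() keeps the BIOS-supplied I/O APIC id when it is still free and falls back to the first unused id when the BIOS hands out duplicates, using a 256-bit bitmap of ids already registered. A standalone sketch of the same pick-or-remap logic (the registered ids below are invented, and the bit helpers replace the kernel bitmap API):

    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>

    #define NBITS 256

    static void set_bit256(uint8_t *map, unsigned id)
    {
            map[id / 8] |= 1u << (id % 8);
    }

    static int test_bit256(const uint8_t *map, unsigned id)
    {
            return map[id / 8] >> (id % 8) & 1;
    }

    /* Return id if it is still free, otherwise the lowest unused id. */
    static unsigned uniq_id(const uint8_t *used, unsigned id)
    {
            if (!test_bit256(used, id))
                    return id;
            for (unsigned i = 0; i < NBITS; i++)
                    if (!test_bit256(used, i))
                            return i;
            return id;      /* all 256 ids in use; nothing better to return */
    }

    int main(void)
    {
            uint8_t used[NBITS / 8];
            unsigned regs[] = { 0, 1, 2 };  /* ids already registered */

            memset(used, 0, sizeof(used));
            for (unsigned i = 0; i < sizeof(regs) / sizeof(regs[0]); i++)
                    set_bit256(used, regs[i]);

            printf("id 8 stays %u\n", uniq_id(used, 8));    /* free: kept */
            printf("id 2 becomes %u\n", uniq_id(used, 2));  /* duplicate: 3 */
            return 0;
    }
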
@@ -656,14 +669,14 @@ void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base)
if (bad_ioapic(address))
return;
- idx = nr_ioapics++;
+ idx = nr_ioapics;
mp_ioapics[idx].mpc_type = MP_IOAPIC;
mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
mp_ioapics[idx].mpc_apicaddr = address;
set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
- mp_ioapics[idx].mpc_apicid = id;
+ mp_ioapics[idx].mpc_apicid = uniq_ioapic_id(id);
mp_ioapics[idx].mpc_apicver = 0;
/*
@@ -680,6 +693,8 @@ void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base)
mp_ioapics[idx].mpc_apicaddr,
mp_ioapic_routing[idx].gsi_start,
mp_ioapic_routing[idx].gsi_end);
+
+ nr_ioapics++;
}
void __init
diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c
index 931c64bad5e..cb8ee9d02f8 100644
--- a/arch/x86_64/kernel/nmi.c
+++ b/arch/x86_64/kernel/nmi.c
@@ -296,7 +296,7 @@ static DEFINE_PER_CPU(unsigned, last_irq_sum);
static DEFINE_PER_CPU(local_t, alert_counter);
static DEFINE_PER_CPU(int, nmi_touch);
-void touch_nmi_watchdog (void)
+void touch_nmi_watchdog(void)
{
if (nmi_watchdog > 0) {
unsigned cpu;
@@ -306,8 +306,10 @@ void touch_nmi_watchdog (void)
* do it ourselves because the alert count increase is not
* atomic.
*/
- for_each_present_cpu (cpu)
- per_cpu(nmi_touch, cpu) = 1;
+ for_each_present_cpu(cpu) {
+ if (per_cpu(nmi_touch, cpu) != 1)
+ per_cpu(nmi_touch, cpu) = 1;
+ }
}
touch_softlockup_watchdog();
@@ -382,11 +384,14 @@ int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
return rc;
}
+static unsigned ignore_nmis;
+
asmlinkage __kprobes void do_nmi(struct pt_regs * regs, long error_code)
{
nmi_enter();
add_pda(__nmi_count,1);
- default_do_nmi(regs);
+ if (!ignore_nmis)
+ default_do_nmi(regs);
nmi_exit();
}
@@ -399,6 +404,18 @@ int do_nmi_callback(struct pt_regs * regs, int cpu)
return 0;
}
+void stop_nmi(void)
+{
+ acpi_nmi_disable();
+ ignore_nmis++;
+}
+
+void restart_nmi(void)
+{
+ ignore_nmis--;
+ acpi_nmi_enable();
+}
+
#ifdef CONFIG_SYSCTL
static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c
index 5bd20b542c1..ba16c968ca3 100644
--- a/arch/x86_64/kernel/pci-calgary.c
+++ b/arch/x86_64/kernel/pci-calgary.c
@@ -1,7 +1,7 @@
/*
* Derived from arch/powerpc/kernel/iommu.c
*
- * Copyright (C) IBM Corporation, 2006
+ * Copyright IBM Corporation, 2006-2007
* Copyright (C) 2006 Jon Mason <jdmason@kudzu.us>
*
* Author: Jon Mason <jdmason@kudzu.us>
@@ -35,7 +35,7 @@
#include <linux/pci_ids.h>
#include <linux/pci.h>
#include <linux/delay.h>
-#include <asm/proto.h>
+#include <asm/iommu.h>
#include <asm/calgary.h>
#include <asm/tce.h>
#include <asm/pci-direct.h>
@@ -50,13 +50,7 @@ int use_calgary __read_mostly = 0;
#endif /* CONFIG_CALGARY_DEFAULT_ENABLED */
#define PCI_DEVICE_ID_IBM_CALGARY 0x02a1
-#define PCI_VENDOR_DEVICE_ID_CALGARY \
- (PCI_VENDOR_ID_IBM | PCI_DEVICE_ID_IBM_CALGARY << 16)
-
-/* we need these for register space address calculation */
-#define START_ADDRESS 0xfe000000
-#define CHASSIS_BASE 0
-#define ONE_BASED_CHASSIS_NUM 1
+#define PCI_DEVICE_ID_IBM_CALIOC2 0x0308
/* register offsets inside the host bridge space */
#define CALGARY_CONFIG_REG 0x0108
@@ -80,6 +74,12 @@ int use_calgary __read_mostly = 0;
#define PHB_MEM_2_SIZE_LOW 0x02E0
#define PHB_DOSHOLE_OFFSET 0x08E0
+/* CalIOC2 specific */
+#define PHB_SAVIOR_L2 0x0DB0
+#define PHB_PAGE_MIG_CTRL 0x0DA8
+#define PHB_PAGE_MIG_DEBUG 0x0DA0
+#define PHB_ROOT_COMPLEX_STATUS 0x0CB0
+
/* PHB_CONFIG_RW */
#define PHB_TCE_ENABLE 0x20000000
#define PHB_SLOT_DISABLE 0x1C000000
@@ -92,7 +92,11 @@ int use_calgary __read_mostly = 0;
/* CSR (Channel/DMA Status Register) */
#define CSR_AGENT_MASK 0xffe0ffff
/* CCR (Calgary Configuration Register) */
-#define CCR_2SEC_TIMEOUT 0x000000000000000EUL
+#define CCR_2SEC_TIMEOUT 0x000000000000000EUL
+/* PMCR/PMDR (Page Migration Control/Debug Registers) */
+#define PMR_SOFTSTOP 0x80000000
+#define PMR_SOFTSTOPFAULT 0x40000000
+#define PMR_HARDSTOP 0x20000000
#define MAX_NUM_OF_PHBS 8 /* how many PHBs in total? */
#define MAX_NUM_CHASSIS 8 /* max number of chassis */
@@ -155,9 +159,26 @@ struct calgary_bus_info {
void __iomem *bbar;
};
-static struct calgary_bus_info bus_info[MAX_PHB_BUS_NUM] = { { NULL, 0, 0 }, };
+static void calgary_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev);
+static void calgary_tce_cache_blast(struct iommu_table *tbl);
+static void calgary_dump_error_regs(struct iommu_table *tbl);
+static void calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev);
+static void calioc2_tce_cache_blast(struct iommu_table *tbl);
+static void calioc2_dump_error_regs(struct iommu_table *tbl);
+
+static struct cal_chipset_ops calgary_chip_ops = {
+ .handle_quirks = calgary_handle_quirks,
+ .tce_cache_blast = calgary_tce_cache_blast,
+ .dump_error_regs = calgary_dump_error_regs
+};
-static void tce_cache_blast(struct iommu_table *tbl);
+static struct cal_chipset_ops calioc2_chip_ops = {
+ .handle_quirks = calioc2_handle_quirks,
+ .tce_cache_blast = calioc2_tce_cache_blast,
+ .dump_error_regs = calioc2_dump_error_regs
+};
+
+static struct calgary_bus_info bus_info[MAX_PHB_BUS_NUM] = { { NULL, 0, 0 }, };
/* enable this to stress test the chip's TCE cache */
#ifdef CONFIG_IOMMU_DEBUG
@@ -187,6 +208,7 @@ static inline unsigned long verify_bit_range(unsigned long* bitmap,
{
return ~0UL;
}
+
#endif /* CONFIG_IOMMU_DEBUG */
static inline unsigned int num_dma_pages(unsigned long dma, unsigned int dmalen)
@@ -206,11 +228,12 @@ static inline int translate_phb(struct pci_dev* dev)
}
static void iommu_range_reserve(struct iommu_table *tbl,
- unsigned long start_addr, unsigned int npages)
+ unsigned long start_addr, unsigned int npages)
{
unsigned long index;
unsigned long end;
unsigned long badbit;
+ unsigned long flags;
index = start_addr >> PAGE_SHIFT;
@@ -222,6 +245,8 @@ static void iommu_range_reserve(struct iommu_table *tbl,
if (end > tbl->it_size) /* don't go off the table */
end = tbl->it_size;
+ spin_lock_irqsave(&tbl->it_lock, flags);
+
badbit = verify_bit_range(tbl->it_map, 0, index, end);
if (badbit != ~0UL) {
if (printk_ratelimit())
@@ -231,23 +256,29 @@ static void iommu_range_reserve(struct iommu_table *tbl,
}
set_bit_string(tbl->it_map, index, npages);
+
+ spin_unlock_irqrestore(&tbl->it_lock, flags);
}
static unsigned long iommu_range_alloc(struct iommu_table *tbl,
unsigned int npages)
{
+ unsigned long flags;
unsigned long offset;
BUG_ON(npages == 0);
+ spin_lock_irqsave(&tbl->it_lock, flags);
+
offset = find_next_zero_string(tbl->it_map, tbl->it_hint,
tbl->it_size, npages);
if (offset == ~0UL) {
- tce_cache_blast(tbl);
+ tbl->chip_ops->tce_cache_blast(tbl);
offset = find_next_zero_string(tbl->it_map, 0,
tbl->it_size, npages);
if (offset == ~0UL) {
printk(KERN_WARNING "Calgary: IOMMU full.\n");
+ spin_unlock_irqrestore(&tbl->it_lock, flags);
if (panic_on_overflow)
panic("Calgary: fix the allocator.\n");
else
@@ -259,17 +290,17 @@ static unsigned long iommu_range_alloc(struct iommu_table *tbl,
tbl->it_hint = offset + npages;
BUG_ON(tbl->it_hint > tbl->it_size);
+ spin_unlock_irqrestore(&tbl->it_lock, flags);
+
return offset;
}
static dma_addr_t iommu_alloc(struct iommu_table *tbl, void *vaddr,
unsigned int npages, int direction)
{
- unsigned long entry, flags;
+ unsigned long entry;
dma_addr_t ret = bad_dma_address;
- spin_lock_irqsave(&tbl->it_lock, flags);
-
entry = iommu_range_alloc(tbl, npages);
if (unlikely(entry == bad_dma_address))
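
This hunk moves the it_lock from callers such as iommu_alloc() into iommu_range_alloc() itself, so the bitmap search and the hint update happen under one lock and the mapping paths no longer juggle irqsave flags. A user-space sketch of a bitmap range allocator that takes its own lock the same way (the table size is illustrative, the search is simplified with no retry from zero, and a pthread mutex stands in for the spinlock):

    #include <stdio.h>
    #include <pthread.h>

    #define TABLE_SIZE 64   /* illustrative number of IOMMU pages */

    static pthread_mutex_t it_lock = PTHREAD_MUTEX_INITIALIZER;
    static unsigned char it_map[TABLE_SIZE];        /* 1 = page in use */
    static unsigned long it_hint;

    /* Find and claim npages consecutive free slots; the lock lives here,
     * not in the callers, mirroring the reworked iommu_range_alloc(). */
    static long range_alloc(unsigned npages)
    {
            long found = -1;

            pthread_mutex_lock(&it_lock);
            for (unsigned start = it_hint; start + npages <= TABLE_SIZE; start++) {
                    unsigned n = 0;

                    while (n < npages && !it_map[start + n])
                            n++;
                    if (n == npages) {
                            for (n = 0; n < npages; n++)
                                    it_map[start + n] = 1;
                            it_hint = start + npages;
                            found = start;
                            break;
                    }
            }
            pthread_mutex_unlock(&it_lock);
            return found;   /* -1 plays the role of bad_dma_address */
    }

    int main(void)
    {
            printf("first alloc at %ld\n", range_alloc(4));  /* 0 */
            printf("second alloc at %ld\n", range_alloc(2)); /* 4 */
            return 0;
    }
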
@@ -282,23 +313,21 @@ static dma_addr_t iommu_alloc(struct iommu_table *tbl, void *vaddr,
tce_build(tbl, entry, npages, (unsigned long)vaddr & PAGE_MASK,
direction);
- spin_unlock_irqrestore(&tbl->it_lock, flags);
-
return ret;
error:
- spin_unlock_irqrestore(&tbl->it_lock, flags);
printk(KERN_WARNING "Calgary: failed to allocate %u pages in "
"iommu %p\n", npages, tbl);
return bad_dma_address;
}
-static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
+static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
unsigned int npages)
{
unsigned long entry;
unsigned long badbit;
unsigned long badend;
+ unsigned long flags;
/* were we called with bad_dma_address? */
badend = bad_dma_address + (EMERGENCY_PAGES * PAGE_SIZE);
@@ -315,6 +344,8 @@ static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
tce_free(tbl, entry, npages);
+ spin_lock_irqsave(&tbl->it_lock, flags);
+
badbit = verify_bit_range(tbl->it_map, 1, entry, entry + npages);
if (badbit != ~0UL) {
if (printk_ratelimit())
@@ -324,23 +355,40 @@ static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
}
__clear_bit_string(tbl->it_map, entry, npages);
+
+ spin_unlock_irqrestore(&tbl->it_lock, flags);
}
-static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
- unsigned int npages)
+static inline struct iommu_table *find_iommu_table(struct device *dev)
{
- unsigned long flags;
+ struct pci_dev *pdev;
+ struct pci_bus *pbus;
+ struct iommu_table *tbl;
- spin_lock_irqsave(&tbl->it_lock, flags);
+ pdev = to_pci_dev(dev);
- __iommu_free(tbl, dma_addr, npages);
+ /* is the device behind a bridge? */
+ if (unlikely(pdev->bus->parent))
+ pbus = pdev->bus->parent;
+ else
+ pbus = pdev->bus;
- spin_unlock_irqrestore(&tbl->it_lock, flags);
+ tbl = pci_iommu(pbus);
+
+ BUG_ON(pdev->bus->parent &&
+ (tbl->it_busno != pdev->bus->parent->number));
+
+ return tbl;
}
-static void __calgary_unmap_sg(struct iommu_table *tbl,
+static void calgary_unmap_sg(struct device *dev,
struct scatterlist *sglist, int nelems, int direction)
{
+ struct iommu_table *tbl = find_iommu_table(dev);
+
+ if (!translate_phb(to_pci_dev(dev)))
+ return;
+
while (nelems--) {
unsigned int npages;
dma_addr_t dma = sglist->dma_address;
@@ -350,33 +398,17 @@ static void __calgary_unmap_sg(struct iommu_table *tbl,
break;
npages = num_dma_pages(dma, dmalen);
- __iommu_free(tbl, dma, npages);
+ iommu_free(tbl, dma, npages);
sglist++;
}
}
-void calgary_unmap_sg(struct device *dev, struct scatterlist *sglist,
- int nelems, int direction)
-{
- unsigned long flags;
- struct iommu_table *tbl = to_pci_dev(dev)->bus->self->sysdata;
-
- if (!translate_phb(to_pci_dev(dev)))
- return;
-
- spin_lock_irqsave(&tbl->it_lock, flags);
-
- __calgary_unmap_sg(tbl, sglist, nelems, direction);
-
- spin_unlock_irqrestore(&tbl->it_lock, flags);
-}
-
static int calgary_nontranslate_map_sg(struct device* dev,
struct scatterlist *sg, int nelems, int direction)
{
int i;
- for (i = 0; i < nelems; i++ ) {
+ for (i = 0; i < nelems; i++ ) {
struct scatterlist *s = &sg[i];
BUG_ON(!s->page);
s->dma_address = virt_to_bus(page_address(s->page) +s->offset);
@@ -385,11 +417,10 @@ static int calgary_nontranslate_map_sg(struct device* dev,
return nelems;
}
-int calgary_map_sg(struct device *dev, struct scatterlist *sg,
+static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
int nelems, int direction)
{
- struct iommu_table *tbl = to_pci_dev(dev)->bus->self->sysdata;
- unsigned long flags;
+ struct iommu_table *tbl = find_iommu_table(dev);
unsigned long vaddr;
unsigned int npages;
unsigned long entry;
@@ -398,8 +429,6 @@ int calgary_map_sg(struct device *dev, struct scatterlist *sg,
if (!translate_phb(to_pci_dev(dev)))
return calgary_nontranslate_map_sg(dev, sg, nelems, direction);
- spin_lock_irqsave(&tbl->it_lock, flags);
-
for (i = 0; i < nelems; i++ ) {
struct scatterlist *s = &sg[i];
BUG_ON(!s->page);
@@ -423,26 +452,23 @@ int calgary_map_sg(struct device *dev, struct scatterlist *sg,
s->dma_length = s->length;
}
- spin_unlock_irqrestore(&tbl->it_lock, flags);
-
return nelems;
error:
- __calgary_unmap_sg(tbl, sg, nelems, direction);
+ calgary_unmap_sg(dev, sg, nelems, direction);
for (i = 0; i < nelems; i++) {
sg[i].dma_address = bad_dma_address;
sg[i].dma_length = 0;
}
- spin_unlock_irqrestore(&tbl->it_lock, flags);
return 0;
}
-dma_addr_t calgary_map_single(struct device *dev, void *vaddr,
+static dma_addr_t calgary_map_single(struct device *dev, void *vaddr,
size_t size, int direction)
{
dma_addr_t dma_handle = bad_dma_address;
unsigned long uaddr;
unsigned int npages;
- struct iommu_table *tbl = to_pci_dev(dev)->bus->self->sysdata;
+ struct iommu_table *tbl = find_iommu_table(dev);
uaddr = (unsigned long)vaddr;
npages = num_dma_pages(uaddr, size);
@@ -455,10 +481,10 @@ dma_addr_t calgary_map_single(struct device *dev, void *vaddr,
return dma_handle;
}
-void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle,
+static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle,
size_t size, int direction)
{
- struct iommu_table *tbl = to_pci_dev(dev)->bus->self->sysdata;
+ struct iommu_table *tbl = find_iommu_table(dev);
unsigned int npages;
if (!translate_phb(to_pci_dev(dev)))
@@ -468,15 +494,13 @@ void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle,
iommu_free(tbl, dma_handle, npages);
}
-void* calgary_alloc_coherent(struct device *dev, size_t size,
+static void* calgary_alloc_coherent(struct device *dev, size_t size,
dma_addr_t *dma_handle, gfp_t flag)
{
void *ret = NULL;
dma_addr_t mapping;
unsigned int npages, order;
- struct iommu_table *tbl;
-
- tbl = to_pci_dev(dev)->bus->self->sysdata;
+ struct iommu_table *tbl = find_iommu_table(dev);
size = PAGE_ALIGN(size); /* size rounded up to full pages */
npages = size >> PAGE_SHIFT;
@@ -552,7 +576,22 @@ static inline void __iomem* calgary_reg(void __iomem *bar, unsigned long offset)
return (void __iomem*)target;
}
-static void tce_cache_blast(struct iommu_table *tbl)
+static inline int is_calioc2(unsigned short device)
+{
+ return (device == PCI_DEVICE_ID_IBM_CALIOC2);
+}
+
+static inline int is_calgary(unsigned short device)
+{
+ return (device == PCI_DEVICE_ID_IBM_CALGARY);
+}
+
+static inline int is_cal_pci_dev(unsigned short device)
+{
+ return (is_calgary(device) || is_calioc2(device));
+}
+
+static void calgary_tce_cache_blast(struct iommu_table *tbl)
{
u64 val;
u32 aer;
@@ -589,6 +628,85 @@ static void tce_cache_blast(struct iommu_table *tbl)
(void)readl(target); /* flush */
}
+static void calioc2_tce_cache_blast(struct iommu_table *tbl)
+{
+ void __iomem *bbar = tbl->bbar;
+ void __iomem *target;
+ u64 val64;
+ u32 val;
+ int i = 0;
+ int count = 1;
+ unsigned char bus = tbl->it_busno;
+
+begin:
+ printk(KERN_DEBUG "Calgary: CalIOC2 bus 0x%x entering tce cache blast "
+ "sequence - count %d\n", bus, count);
+
+ /* 1. using the Page Migration Control reg set SoftStop */
+ target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL);
+ val = be32_to_cpu(readl(target));
+ printk(KERN_DEBUG "1a. read 0x%x [LE] from %p\n", val, target);
+ val |= PMR_SOFTSTOP;
+ printk(KERN_DEBUG "1b. writing 0x%x [LE] to %p\n", val, target);
+ writel(cpu_to_be32(val), target);
+
+ /* 2. poll split queues until all DMA activity is done */
+ printk(KERN_DEBUG "2a. starting to poll split queues\n");
+ target = calgary_reg(bbar, split_queue_offset(bus));
+ do {
+ val64 = readq(target);
+ i++;
+ } while ((val64 & 0xff) != 0xff && i < 100);
+ if (i == 100)
+ printk(KERN_WARNING "CalIOC2: PCI bus not quiesced, "
+ "continuing anyway\n");
+
+ /* 3. poll Page Migration DEBUG for SoftStopFault */
+ target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_DEBUG);
+ val = be32_to_cpu(readl(target));
+ printk(KERN_DEBUG "3. read 0x%x [LE] from %p\n", val, target);
+
+ /* 4. if SoftStopFault - goto (1) */
+ if (val & PMR_SOFTSTOPFAULT) {
+ if (++count < 100)
+ goto begin;
+ else {
+ printk(KERN_WARNING "CalIOC2: too many SoftStopFaults, "
+ "aborting TCE cache flush sequence!\n");
+ return; /* pray for the best */
+ }
+ }
+
+ /* 5. Slam into HardStop by reading PHB_PAGE_MIG_CTRL */
+ target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL);
+ printk(KERN_DEBUG "5a. slamming into HardStop by reading %p\n", target);
+ val = be32_to_cpu(readl(target));
+ printk(KERN_DEBUG "5b. read 0x%x [LE] from %p\n", val, target);
+ target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_DEBUG);
+ val = be32_to_cpu(readl(target));
+ printk(KERN_DEBUG "5c. read 0x%x [LE] from %p (debug)\n", val, target);
+
+ /* 6. invalidate TCE cache */
+ printk(KERN_DEBUG "6. invalidating TCE cache\n");
+ target = calgary_reg(bbar, tar_offset(bus));
+ writeq(tbl->tar_val, target);
+
+ /* 7. Re-read PMCR */
+ printk(KERN_DEBUG "7a. Re-reading PMCR\n");
+ target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL);
+ val = be32_to_cpu(readl(target));
+ printk(KERN_DEBUG "7b. read 0x%x [LE] from %p\n", val, target);
+
+ /* 8. Remove HardStop */
+ printk(KERN_DEBUG "8a. removing HardStop from PMCR\n");
+ target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL);
+ val = 0;
+ printk(KERN_DEBUG "8b. writing 0x%x [LE] to %p\n", val, target);
+ writel(cpu_to_be32(val), target);
+ val = be32_to_cpu(readl(target));
+ printk(KERN_DEBUG "8c. read 0x%x [LE] from %p\n", val, target);
+}
+
static void __init calgary_reserve_mem_region(struct pci_dev *dev, u64 start,
u64 limit)
{
@@ -598,7 +716,7 @@ static void __init calgary_reserve_mem_region(struct pci_dev *dev, u64 start,
limit++;
numpages = ((limit - start) >> PAGE_SHIFT);
- iommu_range_reserve(dev->sysdata, start, numpages);
+ iommu_range_reserve(pci_iommu(dev->bus), start, numpages);
}
static void __init calgary_reserve_peripheral_mem_1(struct pci_dev *dev)
@@ -606,7 +724,7 @@ static void __init calgary_reserve_peripheral_mem_1(struct pci_dev *dev)
void __iomem *target;
u64 low, high, sizelow;
u64 start, limit;
- struct iommu_table *tbl = dev->sysdata;
+ struct iommu_table *tbl = pci_iommu(dev->bus);
unsigned char busnum = dev->bus->number;
void __iomem *bbar = tbl->bbar;
@@ -630,7 +748,7 @@ static void __init calgary_reserve_peripheral_mem_2(struct pci_dev *dev)
u32 val32;
u64 low, high, sizelow, sizehigh;
u64 start, limit;
- struct iommu_table *tbl = dev->sysdata;
+ struct iommu_table *tbl = pci_iommu(dev->bus);
unsigned char busnum = dev->bus->number;
void __iomem *bbar = tbl->bbar;
@@ -666,14 +784,20 @@ static void __init calgary_reserve_regions(struct pci_dev *dev)
{
unsigned int npages;
u64 start;
- struct iommu_table *tbl = dev->sysdata;
+ struct iommu_table *tbl = pci_iommu(dev->bus);
/* reserve EMERGENCY_PAGES from bad_dma_address and up */
iommu_range_reserve(tbl, bad_dma_address, EMERGENCY_PAGES);
/* avoid the BIOS/VGA first 640KB-1MB region */
- start = (640 * 1024);
- npages = ((1024 - 640) * 1024) >> PAGE_SHIFT;
+ /* for CalIOC2 - avoid the entire first MB */
+ if (is_calgary(dev->device)) {
+ start = (640 * 1024);
+ npages = ((1024 - 640) * 1024) >> PAGE_SHIFT;
+ } else { /* calioc2 */
+ start = 0;
+ npages = (1 * 1024 * 1024) >> PAGE_SHIFT;
+ }
iommu_range_reserve(tbl, start, npages);
/* reserve the two PCI peripheral memory regions in IO space */
@@ -694,10 +818,17 @@ static int __init calgary_setup_tar(struct pci_dev *dev, void __iomem *bbar)
if (ret)
return ret;
- tbl = dev->sysdata;
+ tbl = pci_iommu(dev->bus);
tbl->it_base = (unsigned long)bus_info[dev->bus->number].tce_space;
tce_free(tbl, 0, tbl->it_size);
+ if (is_calgary(dev->device))
+ tbl->chip_ops = &calgary_chip_ops;
+ else if (is_calioc2(dev->device))
+ tbl->chip_ops = &calioc2_chip_ops;
+ else
+ BUG();
+
calgary_reserve_regions(dev);
/* set TARs for each PHB */
@@ -706,15 +837,15 @@ static int __init calgary_setup_tar(struct pci_dev *dev, void __iomem *bbar)
/* zero out all TAR bits under sw control */
val64 &= ~TAR_SW_BITS;
-
- tbl = dev->sysdata;
table_phys = (u64)__pa(tbl->it_base);
+
val64 |= table_phys;
BUG_ON(specified_table_size > TCE_TABLE_SIZE_8M);
val64 |= (u64) specified_table_size;
tbl->tar_val = cpu_to_be64(val64);
+
writeq(tbl->tar_val, target);
readq(target); /* flush */
@@ -724,7 +855,7 @@ static int __init calgary_setup_tar(struct pci_dev *dev, void __iomem *bbar)
static void __init calgary_free_bus(struct pci_dev *dev)
{
u64 val64;
- struct iommu_table *tbl = dev->sysdata;
+ struct iommu_table *tbl = pci_iommu(dev->bus);
void __iomem *target;
unsigned int bitmapsz;
@@ -739,16 +870,81 @@ static void __init calgary_free_bus(struct pci_dev *dev)
tbl->it_map = NULL;
kfree(tbl);
- dev->sysdata = NULL;
+
+ set_pci_iommu(dev->bus, NULL);
/* Can't free bootmem allocated memory after system is up :-( */
bus_info[dev->bus->number].tce_space = NULL;
}
+static void calgary_dump_error_regs(struct iommu_table *tbl)
+{
+ void __iomem *bbar = tbl->bbar;
+ void __iomem *target;
+ u32 csr, plssr;
+
+ target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_CSR_OFFSET);
+ csr = be32_to_cpu(readl(target));
+
+ target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_PLSSR_OFFSET);
+ plssr = be32_to_cpu(readl(target));
+
+ /* If no error, the agent ID in the CSR is not valid */
+ printk(KERN_EMERG "Calgary: DMA error on Calgary PHB 0x%x, "
+ "0x%08x@CSR 0x%08x@PLSSR\n", tbl->it_busno, csr, plssr);
+}
+
+static void calioc2_dump_error_regs(struct iommu_table *tbl)
+{
+ void __iomem *bbar = tbl->bbar;
+ u32 csr, csmr, plssr, mck, rcstat;
+ void __iomem *target;
+ unsigned long phboff = phb_offset(tbl->it_busno);
+ unsigned long erroff;
+ u32 errregs[7];
+ int i;
+
+ /* dump CSR */
+ target = calgary_reg(bbar, phboff | PHB_CSR_OFFSET);
+ csr = be32_to_cpu(readl(target));
+ /* dump PLSSR */
+ target = calgary_reg(bbar, phboff | PHB_PLSSR_OFFSET);
+ plssr = be32_to_cpu(readl(target));
+ /* dump CSMR */
+ target = calgary_reg(bbar, phboff | 0x290);
+ csmr = be32_to_cpu(readl(target));
+ /* dump mck */
+ target = calgary_reg(bbar, phboff | 0x800);
+ mck = be32_to_cpu(readl(target));
+
+ printk(KERN_EMERG "Calgary: DMA error on CalIOC2 PHB 0x%x\n",
+ tbl->it_busno);
+
+ printk(KERN_EMERG "Calgary: 0x%08x@CSR 0x%08x@PLSSR 0x%08x@CSMR 0x%08x@MCK\n",
+ csr, plssr, csmr, mck);
+
+ /* dump rest of error regs */
+ printk(KERN_EMERG "Calgary: ");
+ for (i = 0; i < ARRAY_SIZE(errregs); i++) {
+ /* err regs are at 0x810 - 0x870 */
+ erroff = (0x810 + (i * 0x10));
+ target = calgary_reg(bbar, phboff | erroff);
+ errregs[i] = be32_to_cpu(readl(target));
+ printk("0x%08x@0x%lx ", errregs[i], erroff);
+ }
+ printk("\n");
+
+ /* root complex status */
+ target = calgary_reg(bbar, phboff | PHB_ROOT_COMPLEX_STATUS);
+ rcstat = be32_to_cpu(readl(target));
+ printk(KERN_EMERG "Calgary: 0x%08x@0x%x\n", rcstat,
+ PHB_ROOT_COMPLEX_STATUS);
+}
+
static void calgary_watchdog(unsigned long data)
{
struct pci_dev *dev = (struct pci_dev *)data;
- struct iommu_table *tbl = dev->sysdata;
+ struct iommu_table *tbl = pci_iommu(dev->bus);
void __iomem *bbar = tbl->bbar;
u32 val32;
void __iomem *target;
@@ -758,13 +954,14 @@ static void calgary_watchdog(unsigned long data)
/* If no error, the agent ID in the CSR is not valid */
if (val32 & CSR_AGENT_MASK) {
- printk(KERN_EMERG "calgary_watchdog: DMA error on PHB %#x, "
- "CSR = %#x\n", dev->bus->number, val32);
+ tbl->chip_ops->dump_error_regs(tbl);
+
+ /* reset error */
writel(0, target);
/* Disable bus that caused the error */
target = calgary_reg(bbar, phb_offset(tbl->it_busno) |
- PHB_CONFIG_RW_OFFSET);
+ PHB_CONFIG_RW_OFFSET);
val32 = be32_to_cpu(readl(target));
val32 |= PHB_SLOT_DISABLE;
writel(cpu_to_be32(val32), target);
@@ -775,8 +972,8 @@ static void calgary_watchdog(unsigned long data)
}
}
-static void __init calgary_increase_split_completion_timeout(void __iomem *bbar,
- unsigned char busnum)
+static void __init calgary_set_split_completion_timeout(void __iomem *bbar,
+ unsigned char busnum, unsigned long timeout)
{
u64 val64;
void __iomem *target;
@@ -802,11 +999,40 @@ static void __init calgary_increase_split_completion_timeout(void __iomem *bbar,
/* zero out this PHB's timer bits */
mask = ~(0xFUL << phb_shift);
val64 &= mask;
- val64 |= (CCR_2SEC_TIMEOUT << phb_shift);
+ val64 |= (timeout << phb_shift);
writeq(cpu_to_be64(val64), target);
readq(target); /* flush */
}
+static void calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev)
+{
+ unsigned char busnum = dev->bus->number;
+ void __iomem *bbar = tbl->bbar;
+ void __iomem *target;
+ u32 val;
+
+ /*
+ * CalIOC2 designers recommend setting bit 8 in 0xnDB0 to 1
+ */
+ target = calgary_reg(bbar, phb_offset(busnum) | PHB_SAVIOR_L2);
+ val = cpu_to_be32(readl(target));
+ val |= 0x00800000;
+ writel(cpu_to_be32(val), target);
+}
+
+static void calgary_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev)
+{
+ unsigned char busnum = dev->bus->number;
+
+ /*
+ * Give split completion a longer timeout on bus 1 for aic94xx
+ * http://bugzilla.kernel.org/show_bug.cgi?id=7180
+ */
+ if (is_calgary(dev->device) && (busnum == 1))
+ calgary_set_split_completion_timeout(tbl->bbar, busnum,
+ CCR_2SEC_TIMEOUT);
+}
+
static void __init calgary_enable_translation(struct pci_dev *dev)
{
u32 val32;
@@ -816,7 +1042,7 @@ static void __init calgary_enable_translation(struct pci_dev *dev)
struct iommu_table *tbl;
busnum = dev->bus->number;
- tbl = dev->sysdata;
+ tbl = pci_iommu(dev->bus);
bbar = tbl->bbar;
/* enable TCE in PHB Config Register */
@@ -824,20 +1050,15 @@ static void __init calgary_enable_translation(struct pci_dev *dev)
val32 = be32_to_cpu(readl(target));
val32 |= PHB_TCE_ENABLE | PHB_DAC_DISABLE | PHB_MCSR_ENABLE;
- printk(KERN_INFO "Calgary: enabling translation on PHB %#x\n", busnum);
+ printk(KERN_INFO "Calgary: enabling translation on %s PHB %#x\n",
+ (dev->device == PCI_DEVICE_ID_IBM_CALGARY) ?
+ "Calgary" : "CalIOC2", busnum);
printk(KERN_INFO "Calgary: errant DMAs will now be prevented on this "
"bus.\n");
writel(cpu_to_be32(val32), target);
readl(target); /* flush */
- /*
- * Give split completion a longer timeout on bus 1 for aic94xx
- * http://bugzilla.kernel.org/show_bug.cgi?id=7180
- */
- if (busnum == 1)
- calgary_increase_split_completion_timeout(bbar, busnum);
-
init_timer(&tbl->watchdog_timer);
tbl->watchdog_timer.function = &calgary_watchdog;
tbl->watchdog_timer.data = (unsigned long)dev;
@@ -853,7 +1074,7 @@ static void __init calgary_disable_translation(struct pci_dev *dev)
struct iommu_table *tbl;
busnum = dev->bus->number;
- tbl = dev->sysdata;
+ tbl = pci_iommu(dev->bus);
bbar = tbl->bbar;
/* disable TCE in PHB Config Register */
@@ -871,13 +1092,19 @@ static void __init calgary_disable_translation(struct pci_dev *dev)
static void __init calgary_init_one_nontraslated(struct pci_dev *dev)
{
pci_dev_get(dev);
- dev->sysdata = NULL;
- dev->bus->self = dev;
+ set_pci_iommu(dev->bus, NULL);
+
+ /* is the device behind a bridge? */
+ if (dev->bus->parent)
+ dev->bus->parent->self = dev;
+ else
+ dev->bus->self = dev;
}
static int __init calgary_init_one(struct pci_dev *dev)
{
void __iomem *bbar;
+ struct iommu_table *tbl;
int ret;
BUG_ON(dev->bus->number >= MAX_PHB_BUS_NUM);
@@ -888,7 +1115,18 @@ static int __init calgary_init_one(struct pci_dev *dev)
goto done;
pci_dev_get(dev);
- dev->bus->self = dev;
+
+ if (dev->bus->parent) {
+ if (dev->bus->parent->self)
+ printk(KERN_WARNING "Calgary: IEEEE, dev %p has "
+ "bus->parent->self!\n", dev);
+ dev->bus->parent->self = dev;
+ } else
+ dev->bus->self = dev;
+
+ tbl = pci_iommu(dev->bus);
+ tbl->chip_ops->handle_quirks(tbl, dev);
+
calgary_enable_translation(dev);
return 0;
@@ -924,11 +1162,18 @@ static int __init calgary_locate_bbars(void)
target = calgary_reg(bbar, offset);
val = be32_to_cpu(readl(target));
+
start_bus = (u8)((val & 0x00FF0000) >> 16);
end_bus = (u8)((val & 0x0000FF00) >> 8);
- for (bus = start_bus; bus <= end_bus; bus++) {
- bus_info[bus].bbar = bbar;
- bus_info[bus].phbid = phb;
+
+ if (end_bus) {
+ for (bus = start_bus; bus <= end_bus; bus++) {
+ bus_info[bus].bbar = bbar;
+ bus_info[bus].phbid = phb;
+ }
+ } else {
+ bus_info[start_bus].bbar = bbar;
+ bus_info[start_bus].phbid = phb;
}
}
}
@@ -948,22 +1193,24 @@ static int __init calgary_init(void)
{
int ret;
struct pci_dev *dev = NULL;
+ void *tce_space;
ret = calgary_locate_bbars();
if (ret)
return ret;
do {
- dev = pci_get_device(PCI_VENDOR_ID_IBM,
- PCI_DEVICE_ID_IBM_CALGARY,
- dev);
+ dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev);
if (!dev)
break;
+ if (!is_cal_pci_dev(dev->device))
+ continue;
if (!translate_phb(dev)) {
calgary_init_one_nontraslated(dev);
continue;
}
- if (!bus_info[dev->bus->number].tce_space && !translate_empty_slots)
+ tce_space = bus_info[dev->bus->number].tce_space;
+ if (!tce_space && !translate_empty_slots)
continue;
ret = calgary_init_one(dev);
@@ -976,10 +1223,11 @@ static int __init calgary_init(void)
error:
do {
dev = pci_get_device_reverse(PCI_VENDOR_ID_IBM,
- PCI_DEVICE_ID_IBM_CALGARY,
- dev);
+ PCI_ANY_ID, dev);
if (!dev)
break;
+ if (!is_cal_pci_dev(dev->device))
+ continue;
if (!translate_phb(dev)) {
pci_dev_put(dev);
continue;
@@ -1057,9 +1305,29 @@ static int __init build_detail_arrays(void)
return 0;
}
-void __init detect_calgary(void)
+static int __init calgary_bus_has_devices(int bus, unsigned short pci_dev)
{
+ int dev;
u32 val;
+
+ if (pci_dev == PCI_DEVICE_ID_IBM_CALIOC2) {
+ /*
+ * FIXME: properly scan for devices across the
+ * PCI-to-PCI bridge on every CalIOC2 port.
+ */
+ return 1;
+ }
+
+ for (dev = 1; dev < 8; dev++) {
+ val = read_pci_config(bus, dev, 0, 0);
+ if (val != 0xffffffff)
+ break;
+ }
+ return (val != 0xffffffff);
+}
+
+void __init detect_calgary(void)
+{
int bus;
void *tbl;
int calgary_found = 0;
@@ -1116,29 +1384,26 @@ void __init detect_calgary(void)
specified_table_size = determine_tce_table_size(end_pfn * PAGE_SIZE);
for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) {
- int dev;
struct calgary_bus_info *info = &bus_info[bus];
+ unsigned short pci_device;
+ u32 val;
+
+ val = read_pci_config(bus, 0, 0, 0);
+ pci_device = (val & 0xFFFF0000) >> 16;
- if (read_pci_config(bus, 0, 0, 0) != PCI_VENDOR_DEVICE_ID_CALGARY)
+ if (!is_cal_pci_dev(pci_device))
continue;
if (info->translation_disabled)
continue;
- /*
- * Scan the slots of the PCI bus to see if there is a device present.
- * The parent bus will be the zero-ith device, so start at 1.
- */
- for (dev = 1; dev < 8; dev++) {
- val = read_pci_config(bus, dev, 0, 0);
- if (val != 0xffffffff || translate_empty_slots) {
- tbl = alloc_tce_table();
- if (!tbl)
- goto cleanup;
- info->tce_space = tbl;
- calgary_found = 1;
- break;
- }
+ if (calgary_bus_has_devices(bus, pci_device) ||
+ translate_empty_slots) {
+ tbl = alloc_tce_table();
+ if (!tbl)
+ goto cleanup;
+ info->tce_space = tbl;
+ calgary_found = 1;
}
}
@@ -1249,3 +1514,66 @@ static int __init calgary_parse_options(char *p)
return 1;
}
__setup("calgary=", calgary_parse_options);
+
+static void __init calgary_fixup_one_tce_space(struct pci_dev *dev)
+{
+ struct iommu_table *tbl;
+ unsigned int npages;
+ int i;
+
+ tbl = pci_iommu(dev->bus);
+
+ for (i = 0; i < 4; i++) {
+ struct resource *r = &dev->resource[PCI_BRIDGE_RESOURCES + i];
+
+ /* Don't give out TCEs that map MEM resources */
+ if (!(r->flags & IORESOURCE_MEM))
+ continue;
+
+ /* 0-based? we reserve the whole 1st MB anyway */
+ if (!r->start)
+ continue;
+
+ /* cover the whole region */
+ npages = (r->end - r->start) >> PAGE_SHIFT;
+ npages++;
+
+ iommu_range_reserve(tbl, r->start, npages);
+ }
+}
+
+static int __init calgary_fixup_tce_spaces(void)
+{
+ struct pci_dev *dev = NULL;
+ void *tce_space;
+
+ if (no_iommu || swiotlb || !calgary_detected)
+ return -ENODEV;
+
+ printk(KERN_DEBUG "Calgary: fixing up tce spaces\n");
+
+ do {
+ dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev);
+ if (!dev)
+ break;
+ if (!is_cal_pci_dev(dev->device))
+ continue;
+ if (!translate_phb(dev))
+ continue;
+
+ tce_space = bus_info[dev->bus->number].tce_space;
+ if (!tce_space)
+ continue;
+
+ calgary_fixup_one_tce_space(dev);
+
+ } while (1);
+
+ return 0;
+}
+
+/*
+ * We need to be called after pcibios_assign_resources (fs_initcall level)
+ * and before device_initcall.
+ */
+rootfs_initcall(calgary_fixup_tce_spaces);
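
The CalIOC2 flush added above is essentially two bounded loops: poll the split-queue register until outstanding DMA drains (give up after 100 reads), then restart the whole sequence whenever a SoftStopFault shows up, again with a 100-attempt cap. A minimal stand-alone sketch of that control flow follows; the simulated register reads and the PMR_SOFTSTOPFAULT value are stand-ins for the real MMIO accesses, not kernel APIs.

#include <stdio.h>
#include <stdint.h>

/* Simulated register reads standing in for the readl()/readq() MMIO accesses
 * in calioc2_tce_cache_blast(); the values are made up so the sketch runs. */
static int poll_count;

static uint64_t read_split_queue(void)
{
	return (++poll_count >= 3) ? 0xff : 0x00;	/* "drains" after a few polls */
}

static uint32_t read_page_mig_debug(void)
{
	return 0;					/* pretend no SoftStopFault */
}

#define PMR_SOFTSTOPFAULT 0x40000000u			/* placeholder bit, not the real layout */

/* Control flow of the quiesce step: bounded inner poll, bounded outer retry. */
static int quiesce_for_tce_flush(void)
{
	int attempts = 0;

	do {
		int polls = 0;

		/* wait for outstanding DMA to drain, but never spin forever */
		while ((read_split_queue() & 0xff) != 0xff && ++polls < 100)
			;
		if (polls == 100)
			fprintf(stderr, "bus not quiesced, continuing anyway\n");

		/* a SoftStopFault means DMA slipped in; redo the whole sequence */
		if (!(read_page_mig_debug() & PMR_SOFTSTOPFAULT))
			return 0;	/* quiesced, safe to invalidate the TCE cache */
	} while (++attempts < 100);

	fprintf(stderr, "too many SoftStopFaults, aborting flush\n");
	return -1;
}

int main(void)
{
	printf("quiesce returned %d after %d polls\n", quiesce_for_tce_flush(), poll_count);
	return 0;
}
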
diff --git a/arch/x86_64/kernel/pci-dma.c b/arch/x86_64/kernel/pci-dma.c
index 9f80aad3fe2..05d745ede56 100644
--- a/arch/x86_64/kernel/pci-dma.c
+++ b/arch/x86_64/kernel/pci-dma.c
@@ -8,7 +8,7 @@
#include <linux/pci.h>
#include <linux/module.h>
#include <asm/io.h>
-#include <asm/proto.h>
+#include <asm/iommu.h>
#include <asm/calgary.h>
int iommu_merge __read_mostly = 0;
@@ -22,8 +22,7 @@ EXPORT_SYMBOL(bad_dma_address);
int iommu_bio_merge __read_mostly = 0;
EXPORT_SYMBOL(iommu_bio_merge);
-int iommu_sac_force __read_mostly = 0;
-EXPORT_SYMBOL(iommu_sac_force);
+static int iommu_sac_force __read_mostly = 0;
int no_iommu __read_mostly;
#ifdef CONFIG_IOMMU_DEBUG
@@ -322,6 +321,11 @@ static int __init pci_iommu_init(void)
return 0;
}
+void pci_iommu_shutdown(void)
+{
+ gart_iommu_shutdown();
+}
+
#ifdef CONFIG_PCI
/* Many VIA bridges seem to corrupt data for DAC. Disable it here */
diff --git a/arch/x86_64/kernel/pci-gart.c b/arch/x86_64/kernel/pci-gart.c
index ae091cdc1a4..4918c575d58 100644
--- a/arch/x86_64/kernel/pci-gart.c
+++ b/arch/x86_64/kernel/pci-gart.c
@@ -28,6 +28,7 @@
#include <asm/mtrr.h>
#include <asm/pgtable.h>
#include <asm/proto.h>
+#include <asm/iommu.h>
#include <asm/cacheflush.h>
#include <asm/swiotlb.h>
#include <asm/dma.h>
@@ -235,7 +236,7 @@ static dma_addr_t gart_map_simple(struct device *dev, char *buf,
}
/* Map a single area into the IOMMU */
-dma_addr_t gart_map_single(struct device *dev, void *addr, size_t size, int dir)
+static dma_addr_t gart_map_single(struct device *dev, void *addr, size_t size, int dir)
{
unsigned long phys_mem, bus;
@@ -253,7 +254,7 @@ dma_addr_t gart_map_single(struct device *dev, void *addr, size_t size, int dir)
/*
* Free a DMA mapping.
*/
-void gart_unmap_single(struct device *dev, dma_addr_t dma_addr,
+static void gart_unmap_single(struct device *dev, dma_addr_t dma_addr,
size_t size, int direction)
{
unsigned long iommu_page;
@@ -275,7 +276,7 @@ void gart_unmap_single(struct device *dev, dma_addr_t dma_addr,
/*
* Wrapper for pci_unmap_single working with scatterlists.
*/
-void gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
+static void gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
{
int i;
@@ -571,6 +572,26 @@ static const struct dma_mapping_ops gart_dma_ops = {
.unmap_sg = gart_unmap_sg,
};
+void gart_iommu_shutdown(void)
+{
+ struct pci_dev *dev;
+ int i;
+
+ if (no_agp && (dma_ops != &gart_dma_ops))
+ return;
+
+ for (i = 0; i < num_k8_northbridges; i++) {
+ u32 ctl;
+
+ dev = k8_northbridges[i];
+ pci_read_config_dword(dev, 0x90, &ctl);
+
+ ctl &= ~1;
+
+ pci_write_config_dword(dev, 0x90, ctl);
+ }
+}
+
void __init gart_iommu_init(void)
{
struct agp_kern_info info;
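
The new gart_iommu_shutdown() is a plain read-modify-write of one PCI config dword per K8 northbridge, clearing the GART enable bit so no stale translations survive into the next kernel. A sketch of that pattern with an in-memory stand-in for config space (the real code goes through pci_read_config_dword()/pci_write_config_dword() on k8_northbridges[i]); the GART_EN name below is assumed from the ctl &= ~1 in the patch.

#include <stdio.h>

#define GART_APERTURE_CTL	0x90	/* config-space offset used in the patch */
#define GART_EN			0x1	/* assumed name for the bit cleared by ctl &= ~1 */

/* Fake config space for two "northbridges". */
static unsigned int fake_cfg[2] = { 0x0000a0b1, 0x0000a0b1 };

static unsigned int cfg_read(int nb, int off)		{ (void)off; return fake_cfg[nb]; }
static void cfg_write(int nb, int off, unsigned int v)	{ (void)off; fake_cfg[nb] = v; }

static void gart_shutdown_sketch(int num_northbridges)
{
	for (int i = 0; i < num_northbridges; i++) {
		unsigned int ctl = cfg_read(i, GART_APERTURE_CTL);

		ctl &= ~GART_EN;	/* disable GART translation */
		cfg_write(i, GART_APERTURE_CTL, ctl);
	}
}

int main(void)
{
	gart_shutdown_sketch(2);
	printf("nb0 ctl=%#x, nb1 ctl=%#x\n", fake_cfg[0], fake_cfg[1]);
	return 0;
}
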
diff --git a/arch/x86_64/kernel/pci-nommu.c b/arch/x86_64/kernel/pci-nommu.c
index 6dade0c867c..2a34c6c025a 100644
--- a/arch/x86_64/kernel/pci-nommu.c
+++ b/arch/x86_64/kernel/pci-nommu.c
@@ -6,7 +6,7 @@
#include <linux/string.h>
#include <linux/dma-mapping.h>
-#include <asm/proto.h>
+#include <asm/iommu.h>
#include <asm/processor.h>
#include <asm/dma.h>
@@ -34,7 +34,7 @@ nommu_map_single(struct device *hwdev, void *ptr, size_t size,
return bus;
}
-void nommu_unmap_single(struct device *dev, dma_addr_t addr,size_t size,
+static void nommu_unmap_single(struct device *dev, dma_addr_t addr,size_t size,
int direction)
{
}
@@ -54,7 +54,7 @@ void nommu_unmap_single(struct device *dev, dma_addr_t addr,size_t size,
* Device ownership issues as mentioned above for pci_map_single are
* the same here.
*/
-int nommu_map_sg(struct device *hwdev, struct scatterlist *sg,
+static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg,
int nents, int direction)
{
int i;
@@ -74,7 +74,7 @@ int nommu_map_sg(struct device *hwdev, struct scatterlist *sg,
* Again, cpu read rules concerning calls here are the same as for
* pci_unmap_single() above.
*/
-void nommu_unmap_sg(struct device *dev, struct scatterlist *sg,
+static void nommu_unmap_sg(struct device *dev, struct scatterlist *sg,
int nents, int dir)
{
}
diff --git a/arch/x86_64/kernel/pci-swiotlb.c b/arch/x86_64/kernel/pci-swiotlb.c
index 4b4569abc60..b2f405ea7c8 100644
--- a/arch/x86_64/kernel/pci-swiotlb.c
+++ b/arch/x86_64/kernel/pci-swiotlb.c
@@ -5,7 +5,7 @@
#include <linux/module.h>
#include <linux/dma-mapping.h>
-#include <asm/proto.h>
+#include <asm/iommu.h>
#include <asm/swiotlb.h>
#include <asm/dma.h>
diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c
index 5909039f37a..e7ac629d4c4 100644
--- a/arch/x86_64/kernel/process.c
+++ b/arch/x86_64/kernel/process.c
@@ -207,6 +207,7 @@ void cpu_idle (void)
if (__get_cpu_var(cpu_idle_state))
__get_cpu_var(cpu_idle_state) = 0;
+ check_pgt_cache();
rmb();
idle = pm_idle;
if (!idle)
@@ -278,7 +279,7 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
*/
if (!pm_idle) {
if (!printed) {
- printk("using mwait in idle threads.\n");
+ printk(KERN_INFO "using mwait in idle threads.\n");
printed = 1;
}
pm_idle = mwait_idle;
@@ -305,6 +306,7 @@ early_param("idle", idle_setup);
void __show_regs(struct pt_regs * regs)
{
unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
+ unsigned long d0, d1, d2, d3, d6, d7;
unsigned int fsindex,gsindex;
unsigned int ds,cs,es;
@@ -340,15 +342,24 @@ void __show_regs(struct pt_regs * regs)
rdmsrl(MSR_GS_BASE, gs);
rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
- asm("movq %%cr0, %0": "=r" (cr0));
- asm("movq %%cr2, %0": "=r" (cr2));
- asm("movq %%cr3, %0": "=r" (cr3));
- asm("movq %%cr4, %0": "=r" (cr4));
+ cr0 = read_cr0();
+ cr2 = read_cr2();
+ cr3 = read_cr3();
+ cr4 = read_cr4();
printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
fs,fsindex,gs,gsindex,shadowgs);
printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0);
printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);
+
+ get_debugreg(d0, 0);
+ get_debugreg(d1, 1);
+ get_debugreg(d2, 2);
+ printk("DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
+ get_debugreg(d3, 3);
+ get_debugreg(d6, 6);
+ get_debugreg(d7, 7);
+ printk("DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}
void show_regs(struct pt_regs *regs)
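
__show_regs() now uses the read_crN() accessors instead of open-coded asm. On x86-64 such an accessor is little more than a constrained inline-asm wrapper; a sketch along those lines is below (illustrative only, the kernel's own definitions live in the asm headers, and reading a control register is privileged so this faults if called from user space).

/* Sketch of what a read_crN() accessor looks like on x86-64. */
static inline unsigned long sketch_read_cr0(void)
{
	unsigned long val;

	asm volatile("movq %%cr0, %0" : "=r" (val));
	return val;
}

static inline unsigned long sketch_read_cr2(void)
{
	unsigned long val;

	asm volatile("movq %%cr2, %0" : "=r" (val));
	return val;
}
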
diff --git a/arch/x86_64/kernel/ptrace.c b/arch/x86_64/kernel/ptrace.c
index 9409117b9f1..e83cc67155a 100644
--- a/arch/x86_64/kernel/ptrace.c
+++ b/arch/x86_64/kernel/ptrace.c
@@ -102,16 +102,25 @@ unsigned long convert_rip_to_linear(struct task_struct *child, struct pt_regs *r
u32 *desc;
unsigned long base;
- down(&child->mm->context.sem);
- desc = child->mm->context.ldt + (seg & ~7);
- base = (desc[0] >> 16) | ((desc[1] & 0xff) << 16) | (desc[1] & 0xff000000);
+ seg &= ~7UL;
- /* 16-bit code segment? */
- if (!((desc[1] >> 22) & 1))
- addr &= 0xffff;
- addr += base;
+ down(&child->mm->context.sem);
+ if (unlikely((seg >> 3) >= child->mm->context.size))
+ addr = -1L; /* bogus selector, access would fault */
+ else {
+ desc = child->mm->context.ldt + seg;
+ base = ((desc[0] >> 16) |
+ ((desc[1] & 0xff) << 16) |
+ (desc[1] & 0xff000000));
+
+ /* 16-bit code segment? */
+ if (!((desc[1] >> 22) & 1))
+ addr &= 0xffff;
+ addr += base;
+ }
up(&child->mm->context.sem);
}
+
return addr;
}
@@ -313,17 +322,9 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
switch (request) {
/* when I and D space are separate, these will need to be fixed. */
case PTRACE_PEEKTEXT: /* read word at location addr. */
- case PTRACE_PEEKDATA: {
- unsigned long tmp;
- int copied;
-
- copied = access_process_vm(child, addr, &tmp, sizeof(tmp), 0);
- ret = -EIO;
- if (copied != sizeof(tmp))
- break;
- ret = put_user(tmp,(unsigned long __user *) data);
+ case PTRACE_PEEKDATA:
+ ret = generic_ptrace_peekdata(child, addr, data);
break;
- }
/* read the word at location addr in the USER area. */
case PTRACE_PEEKUSR: {
@@ -367,10 +368,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
/* when I and D space are separate, this will have to be fixed. */
case PTRACE_POKETEXT: /* write the word at location addr. */
case PTRACE_POKEDATA:
- ret = 0;
- if (access_process_vm(child, addr, &data, sizeof(data), 1) == sizeof(data))
- break;
- ret = -EIO;
+ ret = generic_ptrace_pokedata(child, addr, data);
break;
case PTRACE_POKEUSR: /* write the word at location addr in the USER area */
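
convert_rip_to_linear() rebuilds the segment base from the two 32-bit words of an LDT descriptor and truncates the instruction pointer for 16-bit code segments; the new bounds check simply rejects selectors past context.size before doing so. The decoding itself, as a small stand-alone function with a made-up descriptor:

#include <stdio.h>
#include <stdint.h>

/* Recover the 32-bit base address packed into an x86 segment descriptor,
 * exactly as convert_rip_to_linear() does for LDT code segments:
 *   desc[0] bits 31..16 -> base 15..0
 *   desc[1] bits  7..0  -> base 23..16
 *   desc[1] bits 31..24 -> base 31..24
 */
static uint32_t descriptor_base(const uint32_t desc[2])
{
	return (desc[0] >> 16) |
	       ((desc[1] & 0xffu) << 16) |
	       (desc[1] & 0xff000000u);
}

/* The D/B flag (bit 22 of the high word) distinguishes 16-bit code segments,
 * whose instruction pointer must be truncated to 16 bits before adding the base. */
static int is_16bit_code(const uint32_t desc[2])
{
	return !((desc[1] >> 22) & 1);
}

int main(void)
{
	/* made-up descriptor with base 0x12345678 and the D bit set (32-bit) */
	uint32_t desc[2] = { 0x5678ffff, 0x12cf9a34 };

	printf("base=%#x 16bit=%d\n", descriptor_base(desc), is_16bit_code(desc));
	return 0;
}
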
diff --git a/arch/x86_64/kernel/reboot.c b/arch/x86_64/kernel/reboot.c
index 7503068e788..368db2b9c5a 100644
--- a/arch/x86_64/kernel/reboot.c
+++ b/arch/x86_64/kernel/reboot.c
@@ -16,6 +16,7 @@
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/apic.h>
+#include <asm/iommu.h>
/*
* Power off function, if any
@@ -81,6 +82,7 @@ static inline void kb_wait(void)
void machine_shutdown(void)
{
unsigned long flags;
+
/* Stop the cpus and apics */
#ifdef CONFIG_SMP
int reboot_cpu_id;
@@ -111,6 +113,8 @@ void machine_shutdown(void)
disable_IO_APIC();
local_irq_restore(flags);
+
+ pci_iommu_shutdown();
}
void machine_emergency_restart(void)
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index eb6524f3ac2..af838f6b0b7 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -575,6 +575,8 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
level = cpuid_eax(1);
if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58))
set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
+ if (c->x86 == 0x10)
+ set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
/* Enable workaround for FXSAVE leak */
if (c->x86 >= 6)
@@ -600,8 +602,14 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
if (c->extended_cpuid_level >= 0x80000008)
amd_detect_cmp(c);
- /* Fix cpuid4 emulation for more */
- num_cache_leaves = 3;
+ if (c->extended_cpuid_level >= 0x80000006 &&
+ (cpuid_edx(0x80000006) & 0xf000))
+ num_cache_leaves = 4;
+ else
+ num_cache_leaves = 3;
+
+ if (c->x86 == 0xf || c->x86 == 0x10 || c->x86 == 0x11)
+ set_bit(X86_FEATURE_K8, &c->x86_capability);
/* RDTSC can be speculated around */
clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
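
The new cache-leaf logic above keys off CPUID leaf 0x80000006: the 0xf000 mask in EDX covers the L3 associativity field, and a non-zero value means the part has an L3, so one extra cpuid4-style leaf is emulated. A user-space check of the same condition (a sketch, using the compiler's cpuid.h helper):

#include <stdio.h>
#include <cpuid.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;
	int cache_leaves = 3;

	/* CPUID 0x80000006 EDX reports L3 details; zero associativity -> no L3 */
	if (__get_cpuid(0x80000006, &eax, &ebx, &ecx, &edx) && (edx & 0xf000))
		cache_leaves = 4;	/* L2 + L3 -> one extra emulated leaf */

	printf("emulated cache leaves: %d\n", cache_leaves);
	return 0;
}
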
@@ -846,6 +854,8 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
c->x86_capability[2] = cpuid_edx(0x80860001);
}
+ init_scattered_cpuid_features(c);
+
c->apicid = phys_pkg_id(0);
/*
@@ -931,7 +941,7 @@ static int show_cpuinfo(struct seq_file *m, void *v)
"fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce",
"cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov",
"pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx",
- "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", NULL,
+ "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", "pbe",
/* AMD-defined */
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
@@ -947,10 +957,11 @@ static int show_cpuinfo(struct seq_file *m, void *v)
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
/* Other (Linux-defined) */
- "cxmmx", NULL, "cyrix_arr", "centaur_mcr", NULL,
- "constant_tsc", NULL, NULL,
- "up", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ "cxmmx", "k6_mtrr", "cyrix_arr", "centaur_mcr",
+ NULL, NULL, NULL, NULL,
+ "constant_tsc", "up", NULL, "arch_perfmon",
+ "pebs", "bts", NULL, "sync_rdtsc",
+ "rep_good", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
/* Intel-defined (#2) */
@@ -961,7 +972,7 @@ static int show_cpuinfo(struct seq_file *m, void *v)
/* VIA/Cyrix/Centaur-defined */
NULL, NULL, "rng", "rng_en", NULL, NULL, "ace", "ace_en",
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ "ace2", "ace2_en", "phe", "phe_en", "pmm", "pmm_en", NULL, NULL,
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
@@ -972,6 +983,12 @@ static int show_cpuinfo(struct seq_file *m, void *v)
"osvw", "ibs", NULL, NULL, NULL, NULL,
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+
+ /* Auxiliary (Linux-defined) */
+ "ida", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
};
static char *x86_power_flags[] = {
"ts", /* temperature sensor */
diff --git a/arch/x86_64/kernel/signal.c b/arch/x86_64/kernel/signal.c
index 290f5d8037c..739175b01e0 100644
--- a/arch/x86_64/kernel/signal.c
+++ b/arch/x86_64/kernel/signal.c
@@ -26,6 +26,7 @@
#include <asm/i387.h>
#include <asm/proto.h>
#include <asm/ia32_unistd.h>
+#include <asm/mce.h>
/* #define DEBUG_SIG 1 */
@@ -472,6 +473,12 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
clear_thread_flag(TIF_SINGLESTEP);
}
+#ifdef CONFIG_X86_MCE
+ /* notify userspace of pending MCEs */
+ if (thread_info_flags & _TIF_MCE_NOTIFY)
+ mce_notify_user();
+#endif /* CONFIG_X86_MCE */
+
/* deal with pending signal delivery */
if (thread_info_flags & (_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK))
do_signal(regs);
@@ -480,7 +487,7 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
{
struct task_struct *me = current;
- if (exception_trace)
+ if (show_unhandled_signals && printk_ratelimit())
printk("%s[%d] bad frame in %s frame:%p rip:%lx rsp:%lx orax:%lx\n",
me->comm,me->pid,where,frame,regs->rip,regs->rsp,regs->orig_rax);
diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c
index 2ff46859162..673a300b594 100644
--- a/arch/x86_64/kernel/smp.c
+++ b/arch/x86_64/kernel/smp.c
@@ -241,7 +241,7 @@ void flush_tlb_mm (struct mm_struct * mm)
}
if (!cpus_empty(cpu_mask))
flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
-
+ check_pgt_cache();
preempt_enable();
}
EXPORT_SYMBOL(flush_tlb_mm);
@@ -357,7 +357,7 @@ __smp_call_function_single(int cpu, void (*func) (void *info), void *info,
}
/*
- * smp_call_function_single - Run a function on another CPU
+ * smp_call_function_single - Run a function on a specific CPU
* @func: The function to run. This must be fast and non-blocking.
* @info: An arbitrary pointer to pass to the function.
* @nonatomic: Currently unused.
@@ -374,17 +374,21 @@ int smp_call_function_single (int cpu, void (*func) (void *info), void *info,
{
/* prevent preemption and reschedule on another processor */
int me = get_cpu();
+
+ /* Can deadlock when called with interrupts disabled */
+ WARN_ON(irqs_disabled());
+
if (cpu == me) {
+ local_irq_disable();
+ func(info);
+ local_irq_enable();
put_cpu();
return 0;
}
- /* Can deadlock when called with interrupts disabled */
- WARN_ON(irqs_disabled());
-
- spin_lock_bh(&call_lock);
+ spin_lock(&call_lock);
__smp_call_function_single(cpu, func, info, nonatomic, wait);
- spin_unlock_bh(&call_lock);
+ spin_unlock(&call_lock);
put_cpu();
return 0;
}
diff --git a/arch/x86_64/kernel/suspend.c b/arch/x86_64/kernel/suspend.c
index 6a5a98f2a75..ea83a9f9196 100644
--- a/arch/x86_64/kernel/suspend.c
+++ b/arch/x86_64/kernel/suspend.c
@@ -55,11 +55,11 @@ void __save_processor_state(struct saved_context *ctxt)
* control registers
*/
rdmsrl(MSR_EFER, ctxt->efer);
- asm volatile ("movq %%cr0, %0" : "=r" (ctxt->cr0));
- asm volatile ("movq %%cr2, %0" : "=r" (ctxt->cr2));
- asm volatile ("movq %%cr3, %0" : "=r" (ctxt->cr3));
- asm volatile ("movq %%cr4, %0" : "=r" (ctxt->cr4));
- asm volatile ("movq %%cr8, %0" : "=r" (ctxt->cr8));
+ ctxt->cr0 = read_cr0();
+ ctxt->cr2 = read_cr2();
+ ctxt->cr3 = read_cr3();
+ ctxt->cr4 = read_cr4();
+ ctxt->cr8 = read_cr8();
}
void save_processor_state(void)
@@ -81,11 +81,11 @@ void __restore_processor_state(struct saved_context *ctxt)
* control registers
*/
wrmsrl(MSR_EFER, ctxt->efer);
- asm volatile ("movq %0, %%cr8" :: "r" (ctxt->cr8));
- asm volatile ("movq %0, %%cr4" :: "r" (ctxt->cr4));
- asm volatile ("movq %0, %%cr3" :: "r" (ctxt->cr3));
- asm volatile ("movq %0, %%cr2" :: "r" (ctxt->cr2));
- asm volatile ("movq %0, %%cr0" :: "r" (ctxt->cr0));
+ write_cr8(ctxt->cr8);
+ write_cr4(ctxt->cr4);
+ write_cr3(ctxt->cr3);
+ write_cr2(ctxt->cr2);
+ write_cr0(ctxt->cr0);
/*
* now restore the descriptor tables to their proper values
diff --git a/arch/x86_64/kernel/tce.c b/arch/x86_64/kernel/tce.c
index f61fb8e4f12..3aeae2fa2e2 100644
--- a/arch/x86_64/kernel/tce.c
+++ b/arch/x86_64/kernel/tce.c
@@ -136,9 +136,9 @@ int build_tce_table(struct pci_dev *dev, void __iomem *bbar)
struct iommu_table *tbl;
int ret;
- if (dev->sysdata) {
- printk(KERN_ERR "Calgary: dev %p has sysdata %p\n",
- dev, dev->sysdata);
+ if (pci_iommu(dev->bus)) {
+ printk(KERN_ERR "Calgary: dev %p has sysdata->iommu %p\n",
+ dev, pci_iommu(dev->bus));
BUG();
}
@@ -155,11 +155,7 @@ int build_tce_table(struct pci_dev *dev, void __iomem *bbar)
tbl->bbar = bbar;
- /*
- * NUMA is already using the bus's sysdata pointer, so we use
- * the bus's pci_dev's sysdata instead.
- */
- dev->sysdata = tbl;
+ set_pci_iommu(dev->bus, tbl);
return 0;
diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c
index 4a0895bacf5..6d48a4e826d 100644
--- a/arch/x86_64/kernel/time.c
+++ b/arch/x86_64/kernel/time.c
@@ -33,6 +33,7 @@
#include <acpi/acpi_bus.h>
#endif
#include <asm/8253pit.h>
+#include <asm/i8253.h>
#include <asm/pgtable.h>
#include <asm/vsyscall.h>
#include <asm/timex.h>
@@ -44,12 +45,14 @@
#include <asm/hpet.h>
#include <asm/mpspec.h>
#include <asm/nmi.h>
+#include <asm/vgtod.h>
static char *timename = NULL;
DEFINE_SPINLOCK(rtc_lock);
EXPORT_SYMBOL(rtc_lock);
DEFINE_SPINLOCK(i8253_lock);
+EXPORT_SYMBOL(i8253_lock);
volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
@@ -79,8 +82,9 @@ EXPORT_SYMBOL(profile_pc);
* sheet for details.
*/
-static void set_rtc_mmss(unsigned long nowtime)
+static int set_rtc_mmss(unsigned long nowtime)
{
+ int retval = 0;
int real_seconds, real_minutes, cmos_minutes;
unsigned char control, freq_select;
@@ -120,6 +124,7 @@ static void set_rtc_mmss(unsigned long nowtime)
if (abs(real_minutes - cmos_minutes) >= 30) {
printk(KERN_WARNING "time.c: can't update CMOS clock "
"from %d to %d\n", cmos_minutes, real_minutes);
+ retval = -1;
} else {
BIN_TO_BCD(real_seconds);
BIN_TO_BCD(real_minutes);
@@ -139,12 +144,17 @@ static void set_rtc_mmss(unsigned long nowtime)
CMOS_WRITE(freq_select, RTC_FREQ_SELECT);
spin_unlock(&rtc_lock);
+
+ return retval;
}
+int update_persistent_clock(struct timespec now)
+{
+ return set_rtc_mmss(now.tv_sec);
+}
void main_timer_handler(void)
{
- static unsigned long rtc_update = 0;
/*
* Here we are in the timer irq handler. We have irqs locally disabled (so we
* don't need spin_lock_irqsave()) but we don't know if the timer_bh is running
@@ -172,20 +182,6 @@ void main_timer_handler(void)
if (!using_apic_timer)
smp_local_timer_interrupt();
-/*
- * If we have an externally synchronized Linux clock, then update CMOS clock
- * accordingly every ~11 minutes. set_rtc_mmss() will be called in the jiffy
- * closest to exactly 500 ms before the next second. If the update fails, we
- * don't care, as it'll be updated on the next turn, and the problem (time way
- * off) isn't likely to go away much sooner anyway.
- */
-
- if (ntp_synced() && xtime.tv_sec > rtc_update &&
- abs(xtime.tv_nsec - 500000000) <= tick_nsec / 2) {
- set_rtc_mmss(xtime.tv_sec);
- rtc_update = xtime.tv_sec + 660;
- }
-
write_sequnlock(&xtime_lock);
}
@@ -199,7 +195,7 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id)
return IRQ_HANDLED;
}
-static unsigned long get_cmos_time(void)
+unsigned long read_persistent_clock(void)
{
unsigned int year, mon, day, hour, min, sec;
unsigned long flags;
@@ -226,7 +222,7 @@ static unsigned long get_cmos_time(void)
/*
* We know that x86-64 always uses BCD format, no need to check the
* config register.
- */
+ */
BCD_TO_BIN(sec);
BCD_TO_BIN(min);
@@ -239,11 +235,11 @@ static unsigned long get_cmos_time(void)
BCD_TO_BIN(century);
year += century * 100;
printk(KERN_INFO "Extended CMOS year: %d\n", century * 100);
- } else {
+ } else {
/*
* x86-64 systems only exists since 2002.
* This will work up to Dec 31, 2100
- */
+ */
year += 2000;
}
@@ -255,45 +251,45 @@ static unsigned long get_cmos_time(void)
#define TICK_COUNT 100000000
static unsigned int __init tsc_calibrate_cpu_khz(void)
{
- int tsc_start, tsc_now;
- int i, no_ctr_free;
- unsigned long evntsel3 = 0, pmc3 = 0, pmc_now = 0;
- unsigned long flags;
-
- for (i = 0; i < 4; i++)
- if (avail_to_resrv_perfctr_nmi_bit(i))
- break;
- no_ctr_free = (i == 4);
- if (no_ctr_free) {
- i = 3;
- rdmsrl(MSR_K7_EVNTSEL3, evntsel3);
- wrmsrl(MSR_K7_EVNTSEL3, 0);
- rdmsrl(MSR_K7_PERFCTR3, pmc3);
- } else {
- reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i);
- reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
- }
- local_irq_save(flags);
- /* start meauring cycles, incrementing from 0 */
- wrmsrl(MSR_K7_PERFCTR0 + i, 0);
- wrmsrl(MSR_K7_EVNTSEL0 + i, 1 << 22 | 3 << 16 | 0x76);
- rdtscl(tsc_start);
- do {
- rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now);
- tsc_now = get_cycles_sync();
- } while ((tsc_now - tsc_start) < TICK_COUNT);
-
- local_irq_restore(flags);
- if (no_ctr_free) {
- wrmsrl(MSR_K7_EVNTSEL3, 0);
- wrmsrl(MSR_K7_PERFCTR3, pmc3);
- wrmsrl(MSR_K7_EVNTSEL3, evntsel3);
- } else {
- release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
- release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
- }
-
- return pmc_now * tsc_khz / (tsc_now - tsc_start);
+ int tsc_start, tsc_now;
+ int i, no_ctr_free;
+ unsigned long evntsel3 = 0, pmc3 = 0, pmc_now = 0;
+ unsigned long flags;
+
+ for (i = 0; i < 4; i++)
+ if (avail_to_resrv_perfctr_nmi_bit(i))
+ break;
+ no_ctr_free = (i == 4);
+ if (no_ctr_free) {
+ i = 3;
+ rdmsrl(MSR_K7_EVNTSEL3, evntsel3);
+ wrmsrl(MSR_K7_EVNTSEL3, 0);
+ rdmsrl(MSR_K7_PERFCTR3, pmc3);
+ } else {
+ reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i);
+ reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
+ }
+ local_irq_save(flags);
+ /* start measuring cycles, incrementing from 0 */
+ wrmsrl(MSR_K7_PERFCTR0 + i, 0);
+ wrmsrl(MSR_K7_EVNTSEL0 + i, 1 << 22 | 3 << 16 | 0x76);
+ rdtscl(tsc_start);
+ do {
+ rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now);
+ tsc_now = get_cycles_sync();
+ } while ((tsc_now - tsc_start) < TICK_COUNT);
+
+ local_irq_restore(flags);
+ if (no_ctr_free) {
+ wrmsrl(MSR_K7_EVNTSEL3, 0);
+ wrmsrl(MSR_K7_PERFCTR3, pmc3);
+ wrmsrl(MSR_K7_EVNTSEL3, evntsel3);
+ } else {
+ release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
+ release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
+ }
+
+ return pmc_now * tsc_khz / (tsc_now - tsc_start);
}
/*
@@ -321,7 +317,7 @@ static unsigned int __init pit_calibrate_tsc(void)
end = get_cycles_sync();
spin_unlock_irqrestore(&i8253_lock, flags);
-
+
return (end - start) / 50;
}
@@ -366,25 +362,20 @@ static struct irqaction irq0 = {
.handler = timer_interrupt,
.flags = IRQF_DISABLED | IRQF_IRQPOLL,
.mask = CPU_MASK_NONE,
- .name = "timer"
+ .name = "timer"
};
void __init time_init(void)
{
if (nohpet)
hpet_address = 0;
- xtime.tv_sec = get_cmos_time();
- xtime.tv_nsec = 0;
-
- set_normalized_timespec(&wall_to_monotonic,
- -xtime.tv_sec, -xtime.tv_nsec);
if (hpet_arch_init())
hpet_address = 0;
if (hpet_use_timer) {
/* set tick_nsec to use the proper rate for HPET */
- tick_nsec = TICK_NSEC_HPET;
+ tick_nsec = TICK_NSEC_HPET;
tsc_khz = hpet_calibrate_tsc();
timename = "HPET";
} else {
@@ -415,54 +406,21 @@ void __init time_init(void)
setup_irq(0, &irq0);
}
-
-static long clock_cmos_diff;
-static unsigned long sleep_start;
-
/*
* sysfs support for the timer.
*/
static int timer_suspend(struct sys_device *dev, pm_message_t state)
{
- /*
- * Estimate time zone so that set_time can update the clock
- */
- long cmos_time = get_cmos_time();
-
- clock_cmos_diff = -cmos_time;
- clock_cmos_diff += get_seconds();
- sleep_start = cmos_time;
return 0;
}
static int timer_resume(struct sys_device *dev)
{
- unsigned long flags;
- unsigned long sec;
- unsigned long ctime = get_cmos_time();
- long sleep_length = (ctime - sleep_start) * HZ;
-
- if (sleep_length < 0) {
- printk(KERN_WARNING "Time skew detected in timer resume!\n");
- /* The time after the resume must not be earlier than the time
- * before the suspend or some nasty things will happen
- */
- sleep_length = 0;
- ctime = sleep_start;
- }
if (hpet_address)
hpet_reenable();
else
i8254_timer_resume();
-
- sec = ctime + clock_cmos_diff;
- write_seqlock_irqsave(&xtime_lock,flags);
- xtime.tv_sec = sec;
- xtime.tv_nsec = 0;
- jiffies += sleep_length;
- write_sequnlock_irqrestore(&xtime_lock,flags);
- touch_softlockup_watchdog();
return 0;
}
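
Both set_rtc_mmss() and read_persistent_clock() above talk to the CMOS RTC, which stores every field in binary-coded decimal; that is all the BIN_TO_BCD()/BCD_TO_BIN() macros do. Plain-function equivalents with a round-trip check:

#include <stdio.h>

static unsigned int bin_to_bcd(unsigned int v)
{
	return ((v / 10) << 4) | (v % 10);
}

static unsigned int bcd_to_bin(unsigned int v)
{
	return (v & 0x0f) + (v >> 4) * 10;
}

int main(void)
{
	unsigned int minutes = 59;
	unsigned int bcd = bin_to_bcd(minutes);

	/* 59 decimal is stored as 0x59 in the RTC registers */
	printf("59 -> BCD %#x -> %u\n", bcd, bcd_to_bin(bcd));
	return 0;
}
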
diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
index aac1c0be54c..03888420775 100644
--- a/arch/x86_64/kernel/traps.c
+++ b/arch/x86_64/kernel/traps.c
@@ -34,6 +34,10 @@
#include <linux/bug.h>
#include <linux/kdebug.h>
+#if defined(CONFIG_EDAC)
+#include <linux/edac.h>
+#endif
+
#include <asm/system.h>
#include <asm/io.h>
#include <asm/atomic.h>
@@ -330,6 +334,7 @@ static int print_trace_stack(void *data, char *name)
static void print_trace_address(void *data, unsigned long addr)
{
+ touch_nmi_watchdog();
printk_address(addr);
}
@@ -518,6 +523,7 @@ void __kprobes __die(const char * str, struct pt_regs * regs, long err)
printk("\n");
notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV);
show_registers(regs);
+ add_taint(TAINT_DIE);
/* Executive summary in case the oops scrolled away */
printk(KERN_ALERT "RIP ");
printk_address(regs->rip);
@@ -531,7 +537,7 @@ void die(const char * str, struct pt_regs * regs, long err)
unsigned long flags = oops_begin();
if (!user_mode(regs))
- report_bug(regs->rip);
+ report_bug(regs->rip, regs);
__die(str, regs, err);
oops_end(flags);
@@ -578,7 +584,8 @@ static void __kprobes do_trap(int trapnr, int signr, char *str,
tsk->thread.error_code = error_code;
tsk->thread.trap_no = trapnr;
- if (exception_trace && unhandled_signal(tsk, signr))
+ if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
+ printk_ratelimit())
printk(KERN_INFO
"%s[%d] trap %s rip:%lx rsp:%lx error:%lx\n",
tsk->comm, tsk->pid, str,
@@ -682,7 +689,8 @@ asmlinkage void __kprobes do_general_protection(struct pt_regs * regs,
tsk->thread.error_code = error_code;
tsk->thread.trap_no = 13;
- if (exception_trace && unhandled_signal(tsk, SIGSEGV))
+ if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
+ printk_ratelimit())
printk(KERN_INFO
"%s[%d] general protection rip:%lx rsp:%lx error:%lx\n",
tsk->comm, tsk->pid,
@@ -717,6 +725,13 @@ mem_parity_error(unsigned char reason, struct pt_regs * regs)
reason);
printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n");
+#if defined(CONFIG_EDAC)
+ if(edac_handler_set()) {
+ edac_atomic_assert_error();
+ return;
+ }
+#endif
+
if (panic_on_unrecovered_nmi)
panic("NMI: Not continuing");
diff --git a/arch/x86_64/kernel/tsc.c b/arch/x86_64/kernel/tsc.c
index 48f9a8e6aa9..9b76b03d060 100644
--- a/arch/x86_64/kernel/tsc.c
+++ b/arch/x86_64/kernel/tsc.c
@@ -44,7 +44,7 @@ unsigned long long sched_clock(void)
static int tsc_unstable;
-static inline int check_tsc_unstable(void)
+inline int check_tsc_unstable(void)
{
return tsc_unstable;
}
@@ -61,25 +61,9 @@ static inline int check_tsc_unstable(void)
* first tick after the change will be slightly wrong.
*/
-#include <linux/workqueue.h>
-
-static unsigned int cpufreq_delayed_issched = 0;
-static unsigned int cpufreq_init = 0;
-static struct work_struct cpufreq_delayed_get_work;
-
-static void handle_cpufreq_delayed_get(struct work_struct *v)
-{
- unsigned int cpu;
- for_each_online_cpu(cpu) {
- cpufreq_get(cpu);
- }
- cpufreq_delayed_issched = 0;
-}
-
-static unsigned int ref_freq = 0;
-static unsigned long loops_per_jiffy_ref = 0;
-
-static unsigned long tsc_khz_ref = 0;
+static unsigned int ref_freq;
+static unsigned long loops_per_jiffy_ref;
+static unsigned long tsc_khz_ref;
static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
void *data)
@@ -125,10 +109,8 @@ static struct notifier_block time_cpufreq_notifier_block = {
static int __init cpufreq_tsc(void)
{
- INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get);
- if (!cpufreq_register_notifier(&time_cpufreq_notifier_block,
- CPUFREQ_TRANSITION_NOTIFIER))
- cpufreq_init = 1;
+ cpufreq_register_notifier(&time_cpufreq_notifier_block,
+ CPUFREQ_TRANSITION_NOTIFIER);
return 0;
}
@@ -153,17 +135,18 @@ __cpuinit int unsynchronized_tsc(void)
#endif
/* Most intel systems have synchronized TSCs except for
multi node systems */
- if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
#ifdef CONFIG_ACPI
/* But TSC doesn't tick in C3 so don't use it there */
- if (acpi_gbl_FADT.header.length > 0 && acpi_gbl_FADT.C3latency < 1000)
+ if (acpi_gbl_FADT.header.length > 0 &&
+ acpi_gbl_FADT.C3latency < 1000)
return 1;
#endif
- return 0;
+ return 0;
}
- /* Assume multi socket systems are not synchronized */
- return num_present_cpus() > 1;
+ /* Assume multi socket systems are not synchronized */
+ return num_present_cpus() > 1;
}
int __init notsc_setup(char *s)
diff --git a/arch/x86_64/kernel/verify_cpu.S b/arch/x86_64/kernel/verify_cpu.S
index e035f594819..45b6f8a975a 100644
--- a/arch/x86_64/kernel/verify_cpu.S
+++ b/arch/x86_64/kernel/verify_cpu.S
@@ -37,20 +37,6 @@ verify_cpu:
pushl $0 # Kill any dangerous flags
popfl
- /* minimum CPUID flags for x86-64 as defined by AMD */
-#define M(x) (1<<(x))
-#define M2(a,b) M(a)|M(b)
-#define M4(a,b,c,d) M(a)|M(b)|M(c)|M(d)
-
-#define SSE_MASK \
- (M2(X86_FEATURE_XMM,X86_FEATURE_XMM2))
-#define REQUIRED_MASK1 \
- (M4(X86_FEATURE_FPU,X86_FEATURE_PSE,X86_FEATURE_TSC,X86_FEATURE_MSR)|\
- M4(X86_FEATURE_PAE,X86_FEATURE_CX8,X86_FEATURE_PGE,X86_FEATURE_CMOV)|\
- M(X86_FEATURE_FXSR))
-#define REQUIRED_MASK2 \
- (M(X86_FEATURE_LM - 32))
-
pushfl # standard way to check for cpuid
popl %eax
movl %eax,%ebx
@@ -79,8 +65,8 @@ verify_cpu:
verify_cpu_noamd:
movl $0x1,%eax # Does the cpu have what it takes
cpuid
- andl $REQUIRED_MASK1,%edx
- xorl $REQUIRED_MASK1,%edx
+ andl $REQUIRED_MASK0,%edx
+ xorl $REQUIRED_MASK0,%edx
jnz verify_cpu_no_longmode
movl $0x80000000,%eax # See if extended cpuid is implemented
@@ -90,8 +76,8 @@ verify_cpu_noamd:
movl $0x80000001,%eax # Does the cpu have what it takes
cpuid
- andl $REQUIRED_MASK2,%edx
- xorl $REQUIRED_MASK2,%edx
+ andl $REQUIRED_MASK1,%edx
+ xorl $REQUIRED_MASK1,%edx
jnz verify_cpu_no_longmode
verify_cpu_sse_test:
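
The renamed REQUIRED_MASK0/REQUIRED_MASK1 checks in verify_cpu keep the same and/xor idiom: after the AND only the required features that are present remain, and the XOR with the mask then sets a bit for every required feature that is missing, so a non-zero result means long mode cannot be entered. The same test in C, against an illustrative subset of feature bits rather than the kernel's full masks:

#include <stdio.h>
#include <cpuid.h>

/* Nonzero return -> at least one required feature bit is missing from edx. */
static unsigned int missing_features(unsigned int edx, unsigned int required)
{
	return (edx & required) ^ required;
}

int main(void)
{
	unsigned int eax, ebx, ecx, edx;
	unsigned int required = (1u << 0) | (1u << 4) | (1u << 24);	/* FPU, TSC, FXSR */

	if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
		return 1;
	printf("missing mask: %#x\n", missing_features(edx, required));
	return 0;
}
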
diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S
index dbccfda8364..ba8ea97abd2 100644
--- a/arch/x86_64/kernel/vmlinux.lds.S
+++ b/arch/x86_64/kernel/vmlinux.lds.S
@@ -28,7 +28,7 @@ SECTIONS
_text = .; /* Text and read-only data */
.text : AT(ADDR(.text) - LOAD_OFFSET) {
/* First the code that has to be first for bootstrapping */
- *(.bootstrap.text)
+ *(.text.head)
_stext = .;
/* Then the rest */
TEXT_TEXT
@@ -48,10 +48,19 @@ SECTIONS
__ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { *(__ex_table) }
__stop___ex_table = .;
- BUG_TABLE
+ NOTES :text :note
+
+ BUG_TABLE :text
RODATA
+ . = ALIGN(4);
+ .tracedata : AT(ADDR(.tracedata) - LOAD_OFFSET) {
+ __tracedata_start = .;
+ *(.tracedata)
+ __tracedata_end = .;
+ }
+
. = ALIGN(PAGE_SIZE); /* Align data segment to page size boundary */
/* Data */
.data : AT(ADDR(.data) - LOAD_OFFSET) {
@@ -91,6 +100,9 @@ SECTIONS
.vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data))
{ *(.vsyscall_gtod_data) }
vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data);
+ .vsyscall_clock : AT(VLOAD(.vsyscall_clock))
+ { *(.vsyscall_clock) }
+ vsyscall_clock = VVIRT(.vsyscall_clock);
.vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1))
@@ -131,20 +143,11 @@ SECTIONS
/* might get freed after init */
. = ALIGN(4096);
__smp_alt_begin = .;
- __smp_alt_instructions = .;
- .smp_altinstructions : AT(ADDR(.smp_altinstructions) - LOAD_OFFSET) {
- *(.smp_altinstructions)
- }
- __smp_alt_instructions_end = .;
- . = ALIGN(8);
__smp_locks = .;
.smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
*(.smp_locks)
}
__smp_locks_end = .;
- .smp_altinstr_replacement : AT(ADDR(.smp_altinstr_replacement) - LOAD_OFFSET) {
- *(.smp_altinstr_replacement)
- }
. = ALIGN(4096);
__smp_alt_end = .;
@@ -187,6 +190,12 @@ SECTIONS
.exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) }
.exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { *(.exit.data) }
+/* vdso blob that is mapped into user space */
+ vdso_start = . ;
+ .vdso : AT(ADDR(.vdso) - LOAD_OFFSET) { *(.vdso) }
+ . = ALIGN(4096);
+ vdso_end = .;
+
#ifdef CONFIG_BLK_DEV_INITRD
. = ALIGN(4096);
__initramfs_start = .;
@@ -194,10 +203,8 @@ SECTIONS
__initramfs_end = .;
#endif
- . = ALIGN(4096);
- __per_cpu_start = .;
- .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { *(.data.percpu) }
- __per_cpu_end = .;
+ PERCPU(4096)
+
. = ALIGN(4096);
__init_end = .;
diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c
index 57660d58d50..06c34949bfd 100644
--- a/arch/x86_64/kernel/vsyscall.c
+++ b/arch/x86_64/kernel/vsyscall.c
@@ -42,6 +42,7 @@
#include <asm/segment.h>
#include <asm/desc.h>
#include <asm/topology.h>
+#include <asm/vgtod.h>
#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
#define __syscall_clobber "r11","rcx","memory"
@@ -57,26 +58,9 @@
* - writen by timer interrupt or systcl (/proc/sys/kernel/vsyscall64)
* Try to keep this structure as small as possible to avoid cache line ping pongs
*/
-struct vsyscall_gtod_data_t {
- seqlock_t lock;
-
- /* open coded 'struct timespec' */
- time_t wall_time_sec;
- u32 wall_time_nsec;
-
- int sysctl_enabled;
- struct timezone sys_tz;
- struct { /* extract of a clocksource struct */
- cycle_t (*vread)(void);
- cycle_t cycle_last;
- cycle_t mask;
- u32 mult;
- u32 shift;
- } clock;
-};
int __vgetcpu_mode __section_vgetcpu_mode;
-struct vsyscall_gtod_data_t __vsyscall_gtod_data __section_vsyscall_gtod_data =
+struct vsyscall_gtod_data __vsyscall_gtod_data __section_vsyscall_gtod_data =
{
.lock = SEQLOCK_UNLOCKED,
.sysctl_enabled = 1,
@@ -96,6 +80,8 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock)
vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
vsyscall_gtod_data.sys_tz = sys_tz;
+ vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
+ vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic;
write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
}
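
update_vsyscall() publishes the new wall time under the gtod seqlock writer; the vsyscall/vdso readers pair with it using the usual retry loop, rereading if the sequence was odd or changed underneath them. A minimal user-space sketch of that reader, with stand-in variables rather than the real vsyscall_gtod_data fields:

#include <stdio.h>

static volatile unsigned int gtod_seq;		/* odd while a writer is in progress */
static volatile long gtod_secs;

static long read_wall_secs(void)
{
	unsigned int seq;
	long secs;

	do {
		seq = gtod_seq;
		__sync_synchronize();
		secs = gtod_secs;
		__sync_synchronize();
	} while ((seq & 1) || seq != gtod_seq);	/* retry if a write raced us */

	return secs;
}

int main(void)
{
	gtod_secs = 1184000000;
	printf("secs=%ld\n", read_wall_secs());
	return 0;
}
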
diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c
index 635e58d443d..327c9f2fa62 100644
--- a/arch/x86_64/mm/fault.c
+++ b/arch/x86_64/mm/fault.c
@@ -159,7 +159,7 @@ void dump_pagetable(unsigned long address)
pmd_t *pmd;
pte_t *pte;
- asm("movq %%cr3,%0" : "=r" (pgd));
+ pgd = (pgd_t *)read_cr3();
pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
pgd += pgd_index(address);
@@ -221,16 +221,6 @@ static int is_errata93(struct pt_regs *regs, unsigned long address)
return 0;
}
-int unhandled_signal(struct task_struct *tsk, int sig)
-{
- if (is_init(tsk))
- return 1;
- if (tsk->ptrace & PT_PTRACED)
- return 0;
- return (tsk->sighand->action[sig-1].sa.sa_handler == SIG_IGN) ||
- (tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL);
-}
-
static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
unsigned long error_code)
{
@@ -301,8 +291,8 @@ static int vmalloc_fault(unsigned long address)
return 0;
}
-int page_fault_trace = 0;
-int exception_trace = 1;
+static int page_fault_trace;
+int show_unhandled_signals = 1;
/*
* This routine handles page faults. It determines the address,
@@ -317,7 +307,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
struct vm_area_struct * vma;
unsigned long address;
const struct exception_table_entry *fixup;
- int write;
+ int write, fault;
unsigned long flags;
siginfo_t info;
@@ -326,7 +316,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
prefetchw(&mm->mmap_sem);
/* get the address */
- __asm__("movq %%cr2,%0":"=r" (address));
+ address = read_cr2();
info.si_code = SEGV_MAPERR;
@@ -450,19 +440,18 @@ good_area:
* make sure we exit gracefully rather than endlessly redo
* the fault.
*/
- switch (handle_mm_fault(mm, vma, address, write)) {
- case VM_FAULT_MINOR:
- tsk->min_flt++;
- break;
- case VM_FAULT_MAJOR:
- tsk->maj_flt++;
- break;
- case VM_FAULT_SIGBUS:
- goto do_sigbus;
- default:
- goto out_of_memory;
+ fault = handle_mm_fault(mm, vma, address, write);
+ if (unlikely(fault & VM_FAULT_ERROR)) {
+ if (fault & VM_FAULT_OOM)
+ goto out_of_memory;
+ else if (fault & VM_FAULT_SIGBUS)
+ goto do_sigbus;
+ BUG();
}
-
+ if (fault & VM_FAULT_MAJOR)
+ tsk->maj_flt++;
+ else
+ tsk->min_flt++;
up_read(&mm->mmap_sem);
return;
@@ -495,7 +484,8 @@ bad_area_nosemaphore:
(address >> 32))
return;
- if (exception_trace && unhandled_signal(tsk, SIGSEGV)) {
+ if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
+ printk_ratelimit()) {
printk(
"%s%s[%d]: segfault at %016lx rip %016lx rsp %016lx error %lx\n",
tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
@@ -569,7 +559,7 @@ out_of_memory:
}
printk("VM: killing process %s\n", tsk->comm);
if (error_code & 4)
- do_exit(SIGKILL);
+ do_group_exit(SIGKILL);
goto no_context;
do_sigbus:
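
handle_mm_fault() now returns a bit mask instead of an enumerated code, which is why the switch in do_page_fault() becomes the VM_FAULT_ERROR/VM_FAULT_MAJOR flag tests above. A tiny sketch of the same decoding style, with made-up bit values rather than the kernel's:

#include <stdio.h>

#define FAULT_MAJOR	0x01
#define FAULT_OOM	0x02
#define FAULT_SIGBUS	0x04
#define FAULT_ERROR	(FAULT_OOM | FAULT_SIGBUS)

static const char *decode_fault(unsigned int fault)
{
	if (fault & FAULT_ERROR)
		return (fault & FAULT_OOM) ? "out of memory" : "SIGBUS";
	return (fault & FAULT_MAJOR) ? "major fault" : "minor fault";
}

int main(void)
{
	printf("%s\n", decode_fault(FAULT_MAJOR));
	printf("%s\n", decode_fault(FAULT_SIGBUS));
	return 0;
}
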
diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
index 9a0e98accf0..38f5d636800 100644
--- a/arch/x86_64/mm/init.c
+++ b/arch/x86_64/mm/init.c
@@ -383,7 +383,7 @@ void __meminit init_memory_mapping(unsigned long start, unsigned long end)
}
if (!after_bootmem)
- asm volatile("movq %%cr4,%0" : "=r" (mmu_cr4_features));
+ mmu_cr4_features = read_cr4();
__flush_tlb_all();
}
@@ -600,16 +600,6 @@ void mark_rodata_ro(void)
{
unsigned long start = (unsigned long)_stext, end;
-#ifdef CONFIG_HOTPLUG_CPU
- /* It must still be possible to apply SMP alternatives. */
- if (num_possible_cpus() > 1)
- start = (unsigned long)_etext;
-#endif
-
-#ifdef CONFIG_KPROBES
- start = (unsigned long)__start_rodata;
-#endif
-
end = (unsigned long)__end_rodata;
start = (start + PAGE_SIZE - 1) & PAGE_MASK;
end &= PAGE_MASK;
@@ -697,41 +687,6 @@ int kern_addr_valid(unsigned long addr)
return pfn_valid(pte_pfn(*pte));
}
-#ifdef CONFIG_SYSCTL
-#include <linux/sysctl.h>
-
-extern int exception_trace, page_fault_trace;
-
-static ctl_table debug_table2[] = {
- {
- .ctl_name = 99,
- .procname = "exception-trace",
- .data = &exception_trace,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- {}
-};
-
-static ctl_table debug_root_table2[] = {
- {
- .ctl_name = CTL_DEBUG,
- .procname = "debug",
- .mode = 0555,
- .child = debug_table2
- },
- {}
-};
-
-static __init int x8664_sysctl_init(void)
-{
- register_sysctl_table(debug_root_table2);
- return 0;
-}
-__initcall(x8664_sysctl_init);
-#endif
-
/* A pseudo VMA to allow ptrace access for the vsyscall page. This only
covers the 64bit vsyscall page now. 32bit has a real VMA now and does
not need special handling anymore. */
@@ -769,8 +724,17 @@ int in_gate_area_no_task(unsigned long addr)
return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
}
-void *alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size)
+void * __init alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size)
{
return __alloc_bootmem_core(pgdat->bdata, size,
SMP_CACHE_BYTES, (4UL*1024*1024*1024), 0);
}
+
+const char *arch_vma_name(struct vm_area_struct *vma)
+{
+ if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
+ return "[vdso]";
+ if (vma == &gate_vma)
+ return "[vsyscall]";
+ return NULL;
+}
diff --git a/arch/x86_64/mm/k8topology.c b/arch/x86_64/mm/k8topology.c
index f983c75825d..a96006f7ae0 100644
--- a/arch/x86_64/mm/k8topology.c
+++ b/arch/x86_64/mm/k8topology.c
@@ -44,12 +44,12 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
{
unsigned long prevbase;
struct bootnode nodes[8];
- int nodeid, i, nb;
+ int nodeid, i, j, nb;
unsigned char nodeids[8];
int found = 0;
u32 reg;
unsigned numnodes;
- unsigned dualcore = 0;
+ unsigned num_cores;
if (!early_pci_allowed())
return -1;
@@ -60,6 +60,9 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
printk(KERN_INFO "Scanning NUMA topology in Northbridge %d\n", nb);
+ num_cores = (cpuid_ecx(0x80000008) & 0xff) + 1;
+ printk(KERN_INFO "CPU has %d cores\n", num_cores);
+
reg = read_pci_config(0, nb, 0, 0x60);
numnodes = ((reg >> 4) & 0xF) + 1;
if (numnodes <= 1)
@@ -73,8 +76,6 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
unsigned long base,limit;
u32 nodeid;
- /* Undefined before E stepping, but hopefully 0 */
- dualcore |= ((read_pci_config(0, nb, 3, 0xe8) >> 12) & 3) == 1;
base = read_pci_config(0, nb, 1, 0x40 + i*8);
limit = read_pci_config(0, nb, 1, 0x44 + i*8);
@@ -170,8 +171,8 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
for (i = 0; i < 8; i++) {
if (nodes[i].start != nodes[i].end) {
nodeid = nodeids[i];
- apicid_to_node[nodeid << dualcore] = i;
- apicid_to_node[(nodeid << dualcore) + dualcore] = i;
+ for (j = 0; j < num_cores; j++)
+ apicid_to_node[(nodeid * num_cores) + j] = i;
setup_node_bootmem(i, nodes[i].start, nodes[i].end);
}
}
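The rewritten loop fills apicid_to_node[] for every core reported by CPUID 0x80000008 instead of assuming at most two. For example (illustrative values), with num_cores = 4 and nodeids[i] = 1:

	/* APIC IDs 4..7 all belong to memory node i */
	apicid_to_node[4] = i;
	apicid_to_node[5] = i;
	apicid_to_node[6] = i;
	apicid_to_node[7] = i;

which the old single-bit dualcore shift could not express for quad-core parts.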
diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c
index 51548947ad3..6da23552226 100644
--- a/arch/x86_64/mm/numa.c
+++ b/arch/x86_64/mm/numa.c
@@ -273,9 +273,6 @@ void __init numa_init_array(void)
#ifdef CONFIG_NUMA_EMU
/* Numa emulation */
-#define E820_ADDR_HOLE_SIZE(start, end) \
- (e820_hole_size((start) >> PAGE_SHIFT, (end) >> PAGE_SHIFT) << \
- PAGE_SHIFT)
char *cmdline __initdata;
/*
@@ -319,7 +316,7 @@ static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
return -1;
if (num_nodes > MAX_NUMNODES)
num_nodes = MAX_NUMNODES;
- size = (max_addr - *addr - E820_ADDR_HOLE_SIZE(*addr, max_addr)) /
+ size = (max_addr - *addr - e820_hole_size(*addr, max_addr)) /
num_nodes;
/*
* Calculate the number of big nodes that can be allocated as a result
@@ -347,7 +344,7 @@ static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
if (i == num_nodes + node_start - 1)
end = max_addr;
else
- while (end - *addr - E820_ADDR_HOLE_SIZE(*addr, end) <
+ while (end - *addr - e820_hole_size(*addr, end) <
size) {
end += FAKE_NODE_MIN_SIZE;
if (end > max_addr) {
@@ -476,18 +473,22 @@ out:
/*
* We need to vacate all active ranges that may have been registered by
- * SRAT.
+ * SRAT and set acpi_numa to -1 so that srat_disabled() always returns
+ * true. NUMA emulation has succeeded so we will not scan ACPI nodes.
*/
remove_all_active_ranges();
+#ifdef CONFIG_ACPI_NUMA
+ acpi_numa = -1;
+#endif
for_each_node_mask(i, node_possible_map) {
e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
nodes[i].end >> PAGE_SHIFT);
setup_node_bootmem(i, nodes[i].start, nodes[i].end);
}
+ acpi_fake_nodes(nodes, num_nodes);
numa_init_array();
return 0;
}
-#undef E820_ADDR_HOLE_SIZE
#endif /* CONFIG_NUMA_EMU */
void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
diff --git a/arch/x86_64/mm/pageattr.c b/arch/x86_64/mm/pageattr.c
index 9148f4a4cec..7e161c698af 100644
--- a/arch/x86_64/mm/pageattr.c
+++ b/arch/x86_64/mm/pageattr.c
@@ -13,7 +13,7 @@
#include <asm/tlbflush.h>
#include <asm/io.h>
-static inline pte_t *lookup_address(unsigned long address)
+pte_t *lookup_address(unsigned long address)
{
pgd_t *pgd = pgd_offset_k(address);
pud_t *pud;
@@ -74,14 +74,12 @@ static void flush_kernel_map(void *arg)
struct page *pg;
/* When clflush is available always use it because it is
- much cheaper than WBINVD. Disable clflush for now because
- the high level code is not ready yet */
- if (1 || !cpu_has_clflush)
+ much cheaper than WBINVD. */
+ if (!cpu_has_clflush)
asm volatile("wbinvd" ::: "memory");
else list_for_each_entry(pg, l, lru) {
void *adr = page_address(pg);
- if (cpu_has_clflush)
- cache_flush_page(adr);
+ cache_flush_page(adr);
}
__flush_tlb_all();
}
@@ -95,7 +93,8 @@ static LIST_HEAD(deferred_pages); /* protected by init_mm.mmap_sem */
static inline void save_page(struct page *fpage)
{
- list_add(&fpage->lru, &deferred_pages);
+ if (!test_and_set_bit(PG_arch_1, &fpage->flags))
+ list_add(&fpage->lru, &deferred_pages);
}
/*
@@ -129,9 +128,12 @@ __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot,
pte_t *kpte;
struct page *kpte_page;
pgprot_t ref_prot2;
+
kpte = lookup_address(address);
if (!kpte) return 0;
kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK);
+ BUG_ON(PageLRU(kpte_page));
+ BUG_ON(PageCompound(kpte_page));
if (pgprot_val(prot) != pgprot_val(ref_prot)) {
if (!pte_huge(*kpte)) {
set_pte(kpte, pfn_pte(pfn, prot));
@@ -159,10 +161,9 @@ __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot,
/* on x86-64 the direct mapping set at boot is not using 4k pages */
BUG_ON(PageReserved(kpte_page));
- if (page_private(kpte_page) == 0) {
- save_page(kpte_page);
+ save_page(kpte_page);
+ if (page_private(kpte_page) == 0)
revert_page(address, ref_prot);
- }
return 0;
}
@@ -234,6 +235,10 @@ void global_flush_tlb(void)
flush_map(&l);
list_for_each_entry_safe(pg, next, &l, lru) {
+ list_del(&pg->lru);
+ clear_bit(PG_arch_1, &pg->flags);
+ if (page_private(pg) != 0)
+ continue;
ClearPagePrivate(pg);
__free_page(pg);
}
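flush_kernel_map() now prefers CLFLUSH of the affected pages over a global WBINVD whenever the CPU supports it. cache_flush_page() is presumably a per-page CLFLUSH loop along these lines (a sketch, not the exact helper):

	/* Assumed shape of cache_flush_page(): clflush each cache line of one
	 * 4K page; x86_clflush_size is the line size reported by CPUID. */
	static void cache_flush_page_sketch(void *addr)
	{
		int i;

		for (i = 0; i < PAGE_SIZE; i += boot_cpu_data.x86_clflush_size)
			asm volatile("clflush (%0)" :: "r" ((char *)addr + i) : "memory");
	}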
diff --git a/arch/x86_64/mm/srat.c b/arch/x86_64/mm/srat.c
index 1e76bb0a727..acdf03e1914 100644
--- a/arch/x86_64/mm/srat.c
+++ b/arch/x86_64/mm/srat.c
@@ -106,9 +106,9 @@ static __init int slit_valid(struct acpi_table_slit *slit)
for (j = 0; j < d; j++) {
u8 val = slit->entry[d*i + j];
if (i == j) {
- if (val != 10)
+ if (val != LOCAL_DISTANCE)
return 0;
- } else if (val <= 10)
+ } else if (val <= LOCAL_DISTANCE)
return 0;
}
}
@@ -350,7 +350,7 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
/* Sanity check to catch more bad SRATs (they are amazingly common).
Make sure the PXMs cover all memory. */
-static int nodes_cover_memory(void)
+static int __init nodes_cover_memory(const struct bootnode *nodes)
{
int i;
unsigned long pxmram, e820ram;
@@ -394,6 +394,9 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
{
int i;
+ if (acpi_numa <= 0)
+ return -1;
+
/* First clean up the node list */
for (i = 0; i < MAX_NUMNODES; i++) {
cutoff_node(i, start, end);
@@ -403,10 +406,7 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
}
}
- if (acpi_numa <= 0)
- return -1;
-
- if (!nodes_cover_memory()) {
+ if (!nodes_cover_memory(nodes)) {
bad_srat();
return -1;
}
@@ -440,6 +440,86 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
return 0;
}
+#ifdef CONFIG_NUMA_EMU
+static int __init find_node_by_addr(unsigned long addr)
+{
+ int ret = NUMA_NO_NODE;
+ int i;
+
+ for_each_node_mask(i, nodes_parsed) {
+ /*
+ * Find the real node that this emulated node appears on. For
+ * the sake of simplicity, we only use a real node's starting
+ * address to determine which emulated node it appears on.
+ */
+ if (addr >= nodes[i].start && addr < nodes[i].end) {
+ ret = i;
+ break;
+ }
+ }
+ return ret;
+}
+
+/*
+ * In NUMA emulation, we need to setup proximity domain (_PXM) to node ID
+ * mappings that respect the real ACPI topology but reflect our emulated
+ * environment. For each emulated node, we find which real node it appears on
+ * and create PXM to NID mappings for those fake nodes which mirror that
+ * locality. SLIT will now represent the correct distances between emulated
+ * nodes as a result of the real topology.
+ */
+void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
+{
+ int i, j;
+ int fake_node_to_pxm_map[MAX_NUMNODES] = {
+ [0 ... MAX_NUMNODES-1] = PXM_INVAL
+ };
+ unsigned char fake_apicid_to_node[MAX_LOCAL_APIC] = {
+ [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
+ };
+
+ printk(KERN_INFO "Faking PXM affinity for fake nodes on real "
+ "topology.\n");
+ for (i = 0; i < num_nodes; i++) {
+ int nid, pxm;
+
+ nid = find_node_by_addr(fake_nodes[i].start);
+ if (nid == NUMA_NO_NODE)
+ continue;
+ pxm = node_to_pxm(nid);
+ if (pxm == PXM_INVAL)
+ continue;
+ fake_node_to_pxm_map[i] = pxm;
+ /*
+ * For each apicid_to_node mapping that exists for this real
+ * node, it must now point to the fake node ID.
+ */
+ for (j = 0; j < MAX_LOCAL_APIC; j++)
+ if (apicid_to_node[j] == nid)
+ fake_apicid_to_node[j] = i;
+ }
+ for (i = 0; i < num_nodes; i++)
+ __acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i);
+ memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node));
+
+ nodes_clear(nodes_parsed);
+ for (i = 0; i < num_nodes; i++)
+ if (fake_nodes[i].start != fake_nodes[i].end)
+ node_set(i, nodes_parsed);
+ WARN_ON(!nodes_cover_memory(fake_nodes));
+}
+
+static int null_slit_node_compare(int a, int b)
+{
+ return node_to_pxm(a) == node_to_pxm(b);
+}
+#else
+static int null_slit_node_compare(int a, int b)
+{
+ return a == b;
+}
+#endif /* CONFIG_NUMA_EMU */
+
void __init srat_reserve_add_area(int nodeid)
{
if (found_add_area && nodes_add[nodeid].end) {
@@ -464,7 +544,8 @@ int __node_distance(int a, int b)
int index;
if (!acpi_slit)
- return a == b ? 10 : 20;
+ return null_slit_node_compare(a, b) ? LOCAL_DISTANCE :
+ REMOTE_DISTANCE;
index = acpi_slit->locality_count * node_to_pxm(a);
return acpi_slit->entry[index + node_to_pxm(b)];
}
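Because acpi_fake_nodes() records which real proximity domain each emulated node came from, the SLIT-less fallback in __node_distance() stays meaningful under NUMA emulation. For instance (illustrative node layout), if emulated nodes 0 and 1 were carved out of real PXM 0 and node 2 out of PXM 1:

	/* no SLIT present */
	node_distance(0, 1) == LOCAL_DISTANCE;   /* 10: same real PXM  */
	node_distance(0, 2) == REMOTE_DISTANCE;  /* 20: different PXMs */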
diff --git a/arch/x86_64/pci/k8-bus.c b/arch/x86_64/pci/k8-bus.c
index 3acf60ded2a..9cc813e2970 100644
--- a/arch/x86_64/pci/k8-bus.c
+++ b/arch/x86_64/pci/k8-bus.c
@@ -59,6 +59,8 @@ fill_mp_bus_to_cpumask(void)
j <= SUBORDINATE_LDT_BUS_NUMBER(ldtbus);
j++) {
struct pci_bus *bus;
+ struct pci_sysdata *sd;
+
long node = NODE_ID(nid);
/* Algorithm a bit dumb, but
it shouldn't matter here */
@@ -67,7 +69,9 @@ fill_mp_bus_to_cpumask(void)
continue;
if (!node_online(node))
node = 0;
- bus->sysdata = (void *)node;
+
+ sd = bus->sysdata;
+ sd->node = node;
}
}
}
diff --git a/arch/x86_64/vdso/Makefile b/arch/x86_64/vdso/Makefile
new file mode 100644
index 00000000000..8d03de029d9
--- /dev/null
+++ b/arch/x86_64/vdso/Makefile
@@ -0,0 +1,49 @@
+#
+# x86-64 vDSO.
+#
+
+# files to link into the vdso
+# vdso-start.o has to be first
+vobjs-y := vdso-start.o vdso-note.o vclock_gettime.o vgetcpu.o vvar.o
+
+# files to link into kernel
+obj-y := vma.o vdso.o vdso-syms.o
+
+vobjs := $(foreach F,$(vobjs-y),$(obj)/$F)
+
+$(obj)/vdso.o: $(obj)/vdso.so
+
+targets += vdso.so vdso.lds $(vobjs-y) vdso-syms.o
+
+# The DSO images are built using a special linker script.
+quiet_cmd_syscall = SYSCALL $@
+ cmd_syscall = $(CC) -m elf_x86_64 -nostdlib $(SYSCFLAGS_$(@F)) \
+ -Wl,-T,$(filter-out FORCE,$^) -o $@
+
+export CPPFLAGS_vdso.lds += -P -C -U$(ARCH)
+
+vdso-flags = -fPIC -shared -Wl,-soname=linux-vdso.so.1 \
+ $(call ld-option, -Wl$(comma)--hash-style=sysv) \
+ -Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096
+SYSCFLAGS_vdso.so = $(vdso-flags)
+
+$(obj)/vdso.o: $(src)/vdso.S $(obj)/vdso.so
+
+$(obj)/vdso.so: $(src)/vdso.lds $(vobjs) FORCE
+ $(call if_changed,syscall)
+
+CFL := $(PROFILING) -mcmodel=small -fPIC -g0 -O2 -fasynchronous-unwind-tables -m64
+
+$(obj)/vclock_gettime.o: CFLAGS = $(CFL)
+$(obj)/vgetcpu.o: CFLAGS = $(CFL)
+
+# We also create a special relocatable object that should mirror the symbol
+# table and layout of the linked DSO. With ld -R we can then refer to
+# these symbols in the kernel code rather than hand-coded addresses.
+extra-y += vdso-syms.o
+$(obj)/built-in.o: $(obj)/vdso-syms.o
+$(obj)/built-in.o: ld_flags += -R $(obj)/vdso-syms.o
+
+SYSCFLAGS_vdso-syms.o = -r -d
+$(obj)/vdso-syms.o: $(src)/vdso.lds $(vobjs) FORCE
+ $(call if_changed,syscall)
diff --git a/arch/x86_64/vdso/vclock_gettime.c b/arch/x86_64/vdso/vclock_gettime.c
new file mode 100644
index 00000000000..17f6a00de71
--- /dev/null
+++ b/arch/x86_64/vdso/vclock_gettime.c
@@ -0,0 +1,120 @@
+/*
+ * Copyright 2006 Andi Kleen, SUSE Labs.
+ * Subject to the GNU Public License, v.2
+ *
+ * Fast user context implementation of clock_gettime and gettimeofday.
+ *
+ * The code should have no internal unresolved relocations.
+ * Check with readelf after changing.
+ * Also alternative() doesn't work.
+ */
+
+#include <linux/kernel.h>
+#include <linux/posix-timers.h>
+#include <linux/time.h>
+#include <linux/string.h>
+#include <asm/vsyscall.h>
+#include <asm/vgtod.h>
+#include <asm/timex.h>
+#include <asm/hpet.h>
+#include <asm/unistd.h>
+#include <asm/io.h>
+#include <asm/vgtod.h>
+#include "vextern.h"
+
+#define gtod vdso_vsyscall_gtod_data
+
+static long vdso_fallback_gettime(long clock, struct timespec *ts)
+{
+ long ret;
+ asm("syscall" : "=a" (ret) :
+ "0" (__NR_clock_gettime),"D" (clock), "S" (ts) : "memory");
+ return ret;
+}
+
+static inline long vgetns(void)
+{
+ cycles_t (*vread)(void);
+ vread = gtod->clock.vread;
+ return ((vread() - gtod->clock.cycle_last) * gtod->clock.mult) >>
+ gtod->clock.shift;
+}
+
+static noinline int do_realtime(struct timespec *ts)
+{
+ unsigned long seq, ns;
+ do {
+ seq = read_seqbegin(&gtod->lock);
+ ts->tv_sec = gtod->wall_time_sec;
+ ts->tv_nsec = gtod->wall_time_nsec;
+ ns = vgetns();
+ } while (unlikely(read_seqretry(&gtod->lock, seq)));
+ timespec_add_ns(ts, ns);
+ return 0;
+}
+
+/* Copy of the version in kernel/time.c which we cannot directly access */
+static void vset_normalized_timespec(struct timespec *ts, long sec, long nsec)
+{
+ while (nsec >= NSEC_PER_SEC) {
+ nsec -= NSEC_PER_SEC;
+ ++sec;
+ }
+ while (nsec < 0) {
+ nsec += NSEC_PER_SEC;
+ --sec;
+ }
+ ts->tv_sec = sec;
+ ts->tv_nsec = nsec;
+}
+
+static noinline int do_monotonic(struct timespec *ts)
+{
+ unsigned long seq, ns, secs;
+ do {
+ seq = read_seqbegin(&gtod->lock);
+ secs = gtod->wall_time_sec;
+ ns = gtod->wall_time_nsec + vgetns();
+ secs += gtod->wall_to_monotonic.tv_sec;
+ ns += gtod->wall_to_monotonic.tv_nsec;
+ } while (unlikely(read_seqretry(&gtod->lock, seq)));
+ vset_normalized_timespec(ts, secs, ns);
+ return 0;
+}
+
+int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
+{
+ if (likely(gtod->sysctl_enabled && gtod->clock.vread))
+ switch (clock) {
+ case CLOCK_REALTIME:
+ return do_realtime(ts);
+ case CLOCK_MONOTONIC:
+ return do_monotonic(ts);
+ }
+ return vdso_fallback_gettime(clock, ts);
+}
+int clock_gettime(clockid_t, struct timespec *)
+ __attribute__((weak, alias("__vdso_clock_gettime")));
+
+int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz)
+{
+ long ret;
+ if (likely(gtod->sysctl_enabled && gtod->clock.vread)) {
+ BUILD_BUG_ON(offsetof(struct timeval, tv_usec) !=
+ offsetof(struct timespec, tv_nsec) ||
+ sizeof(*tv) != sizeof(struct timespec));
+ do_realtime((struct timespec *)tv);
+ tv->tv_usec /= 1000;
+ if (unlikely(tz != NULL)) {
+ /* This relies on gcc inlining the memcpy. We'll notice
+ if it ever fails to do so. */
+ memcpy(tz, &gtod->sys_tz, sizeof(struct timezone));
+ }
+ return 0;
+ }
+ asm("syscall" : "=a" (ret) :
+ "0" (__NR_gettimeofday), "D" (tv), "S" (tz) : "memory");
+ return ret;
+}
+int gettimeofday(struct timeval *, struct timezone *)
+ __attribute__((weak, alias("__vdso_gettimeofday")));
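Userspace reaches these functions through the weak clock_gettime/gettimeofday aliases the vDSO exports, or through a libc that resolves to __vdso_clock_gettime/__vdso_gettimeofday; no special setup is needed. A usage sketch, assuming such a libc (older libcs may need -lrt for clock_gettime):

	#include <stdio.h>
	#include <time.h>
	#include <sys/time.h>

	int main(void)
	{
		struct timespec ts;
		struct timeval tv;

		/* Both calls stay in userspace when the vsyscall clock is
		 * enabled; otherwise the vDSO falls back to the syscall. */
		clock_gettime(CLOCK_MONOTONIC, &ts);
		gettimeofday(&tv, NULL);
		printf("monotonic %ld.%09ld  wall %ld.%06ld\n",
		       (long)ts.tv_sec, (long)ts.tv_nsec,
		       (long)tv.tv_sec, (long)tv.tv_usec);
		return 0;
	}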
diff --git a/arch/x86_64/vdso/vdso-note.S b/arch/x86_64/vdso/vdso-note.S
new file mode 100644
index 00000000000..79a071e4357
--- /dev/null
+++ b/arch/x86_64/vdso/vdso-note.S
@@ -0,0 +1,12 @@
+/*
+ * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text.
+ * Here we can supply some information useful to userland.
+ */
+
+#include <linux/uts.h>
+#include <linux/version.h>
+#include <linux/elfnote.h>
+
+ELFNOTE_START(Linux, 0, "a")
+ .long LINUX_VERSION_CODE
+ELFNOTE_END
diff --git a/arch/x86_64/vdso/vdso-start.S b/arch/x86_64/vdso/vdso-start.S
new file mode 100644
index 00000000000..2dc2cdb84d6
--- /dev/null
+++ b/arch/x86_64/vdso/vdso-start.S
@@ -0,0 +1,2 @@
+ .globl vdso_kernel_start
+vdso_kernel_start:
diff --git a/arch/x86_64/vdso/vdso.S b/arch/x86_64/vdso/vdso.S
new file mode 100644
index 00000000000..92e80c1972a
--- /dev/null
+++ b/arch/x86_64/vdso/vdso.S
@@ -0,0 +1,2 @@
+ .section ".vdso","a"
+ .incbin "arch/x86_64/vdso/vdso.so"
diff --git a/arch/x86_64/vdso/vdso.lds.S b/arch/x86_64/vdso/vdso.lds.S
new file mode 100644
index 00000000000..b9a60e665d0
--- /dev/null
+++ b/arch/x86_64/vdso/vdso.lds.S
@@ -0,0 +1,77 @@
+/*
+ * Linker script for vsyscall DSO. The vsyscall page is an ELF shared
+ * object prelinked to its virtual address, and with only one read-only
+ * segment (that fits in one page). This script controls its layout.
+ */
+#include <asm/asm-offsets.h>
+#include "voffset.h"
+
+#define VDSO_PRELINK 0xffffffffff700000
+
+SECTIONS
+{
+ . = VDSO_PRELINK + SIZEOF_HEADERS;
+
+ .hash : { *(.hash) } :text
+ .gnu.hash : { *(.gnu.hash) }
+ .dynsym : { *(.dynsym) }
+ .dynstr : { *(.dynstr) }
+ .gnu.version : { *(.gnu.version) }
+ .gnu.version_d : { *(.gnu.version_d) }
+ .gnu.version_r : { *(.gnu.version_r) }
+
+ /* This linker script is used both with -r and with -shared.
+ For the layouts to match, we need to skip more than enough
+ space for the dynamic symbol table et al. If this amount
+ is insufficient, ld -shared will barf. Just increase it here. */
+ . = VDSO_PRELINK + VDSO_TEXT_OFFSET;
+
+ .text : { *(.text) } :text
+ .text.ptr : { *(.text.ptr) } :text
+ . = VDSO_PRELINK + 0x900;
+ .data : { *(.data) } :text
+ .bss : { *(.bss) } :text
+
+ .altinstructions : { *(.altinstructions) } :text
+ .altinstr_replacement : { *(.altinstr_replacement) } :text
+
+ .note : { *(.note.*) } :text :note
+ .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr
+ .eh_frame : { KEEP (*(.eh_frame)) } :text
+ .dynamic : { *(.dynamic) } :text :dynamic
+ .useless : {
+ *(.got.plt) *(.got)
+ *(.gnu.linkonce.d.*)
+ *(.dynbss)
+ *(.gnu.linkonce.b.*)
+ } :text
+}
+
+/*
+ * We must supply the ELF program headers explicitly to get just one
+ * PT_LOAD segment, and set the flags explicitly to make segments read-only.
+ */
+PHDRS
+{
+ text PT_LOAD FILEHDR PHDRS FLAGS(5); /* PF_R|PF_X */
+ dynamic PT_DYNAMIC FLAGS(4); /* PF_R */
+ note PT_NOTE FLAGS(4); /* PF_R */
+ eh_frame_hdr 0x6474e550; /* PT_GNU_EH_FRAME, but ld doesn't match the name */
+}
+
+/*
+ * This controls what symbols we export from the DSO.
+ */
+VERSION
+{
+ LINUX_2.6 {
+ global:
+ clock_gettime;
+ __vdso_clock_gettime;
+ gettimeofday;
+ __vdso_gettimeofday;
+ getcpu;
+ __vdso_getcpu;
+ local: *;
+ };
+}
diff --git a/arch/x86_64/vdso/vextern.h b/arch/x86_64/vdso/vextern.h
new file mode 100644
index 00000000000..1683ba2ae3e
--- /dev/null
+++ b/arch/x86_64/vdso/vextern.h
@@ -0,0 +1,16 @@
+#ifndef VEXTERN
+#include <asm/vsyscall.h>
+#define VEXTERN(x) \
+ extern typeof(x) *vdso_ ## x __attribute__((visibility("hidden")));
+#endif
+
+#define VMAGIC 0xfeedbabeabcdefabUL
+
+/* Any kernel variables used in the vDSO must be exported in the main
+ kernel's vmlinux.lds.S/vsyscall.h/proper __section and
+ put into vextern.h and be referenced as a pointer with vdso prefix.
+ The main kernel later fills in the values. */
+
+VEXTERN(jiffies)
+VEXTERN(vgetcpu_mode)
+VEXTERN(vsyscall_gtod_data)
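Roughly how the two VEXTERN() definitions cooperate for one variable, using jiffies as the example (expansion shown for illustration):

	/* In vDSO code that includes vextern.h (hidden pointer declaration): */
	extern typeof(jiffies) *vdso_jiffies __attribute__((visibility("hidden")));

	/* In vvar.c, inside the vDSO image, pre-filled with the marker value: */
	typeof(__jiffies) *vdso_jiffies = (void *)VMAGIC;

	/* At boot, init_vdso_vars() in vma.c finds vdso_jiffies via var_ref(),
	 * checks that it still holds VMAGIC, and overwrites it with &__jiffies. */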
diff --git a/arch/x86_64/vdso/vgetcpu.c b/arch/x86_64/vdso/vgetcpu.c
new file mode 100644
index 00000000000..91f6e85d0fc
--- /dev/null
+++ b/arch/x86_64/vdso/vgetcpu.c
@@ -0,0 +1,50 @@
+/*
+ * Copyright 2006 Andi Kleen, SUSE Labs.
+ * Subject to the GNU Public License, v.2
+ *
+ * Fast user context implementation of getcpu()
+ */
+
+#include <linux/kernel.h>
+#include <linux/getcpu.h>
+#include <linux/jiffies.h>
+#include <linux/time.h>
+#include <asm/vsyscall.h>
+#include <asm/vgtod.h>
+#include "vextern.h"
+
+long __vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
+{
+ unsigned int dummy, p;
+ unsigned long j = 0;
+
+ /* Fast cache - only recompute value once per jiffies and avoid
+ relatively costly rdtscp/cpuid otherwise.
+ This works because the scheduler usually keeps the process
+ on the same CPU and this syscall doesn't guarantee its
+ results anyway.
+ We do this here because otherwise user space would do it on
+ its own in a likely inferior way (no access to jiffies).
+ If you don't like it pass NULL. */
+ if (tcache && tcache->blob[0] == (j = *vdso_jiffies)) {
+ p = tcache->blob[1];
+ } else if (*vdso_vgetcpu_mode == VGETCPU_RDTSCP) {
+ /* Load per CPU data from RDTSCP */
+ rdtscp(dummy, dummy, p);
+ } else {
+ /* Load per CPU data from GDT */
+ asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
+ }
+ if (tcache) {
+ tcache->blob[0] = j;
+ tcache->blob[1] = p;
+ }
+ if (cpu)
+ *cpu = p & 0xfff;
+ if (node)
+ *node = p >> 12;
+ return 0;
+}
+
+long getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
+ __attribute__((weak, alias("__vdso_getcpu")));
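A userspace sketch of the getcpu() interface this implements. The raw syscall is shown for portability; a vDSO-aware libc would route sched_getcpu()/getcpu() through __vdso_getcpu(), where a getcpu_cache third argument lets repeated calls reuse the result until jiffies changes:

	#include <stdio.h>
	#include <unistd.h>
	#include <sys/syscall.h>

	int main(void)
	{
		unsigned cpu = 0, node = 0;

		/* NULL cache: always recompute; pass a struct getcpu_cache to
		 * let the vDSO fast path skip RDTSCP/LSL within one jiffy. */
		if (syscall(SYS_getcpu, &cpu, &node, NULL) == 0)
			printf("running on cpu %u, node %u\n", cpu, node);
		return 0;
	}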
diff --git a/arch/x86_64/vdso/vma.c b/arch/x86_64/vdso/vma.c
new file mode 100644
index 00000000000..d4cb83a6c06
--- /dev/null
+++ b/arch/x86_64/vdso/vma.c
@@ -0,0 +1,139 @@
+/*
+ * Set up the VMAs to tell the VM about the vDSO.
+ * Copyright 2007 Andi Kleen, SUSE Labs.
+ * Subject to the GPL, v.2
+ */
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/random.h>
+#include <asm/vsyscall.h>
+#include <asm/vgtod.h>
+#include <asm/proto.h>
+#include "voffset.h"
+
+int vdso_enabled = 1;
+
+#define VEXTERN(x) extern typeof(__ ## x) *vdso_ ## x;
+#include "vextern.h"
+#undef VEXTERN
+
+extern char vdso_kernel_start[], vdso_start[], vdso_end[];
+extern unsigned short vdso_sync_cpuid;
+
+struct page **vdso_pages;
+
+static inline void *var_ref(void *vbase, char *var, char *name)
+{
+ unsigned offset = var - &vdso_kernel_start[0] + VDSO_TEXT_OFFSET;
+ void *p = vbase + offset;
+ if (*(void **)p != (void *)VMAGIC) {
+ printk("VDSO: variable %s broken\n", name);
+ vdso_enabled = 0;
+ }
+ return p;
+}
+
+static int __init init_vdso_vars(void)
+{
+ int npages = (vdso_end - vdso_start + PAGE_SIZE - 1) / PAGE_SIZE;
+ int i;
+ char *vbase;
+
+ vdso_pages = kmalloc(sizeof(struct page *) * npages, GFP_KERNEL);
+ if (!vdso_pages)
+ goto oom;
+ for (i = 0; i < npages; i++) {
+ struct page *p;
+ p = alloc_page(GFP_KERNEL);
+ if (!p)
+ goto oom;
+ vdso_pages[i] = p;
+ copy_page(page_address(p), vdso_start + i*PAGE_SIZE);
+ }
+
+ vbase = vmap(vdso_pages, npages, 0, PAGE_KERNEL);
+ if (!vbase)
+ goto oom;
+
+ if (memcmp(vbase, "\177ELF", 4)) {
+ printk("VDSO: I'm broken; not ELF\n");
+ vdso_enabled = 0;
+ }
+
+#define V(x) *(typeof(x) *) var_ref(vbase, (char *)RELOC_HIDE(&x, 0), #x)
+#define VEXTERN(x) \
+ V(vdso_ ## x) = &__ ## x;
+#include "vextern.h"
+#undef VEXTERN
+ return 0;
+
+ oom:
+ printk("Cannot allocate vdso\n");
+ vdso_enabled = 0;
+ return -ENOMEM;
+}
+__initcall(init_vdso_vars);
+
+struct linux_binprm;
+
+/* Put the vdso above the (randomized) stack with another randomized offset.
+ This way there is no hole in the middle of address space.
+ To save memory make sure it is still in the same PTE as the stack top.
+ This doesn't give that many random bits */
+static unsigned long vdso_addr(unsigned long start, unsigned len)
+{
+ unsigned long addr, end;
+ unsigned offset;
+ end = (start + PMD_SIZE - 1) & PMD_MASK;
+ if (end >= TASK_SIZE64)
+ end = TASK_SIZE64;
+ end -= len;
+ /* This loses some more bits than a modulo, but is cheaper */
+ offset = get_random_int() & (PTRS_PER_PTE - 1);
+ addr = start + (offset << PAGE_SHIFT);
+ if (addr >= end)
+ addr = end;
+ return addr;
+}
+
+/* Setup a VMA at program startup for the vsyscall page.
+ Not called for compat tasks */
+int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
+{
+ struct mm_struct *mm = current->mm;
+ unsigned long addr;
+ int ret;
+ unsigned len = round_up(vdso_end - vdso_start, PAGE_SIZE);
+
+ if (!vdso_enabled)
+ return 0;
+
+ down_write(&mm->mmap_sem);
+ addr = vdso_addr(mm->start_stack, len);
+ addr = get_unmapped_area(NULL, addr, len, 0, 0);
+ if (IS_ERR_VALUE(addr)) {
+ ret = addr;
+ goto up_fail;
+ }
+
+ ret = install_special_mapping(mm, addr, len,
+ VM_READ|VM_EXEC|
+ VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
+ VM_ALWAYSDUMP,
+ vdso_pages);
+ if (ret)
+ goto up_fail;
+
+ current->mm->context.vdso = (void *)addr;
+up_fail:
+ up_write(&mm->mmap_sem);
+ return ret;
+}
+
+static __init int vdso_setup(char *s)
+{
+ vdso_enabled = simple_strtoul(s, NULL, 0);
+ return 0;
+}
+__setup("vdso=", vdso_setup);
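A worked example of the vdso_addr() placement policy (addresses are illustrative only): with a stack top of 0x00007fff12345000 and a one-page vDSO,

	end    = (0x7fff12345000 + PMD_SIZE - 1) & PMD_MASK;	/* 0x7fff12400000 */
	end   -= len;						/* keep the vDSO below that */
	offset = get_random_int() & (PTRS_PER_PTE - 1);		/* 0..511 pages of jitter */
	addr   = 0x7fff12345000 + (offset << PAGE_SHIFT);	/* clamped to end */

so the mapping lands within the stack's own 2MB PMD whenever it fits, at a page-granular random offset.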
diff --git a/arch/x86_64/vdso/voffset.h b/arch/x86_64/vdso/voffset.h
new file mode 100644
index 00000000000..5304204911f
--- /dev/null
+++ b/arch/x86_64/vdso/voffset.h
@@ -0,0 +1 @@
+#define VDSO_TEXT_OFFSET 0x500
diff --git a/arch/x86_64/vdso/vvar.c b/arch/x86_64/vdso/vvar.c
new file mode 100644
index 00000000000..6fc22219a47
--- /dev/null
+++ b/arch/x86_64/vdso/vvar.c
@@ -0,0 +1,12 @@
+/* Define pointer to external vDSO variables.
+ These are part of the vDSO. The kernel fills in the real addresses
+ at boot time. This is done because when the vdso is linked the
+ kernel isn't yet and we don't know the final addresses. */
+#include <linux/kernel.h>
+#include <linux/time.h>
+#include <asm/vsyscall.h>
+#include <asm/timex.h>
+#include <asm/vgtod.h>
+
+#define VEXTERN(x) typeof (__ ## x) *vdso_ ## x = (void *)VMAGIC;
+#include "vextern.h"