diff options
Diffstat (limited to 'arch/x86_64/kernel')
57 files changed, 0 insertions, 22337 deletions
diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile deleted file mode 100644 index 48f9e2c19cd..00000000000 --- a/arch/x86_64/kernel/Makefile +++ /dev/null @@ -1,47 +0,0 @@ -# -# Makefile for the linux kernel. -# - -extra-y := head.o head64.o init_task.o vmlinux.lds -EXTRA_AFLAGS := -traditional -obj-y := process.o semaphore.o signal.o entry.o traps.o irq.o \ - ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_x86_64.o \ - x8664_ksyms.o i387.o syscall.o vsyscall.o \ - setup64.o bootflag.o e820.o reboot.o quirks.o - -obj-$(CONFIG_X86_MCE) += mce.o -obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o -obj-$(CONFIG_MTRR) += ../../i386/kernel/cpu/mtrr/ -obj-$(CONFIG_ACPI_BOOT) += acpi/ -obj-$(CONFIG_X86_MSR) += msr.o -obj-$(CONFIG_MICROCODE) += microcode.o -obj-$(CONFIG_X86_CPUID) += cpuid.o -obj-$(CONFIG_SMP) += smp.o smpboot.o trampoline.o -obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o -obj-$(CONFIG_X86_IO_APIC) += io_apic.o mpparse.o \ - genapic.o genapic_cluster.o genapic_flat.o -obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o crash.o -obj-$(CONFIG_PM) += suspend.o -obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend_asm.o -obj-$(CONFIG_CPU_FREQ) += cpufreq/ -obj-$(CONFIG_EARLY_PRINTK) += early_printk.o -obj-$(CONFIG_GART_IOMMU) += pci-gart.o aperture.o -obj-$(CONFIG_DUMMY_IOMMU) += pci-nommu.o pci-dma.o -obj-$(CONFIG_SWIOTLB) += swiotlb.o -obj-$(CONFIG_KPROBES) += kprobes.o -obj-$(CONFIG_X86_PM_TIMER) += pmtimer.o - -obj-$(CONFIG_MODULES) += module.o - -obj-y += topology.o -obj-y += intel_cacheinfo.o - -CFLAGS_vsyscall.o := $(PROFILING) -g0 - -bootflag-y += ../../i386/kernel/bootflag.o -cpuid-$(subst m,y,$(CONFIG_X86_CPUID)) += ../../i386/kernel/cpuid.o -topology-y += ../../i386/mach-default/topology.o -swiotlb-$(CONFIG_SWIOTLB) += ../../ia64/lib/swiotlb.o -microcode-$(subst m,y,$(CONFIG_MICROCODE)) += ../../i386/kernel/microcode.o -intel_cacheinfo-y += ../../i386/kernel/cpu/intel_cacheinfo.o -quirks-y += ../../i386/kernel/quirks.o diff --git a/arch/x86_64/kernel/acpi/Makefile b/arch/x86_64/kernel/acpi/Makefile deleted file mode 100644 index d2c2ee5f9a8..00000000000 --- a/arch/x86_64/kernel/acpi/Makefile +++ /dev/null @@ -1,3 +0,0 @@ -obj-$(CONFIG_ACPI_BOOT) := boot.o -boot-$(CONFIG_ACPI_BOOT) := ../../../i386/kernel/acpi/boot.o -obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup.o diff --git a/arch/x86_64/kernel/acpi/sleep.c b/arch/x86_64/kernel/acpi/sleep.c deleted file mode 100644 index 7a275de6df2..00000000000 --- a/arch/x86_64/kernel/acpi/sleep.c +++ /dev/null @@ -1,132 +0,0 @@ -/* - * acpi.c - Architecture-Specific Low-Level ACPI Support - * - * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com> - * Copyright (C) 2001 Jun Nakajima <jun.nakajima@intel.com> - * Copyright (C) 2001 Patrick Mochel <mochel@osdl.org> - * Copyright (C) 2002 Andi Kleen, SuSE Labs (x86-64 port) - * Copyright (C) 2003 Pavel Machek, SuSE Labs - * - * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - */ - -#include <linux/config.h> -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/types.h> -#include <linux/stddef.h> -#include <linux/slab.h> -#include <linux/pci.h> -#include <linux/bootmem.h> -#include <linux/irq.h> -#include <linux/acpi.h> -#include <asm/mpspec.h> -#include <asm/io.h> -#include <asm/apic.h> -#include <asm/apicdef.h> -#include <asm/page.h> -#include <asm/pgtable.h> -#include <asm/pgalloc.h> -#include <asm/io_apic.h> -#include <asm/proto.h> -#include <asm/tlbflush.h> - - -/* -------------------------------------------------------------------------- - Low-Level Sleep Support - -------------------------------------------------------------------------- */ - -#ifdef CONFIG_ACPI_SLEEP - -/* address in low memory of the wakeup routine. */ -unsigned long acpi_wakeup_address = 0; -unsigned long acpi_video_flags; -extern char wakeup_start, wakeup_end; - -extern unsigned long FASTCALL(acpi_copy_wakeup_routine(unsigned long)); - -static pgd_t low_ptr; - -static void init_low_mapping(void) -{ - pgd_t *slot0 = pgd_offset(current->mm, 0UL); - low_ptr = *slot0; - set_pgd(slot0, *pgd_offset(current->mm, PAGE_OFFSET)); - flush_tlb_all(); -} - -/** - * acpi_save_state_mem - save kernel state - * - * Create an identity mapped page table and copy the wakeup routine to - * low memory. - */ -int acpi_save_state_mem (void) -{ - init_low_mapping(); - - memcpy((void *) acpi_wakeup_address, &wakeup_start, &wakeup_end - &wakeup_start); - acpi_copy_wakeup_routine(acpi_wakeup_address); - - return 0; -} - -/* - * acpi_restore_state - */ -void acpi_restore_state_mem (void) -{ - set_pgd(pgd_offset(current->mm, 0UL), low_ptr); - flush_tlb_all(); -} - -/** - * acpi_reserve_bootmem - do _very_ early ACPI initialisation - * - * We allocate a page in low memory for the wakeup - * routine for when we come back from a sleep state. The - * runtime allocator allows specification of <16M pages, but not - * <1M pages. - */ -void __init acpi_reserve_bootmem(void) -{ - acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE); - if ((&wakeup_end - &wakeup_start) > PAGE_SIZE) - printk(KERN_CRIT "ACPI: Wakeup code way too big, will crash on attempt to suspend\n"); -} - -static int __init acpi_sleep_setup(char *str) -{ - while ((str != NULL) && (*str != '\0')) { - if (strncmp(str, "s3_bios", 7) == 0) - acpi_video_flags = 1; - if (strncmp(str, "s3_mode", 7) == 0) - acpi_video_flags |= 2; - str = strchr(str, ','); - if (str != NULL) - str += strspn(str, ", \t"); - } - return 1; -} - -__setup("acpi_sleep=", acpi_sleep_setup); - -#endif /*CONFIG_ACPI_SLEEP*/ - -void acpi_pci_link_exit(void) {} diff --git a/arch/x86_64/kernel/acpi/wakeup.S b/arch/x86_64/kernel/acpi/wakeup.S deleted file mode 100644 index 185faa911db..00000000000 --- a/arch/x86_64/kernel/acpi/wakeup.S +++ /dev/null @@ -1,527 +0,0 @@ -.text -#include <linux/linkage.h> -#include <asm/segment.h> -#include <asm/page.h> -#include <asm/msr.h> - -# Copyright 2003 Pavel Machek <pavel@suse.cz>, distribute under GPLv2 -# -# wakeup_code runs in real mode, and at unknown address (determined at run-time). -# Therefore it must only use relative jumps/calls. -# -# Do we need to deal with A20? It is okay: ACPI specs says A20 must be enabled -# -# If physical address of wakeup_code is 0x12345, BIOS should call us with -# cs = 0x1234, eip = 0x05 -# - - -ALIGN - .align 16 -ENTRY(wakeup_start) -wakeup_code: - wakeup_code_start = . - .code16 - -# Running in *copy* of this code, somewhere in low 1MB. - - movb $0xa1, %al ; outb %al, $0x80 - cli - cld - # setup data segment - movw %cs, %ax - movw %ax, %ds # Make ds:0 point to wakeup_start - movw %ax, %ss - mov $(wakeup_stack - wakeup_code), %sp # Private stack is needed for ASUS board - - pushl $0 # Kill any dangerous flags - popfl - - movl real_magic - wakeup_code, %eax - cmpl $0x12345678, %eax - jne bogus_real_magic - - testl $1, video_flags - wakeup_code - jz 1f - lcall $0xc000,$3 - movw %cs, %ax - movw %ax, %ds # Bios might have played with that - movw %ax, %ss -1: - - testl $2, video_flags - wakeup_code - jz 1f - mov video_mode - wakeup_code, %ax - call mode_seta -1: - - movw $0xb800, %ax - movw %ax,%fs - movw $0x0e00 + 'L', %fs:(0x10) - - movb $0xa2, %al ; outb %al, $0x80 - - lidt %ds:idt_48a - wakeup_code - xorl %eax, %eax - movw %ds, %ax # (Convert %ds:gdt to a linear ptr) - shll $4, %eax - addl $(gdta - wakeup_code), %eax - movl %eax, gdt_48a +2 - wakeup_code - lgdtl %ds:gdt_48a - wakeup_code # load gdt with whatever is - # appropriate - - movl $1, %eax # protected mode (PE) bit - lmsw %ax # This is it! - jmp 1f -1: - - .byte 0x66, 0xea # prefix + jmpi-opcode - .long wakeup_32 - __START_KERNEL_map - .word __KERNEL_CS - - .code32 -wakeup_32: -# Running in this code, but at low address; paging is not yet turned on. - movb $0xa5, %al ; outb %al, $0x80 - - /* Check if extended functions are implemented */ - movl $0x80000000, %eax - cpuid - cmpl $0x80000000, %eax - jbe bogus_cpu - wbinvd - mov $0x80000001, %eax - cpuid - btl $29, %edx - jnc bogus_cpu - movl %edx,%edi - - movw $__KERNEL_DS, %ax - movw %ax, %ds - movw %ax, %es - movw %ax, %fs - movw %ax, %gs - - movw $__KERNEL_DS, %ax - movw %ax, %ss - - mov $(wakeup_stack - __START_KERNEL_map), %esp - movl saved_magic - __START_KERNEL_map, %eax - cmpl $0x9abcdef0, %eax - jne bogus_32_magic - - /* - * Prepare for entering 64bits mode - */ - - /* Enable PAE mode and PGE */ - xorl %eax, %eax - btsl $5, %eax - btsl $7, %eax - movl %eax, %cr4 - - /* Setup early boot stage 4 level pagetables */ - movl $(wakeup_level4_pgt - __START_KERNEL_map), %eax - movl %eax, %cr3 - - /* Setup EFER (Extended Feature Enable Register) */ - movl $MSR_EFER, %ecx - rdmsr - /* Fool rdmsr and reset %eax to avoid dependences */ - xorl %eax, %eax - /* Enable Long Mode */ - btsl $_EFER_LME, %eax - /* Enable System Call */ - btsl $_EFER_SCE, %eax - - /* No Execute supported? */ - btl $20,%edi - jnc 1f - btsl $_EFER_NX, %eax -1: - - /* Make changes effective */ - wrmsr - wbinvd - - xorl %eax, %eax - btsl $31, %eax /* Enable paging and in turn activate Long Mode */ - btsl $0, %eax /* Enable protected mode */ - btsl $1, %eax /* Enable MP */ - btsl $4, %eax /* Enable ET */ - btsl $5, %eax /* Enable NE */ - btsl $16, %eax /* Enable WP */ - btsl $18, %eax /* Enable AM */ - - /* Make changes effective */ - movl %eax, %cr0 - /* At this point: - CR4.PAE must be 1 - CS.L must be 0 - CR3 must point to PML4 - Next instruction must be a branch - This must be on identity-mapped page - */ - jmp reach_compatibility_mode -reach_compatibility_mode: - movw $0x0e00 + 'i', %ds:(0xb8012) - movb $0xa8, %al ; outb %al, $0x80; - - /* - * At this point we're in long mode but in 32bit compatibility mode - * with EFER.LME = 1, CS.L = 0, CS.D = 1 (and in turn - * EFER.LMA = 1). Now we want to jump in 64bit mode, to do that we load - * the new gdt/idt that has __KERNEL_CS with CS.L = 1. - */ - - movw $0x0e00 + 'n', %ds:(0xb8014) - movb $0xa9, %al ; outb %al, $0x80 - - /* Load new GDT with the 64bit segment using 32bit descriptor */ - movl $(pGDT32 - __START_KERNEL_map), %eax - lgdt (%eax) - - movl $(wakeup_jumpvector - __START_KERNEL_map), %eax - /* Finally jump in 64bit mode */ - ljmp *(%eax) - -wakeup_jumpvector: - .long wakeup_long64 - __START_KERNEL_map - .word __KERNEL_CS - -.code64 - - /* Hooray, we are in Long 64-bit mode (but still running in low memory) */ -wakeup_long64: - /* - * We must switch to a new descriptor in kernel space for the GDT - * because soon the kernel won't have access anymore to the userspace - * addresses where we're currently running on. We have to do that here - * because in 32bit we couldn't load a 64bit linear address. - */ - lgdt cpu_gdt_descr - __START_KERNEL_map - - movw $0x0e00 + 'u', %ds:(0xb8016) - - nop - nop - movw $__KERNEL_DS, %ax - movw %ax, %ss - movw %ax, %ds - movw %ax, %es - movw %ax, %fs - movw %ax, %gs - movq saved_esp, %rsp - - movw $0x0e00 + 'x', %ds:(0xb8018) - movq saved_ebx, %rbx - movq saved_edi, %rdi - movq saved_esi, %rsi - movq saved_ebp, %rbp - - movw $0x0e00 + '!', %ds:(0xb801a) - movq saved_eip, %rax - jmp *%rax - -.code32 - - .align 64 -gdta: - .word 0, 0, 0, 0 # dummy - - .word 0, 0, 0, 0 # unused - - .word 0xFFFF # 4Gb - (0x100000*0x1000 = 4Gb) - .word 0 # base address = 0 - .word 0x9B00 # code read/exec. ??? Why I need 0x9B00 (as opposed to 0x9A00 in order for this to work?) - .word 0x00CF # granularity = 4096, 386 - # (+5th nibble of limit) - - .word 0xFFFF # 4Gb - (0x100000*0x1000 = 4Gb) - .word 0 # base address = 0 - .word 0x9200 # data read/write - .word 0x00CF # granularity = 4096, 386 - # (+5th nibble of limit) -# this is 64bit descriptor for code - .word 0xFFFF - .word 0 - .word 0x9A00 # code read/exec - .word 0x00AF # as above, but it is long mode and with D=0 - -idt_48a: - .word 0 # idt limit = 0 - .word 0, 0 # idt base = 0L - -gdt_48a: - .word 0x8000 # gdt limit=2048, - # 256 GDT entries - .word 0, 0 # gdt base (filled in later) - - -real_save_gdt: .word 0 - .quad 0 -real_magic: .quad 0 -video_mode: .quad 0 -video_flags: .quad 0 - -bogus_real_magic: - movb $0xba,%al ; outb %al,$0x80 - jmp bogus_real_magic - -bogus_32_magic: - movb $0xb3,%al ; outb %al,$0x80 - jmp bogus_32_magic - -bogus_31_magic: - movb $0xb1,%al ; outb %al,$0x80 - jmp bogus_31_magic - -bogus_cpu: - movb $0xbc,%al ; outb %al,$0x80 - jmp bogus_cpu - - -/* This code uses an extended set of video mode numbers. These include: - * Aliases for standard modes - * NORMAL_VGA (-1) - * EXTENDED_VGA (-2) - * ASK_VGA (-3) - * Video modes numbered by menu position -- NOT RECOMMENDED because of lack - * of compatibility when extending the table. These are between 0x00 and 0xff. - */ -#define VIDEO_FIRST_MENU 0x0000 - -/* Standard BIOS video modes (BIOS number + 0x0100) */ -#define VIDEO_FIRST_BIOS 0x0100 - -/* VESA BIOS video modes (VESA number + 0x0200) */ -#define VIDEO_FIRST_VESA 0x0200 - -/* Video7 special modes (BIOS number + 0x0900) */ -#define VIDEO_FIRST_V7 0x0900 - -# Setting of user mode (AX=mode ID) => CF=success -mode_seta: - movw %ax, %bx -#if 0 - cmpb $0xff, %ah - jz setalias - - testb $VIDEO_RECALC>>8, %ah - jnz _setrec - - cmpb $VIDEO_FIRST_RESOLUTION>>8, %ah - jnc setres - - cmpb $VIDEO_FIRST_SPECIAL>>8, %ah - jz setspc - - cmpb $VIDEO_FIRST_V7>>8, %ah - jz setv7 -#endif - - cmpb $VIDEO_FIRST_VESA>>8, %ah - jnc check_vesaa -#if 0 - orb %ah, %ah - jz setmenu -#endif - - decb %ah -# jz setbios Add bios modes later - -setbada: clc - ret - -check_vesaa: - subb $VIDEO_FIRST_VESA>>8, %bh - orw $0x4000, %bx # Use linear frame buffer - movw $0x4f02, %ax # VESA BIOS mode set call - int $0x10 - cmpw $0x004f, %ax # AL=4f if implemented - jnz _setbada # AH=0 if OK - - stc - ret - -_setbada: jmp setbada - - .code64 -bogus_magic: - movw $0x0e00 + 'B', %ds:(0xb8018) - jmp bogus_magic - -bogus_magic2: - movw $0x0e00 + '2', %ds:(0xb8018) - jmp bogus_magic2 - - -wakeup_stack_begin: # Stack grows down - -.org 0xff0 -wakeup_stack: # Just below end of page - -ENTRY(wakeup_end) - -## -# acpi_copy_wakeup_routine -# -# Copy the above routine to low memory. -# -# Parameters: -# %rdi: place to copy wakeup routine to -# -# Returned address is location of code in low memory (past data and stack) -# -ENTRY(acpi_copy_wakeup_routine) - pushq %rax - pushq %rcx - pushq %rdx - - sgdt saved_gdt - sidt saved_idt - sldt saved_ldt - str saved_tss - - movq %cr3, %rdx - movq %rdx, saved_cr3 - movq %cr4, %rdx - movq %rdx, saved_cr4 - movq %cr0, %rdx - movq %rdx, saved_cr0 - sgdt real_save_gdt - wakeup_start (,%rdi) - movl $MSR_EFER, %ecx - rdmsr - movl %eax, saved_efer - movl %edx, saved_efer2 - - movl saved_video_mode, %edx - movl %edx, video_mode - wakeup_start (,%rdi) - movl acpi_video_flags, %edx - movl %edx, video_flags - wakeup_start (,%rdi) - movq $0x12345678, real_magic - wakeup_start (,%rdi) - movq $0x123456789abcdef0, %rdx - movq %rdx, saved_magic - - movl saved_magic - __START_KERNEL_map, %eax - cmpl $0x9abcdef0, %eax - jne bogus_32_magic - - # make sure %cr4 is set correctly (features, etc) - movl saved_cr4 - __START_KERNEL_map, %eax - movq %rax, %cr4 - - movl saved_cr0 - __START_KERNEL_map, %eax - movq %rax, %cr0 - jmp 1f # Flush pipelines -1: - # restore the regs we used - popq %rdx - popq %rcx - popq %rax -ENTRY(do_suspend_lowlevel_s4bios) - ret - - .align 2 - .p2align 4,,15 -.globl do_suspend_lowlevel - .type do_suspend_lowlevel,@function -do_suspend_lowlevel: -.LFB5: - subq $8, %rsp - xorl %eax, %eax - call save_processor_state - - movq %rsp, saved_context_esp(%rip) - movq %rax, saved_context_eax(%rip) - movq %rbx, saved_context_ebx(%rip) - movq %rcx, saved_context_ecx(%rip) - movq %rdx, saved_context_edx(%rip) - movq %rbp, saved_context_ebp(%rip) - movq %rsi, saved_context_esi(%rip) - movq %rdi, saved_context_edi(%rip) - movq %r8, saved_context_r08(%rip) - movq %r9, saved_context_r09(%rip) - movq %r10, saved_context_r10(%rip) - movq %r11, saved_context_r11(%rip) - movq %r12, saved_context_r12(%rip) - movq %r13, saved_context_r13(%rip) - movq %r14, saved_context_r14(%rip) - movq %r15, saved_context_r15(%rip) - pushfq ; popq saved_context_eflags(%rip) - - movq $.L97, saved_eip(%rip) - - movq %rsp,saved_esp - movq %rbp,saved_ebp - movq %rbx,saved_ebx - movq %rdi,saved_edi - movq %rsi,saved_esi - - addq $8, %rsp - movl $3, %edi - xorl %eax, %eax - jmp acpi_enter_sleep_state -.L97: - .p2align 4,,7 -.L99: - .align 4 - movl $24, %eax - movw %ax, %ds - movq saved_context+58(%rip), %rax - movq %rax, %cr4 - movq saved_context+50(%rip), %rax - movq %rax, %cr3 - movq saved_context+42(%rip), %rax - movq %rax, %cr2 - movq saved_context+34(%rip), %rax - movq %rax, %cr0 - pushq saved_context_eflags(%rip) ; popfq - movq saved_context_esp(%rip), %rsp - movq saved_context_ebp(%rip), %rbp - movq saved_context_eax(%rip), %rax - movq saved_context_ebx(%rip), %rbx - movq saved_context_ecx(%rip), %rcx - movq saved_context_edx(%rip), %rdx - movq saved_context_esi(%rip), %rsi - movq saved_context_edi(%rip), %rdi - movq saved_context_r08(%rip), %r8 - movq saved_context_r09(%rip), %r9 - movq saved_context_r10(%rip), %r10 - movq saved_context_r11(%rip), %r11 - movq saved_context_r12(%rip), %r12 - movq saved_context_r13(%rip), %r13 - movq saved_context_r14(%rip), %r14 - movq saved_context_r15(%rip), %r15 - - xorl %eax, %eax - addq $8, %rsp - jmp restore_processor_state -.LFE5: -.Lfe5: - .size do_suspend_lowlevel,.Lfe5-do_suspend_lowlevel - -.data -ALIGN -ENTRY(saved_ebp) .quad 0 -ENTRY(saved_esi) .quad 0 -ENTRY(saved_edi) .quad 0 -ENTRY(saved_ebx) .quad 0 - -ENTRY(saved_eip) .quad 0 -ENTRY(saved_esp) .quad 0 - -ENTRY(saved_magic) .quad 0 - -ALIGN -# saved registers -saved_gdt: .quad 0,0 -saved_idt: .quad 0,0 -saved_ldt: .quad 0 -saved_tss: .quad 0 - -saved_cr0: .quad 0 -saved_cr3: .quad 0 -saved_cr4: .quad 0 -saved_efer: .quad 0 -saved_efer2: .quad 0 diff --git a/arch/x86_64/kernel/aperture.c b/arch/x86_64/kernel/aperture.c deleted file mode 100644 index c9a6b812e92..00000000000 --- a/arch/x86_64/kernel/aperture.c +++ /dev/null @@ -1,282 +0,0 @@ -/* - * Firmware replacement code. - * - * Work around broken BIOSes that don't set an aperture or only set the - * aperture in the AGP bridge. - * If all fails map the aperture over some low memory. This is cheaper than - * doing bounce buffering. The memory is lost. This is done at early boot - * because only the bootmem allocator can allocate 32+MB. - * - * Copyright 2002 Andi Kleen, SuSE Labs. - * $Id: aperture.c,v 1.7 2003/08/01 03:36:18 ak Exp $ - */ -#include <linux/config.h> -#include <linux/kernel.h> -#include <linux/types.h> -#include <linux/init.h> -#include <linux/bootmem.h> -#include <linux/mmzone.h> -#include <linux/pci_ids.h> -#include <linux/pci.h> -#include <linux/bitops.h> -#include <asm/e820.h> -#include <asm/io.h> -#include <asm/proto.h> -#include <asm/pci-direct.h> - -int iommu_aperture; -int iommu_aperture_disabled __initdata = 0; -int iommu_aperture_allowed __initdata = 0; - -int fallback_aper_order __initdata = 1; /* 64MB */ -int fallback_aper_force __initdata = 0; - -int fix_aperture __initdata = 1; - -/* This code runs before the PCI subsystem is initialized, so just - access the northbridge directly. */ - -#define NB_ID_3 (PCI_VENDOR_ID_AMD | (0x1103<<16)) - -static u32 __init allocate_aperture(void) -{ - pg_data_t *nd0 = NODE_DATA(0); - u32 aper_size; - void *p; - - if (fallback_aper_order > 7) - fallback_aper_order = 7; - aper_size = (32 * 1024 * 1024) << fallback_aper_order; - - /* - * Aperture has to be naturally aligned. This means an 2GB aperture won't - * have much chances to find a place in the lower 4GB of memory. - * Unfortunately we cannot move it up because that would make the - * IOMMU useless. - */ - p = __alloc_bootmem_node(nd0, aper_size, aper_size, 0); - if (!p || __pa(p)+aper_size > 0xffffffff) { - printk("Cannot allocate aperture memory hole (%p,%uK)\n", - p, aper_size>>10); - if (p) - free_bootmem_node(nd0, (unsigned long)p, aper_size); - return 0; - } - printk("Mapping aperture over %d KB of RAM @ %lx\n", - aper_size >> 10, __pa(p)); - return (u32)__pa(p); -} - -static int __init aperture_valid(char *name, u64 aper_base, u32 aper_size) -{ - if (!aper_base) - return 0; - if (aper_size < 64*1024*1024) { - printk("Aperture from %s too small (%d MB)\n", name, aper_size>>20); - return 0; - } - if (aper_base + aper_size >= 0xffffffff) { - printk("Aperture from %s beyond 4GB. Ignoring.\n",name); - return 0; - } - if (e820_mapped(aper_base, aper_base + aper_size, E820_RAM)) { - printk("Aperture from %s pointing to e820 RAM. Ignoring.\n",name); - return 0; - } - return 1; -} - -/* Find a PCI capability */ -static __u32 __init find_cap(int num, int slot, int func, int cap) -{ - u8 pos; - int bytes; - if (!(read_pci_config_16(num,slot,func,PCI_STATUS) & PCI_STATUS_CAP_LIST)) - return 0; - pos = read_pci_config_byte(num,slot,func,PCI_CAPABILITY_LIST); - for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) { - u8 id; - pos &= ~3; - id = read_pci_config_byte(num,slot,func,pos+PCI_CAP_LIST_ID); - if (id == 0xff) - break; - if (id == cap) - return pos; - pos = read_pci_config_byte(num,slot,func,pos+PCI_CAP_LIST_NEXT); - } - return 0; -} - -/* Read a standard AGPv3 bridge header */ -static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order) -{ - u32 apsize; - u32 apsizereg; - int nbits; - u32 aper_low, aper_hi; - u64 aper; - - printk("AGP bridge at %02x:%02x:%02x\n", num, slot, func); - apsizereg = read_pci_config_16(num,slot,func, cap + 0x14); - if (apsizereg == 0xffffffff) { - printk("APSIZE in AGP bridge unreadable\n"); - return 0; - } - - apsize = apsizereg & 0xfff; - /* Some BIOS use weird encodings not in the AGPv3 table. */ - if (apsize & 0xff) - apsize |= 0xf00; - nbits = hweight16(apsize); - *order = 7 - nbits; - if ((int)*order < 0) /* < 32MB */ - *order = 0; - - aper_low = read_pci_config(num,slot,func, 0x10); - aper_hi = read_pci_config(num,slot,func,0x14); - aper = (aper_low & ~((1<<22)-1)) | ((u64)aper_hi << 32); - - printk("Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n", - aper, 32 << *order, apsizereg); - - if (!aperture_valid("AGP bridge", aper, (32*1024*1024) << *order)) - return 0; - return (u32)aper; -} - -/* Look for an AGP bridge. Windows only expects the aperture in the - AGP bridge and some BIOS forget to initialize the Northbridge too. - Work around this here. - - Do an PCI bus scan by hand because we're running before the PCI - subsystem. - - All K8 AGP bridges are AGPv3 compliant, so we can do this scan - generically. It's probably overkill to always scan all slots because - the AGP bridges should be always an own bus on the HT hierarchy, - but do it here for future safety. */ -static __u32 __init search_agp_bridge(u32 *order, int *valid_agp) -{ - int num, slot, func; - - /* Poor man's PCI discovery */ - for (num = 0; num < 32; num++) { - for (slot = 0; slot < 32; slot++) { - for (func = 0; func < 8; func++) { - u32 class, cap; - u8 type; - class = read_pci_config(num,slot,func, - PCI_CLASS_REVISION); - if (class == 0xffffffff) - break; - - switch (class >> 16) { - case PCI_CLASS_BRIDGE_HOST: - case PCI_CLASS_BRIDGE_OTHER: /* needed? */ - /* AGP bridge? */ - cap = find_cap(num,slot,func,PCI_CAP_ID_AGP); - if (!cap) - break; - *valid_agp = 1; - return read_agp(num,slot,func,cap,order); - } - - /* No multi-function device? */ - type = read_pci_config_byte(num,slot,func, - PCI_HEADER_TYPE); - if (!(type & 0x80)) - break; - } - } - } - printk("No AGP bridge found\n"); - return 0; -} - -void __init iommu_hole_init(void) -{ - int fix, num; - u32 aper_size, aper_alloc = 0, aper_order, last_aper_order = 0; - u64 aper_base, last_aper_base = 0; - int valid_agp = 0; - - if (iommu_aperture_disabled || !fix_aperture) - return; - - printk("Checking aperture...\n"); - - fix = 0; - for (num = 24; num < 32; num++) { - char name[30]; - if (read_pci_config(0, num, 3, 0x00) != NB_ID_3) - continue; - - iommu_aperture = 1; - - aper_order = (read_pci_config(0, num, 3, 0x90) >> 1) & 7; - aper_size = (32 * 1024 * 1024) << aper_order; - aper_base = read_pci_config(0, num, 3, 0x94) & 0x7fff; - aper_base <<= 25; - - printk("CPU %d: aperture @ %Lx size %u MB\n", num-24, - aper_base, aper_size>>20); - - sprintf(name, "northbridge cpu %d", num-24); - - if (!aperture_valid(name, aper_base, aper_size)) { - fix = 1; - break; - } - - if ((last_aper_order && aper_order != last_aper_order) || - (last_aper_base && aper_base != last_aper_base)) { - fix = 1; - break; - } - last_aper_order = aper_order; - last_aper_base = aper_base; - } - - if (!fix && !fallback_aper_force) - return; - - if (!fallback_aper_force) - aper_alloc = search_agp_bridge(&aper_order, &valid_agp); - - if (aper_alloc) { - /* Got the aperture from the AGP bridge */ - } else if ((!no_iommu && end_pfn >= 0xffffffff>>PAGE_SHIFT) || - force_iommu || - valid_agp || - fallback_aper_force) { - printk("Your BIOS doesn't leave a aperture memory hole\n"); - printk("Please enable the IOMMU option in the BIOS setup\n"); - printk("This costs you %d MB of RAM\n", - 32 << fallback_aper_order); - - aper_order = fallback_aper_order; - aper_alloc = allocate_aperture(); - if (!aper_alloc) { - /* Could disable AGP and IOMMU here, but it's probably - not worth it. But the later users cannot deal with - bad apertures and turning on the aperture over memory - causes very strange problems, so it's better to - panic early. */ - panic("Not enough memory for aperture"); - } - } else { - return; - } - - /* Fix up the north bridges */ - for (num = 24; num < 32; num++) { - if (read_pci_config(0, num, 3, 0x00) != NB_ID_3) - continue; - - /* Don't enable translation yet. That is done later. - Assume this BIOS didn't initialise the GART so - just overwrite all previous bits */ - write_pci_config(0, num, 3, 0x90, aper_order<<1); - write_pci_config(0, num, 3, 0x94, aper_alloc>>25); - } -} diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c deleted file mode 100644 index 375d369570c..00000000000 --- a/arch/x86_64/kernel/apic.c +++ /dev/null @@ -1,1125 +0,0 @@ -/* - * Local APIC handling, local APIC timers - * - * (c) 1999, 2000 Ingo Molnar <mingo@redhat.com> - * - * Fixes - * Maciej W. Rozycki : Bits for genuine 82489DX APICs; - * thanks to Eric Gilmore - * and Rolf G. Tews - * for testing these extensively. - * Maciej W. Rozycki : Various updates and fixes. - * Mikael Pettersson : Power Management for UP-APIC. - * Pavel Machek and - * Mikael Pettersson : PM converted to driver model. - */ - -#include <linux/config.h> -#include <linux/init.h> - -#include <linux/mm.h> -#include <linux/irq.h> -#include <linux/delay.h> -#include <linux/bootmem.h> -#include <linux/smp_lock.h> -#include <linux/interrupt.h> -#include <linux/mc146818rtc.h> -#include <linux/kernel_stat.h> -#include <linux/sysdev.h> - -#include <asm/atomic.h> -#include <asm/smp.h> -#include <asm/mtrr.h> -#include <asm/mpspec.h> -#include <asm/pgalloc.h> -#include <asm/mach_apic.h> -#include <asm/nmi.h> - -int apic_verbosity; - -int disable_apic_timer __initdata; - -/* Using APIC to generate smp_local_timer_interrupt? */ -int using_apic_timer = 0; - -static DEFINE_PER_CPU(int, prof_multiplier) = 1; -static DEFINE_PER_CPU(int, prof_old_multiplier) = 1; -static DEFINE_PER_CPU(int, prof_counter) = 1; - -static void apic_pm_activate(void); - -void enable_NMI_through_LVT0 (void * dummy) -{ - unsigned int v, ver; - - ver = apic_read(APIC_LVR); - ver = GET_APIC_VERSION(ver); - v = APIC_DM_NMI; /* unmask and set to NMI */ - apic_write_around(APIC_LVT0, v); -} - -int get_maxlvt(void) -{ - unsigned int v, ver, maxlvt; - - v = apic_read(APIC_LVR); - ver = GET_APIC_VERSION(v); - maxlvt = GET_APIC_MAXLVT(v); - return maxlvt; -} - -void clear_local_APIC(void) -{ - int maxlvt; - unsigned int v; - - maxlvt = get_maxlvt(); - - /* - * Masking an LVT entry on a P6 can trigger a local APIC error - * if the vector is zero. Mask LVTERR first to prevent this. - */ - if (maxlvt >= 3) { - v = ERROR_APIC_VECTOR; /* any non-zero vector will do */ - apic_write_around(APIC_LVTERR, v | APIC_LVT_MASKED); - } - /* - * Careful: we have to set masks only first to deassert - * any level-triggered sources. - */ - v = apic_read(APIC_LVTT); - apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED); - v = apic_read(APIC_LVT0); - apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED); - v = apic_read(APIC_LVT1); - apic_write_around(APIC_LVT1, v | APIC_LVT_MASKED); - if (maxlvt >= 4) { - v = apic_read(APIC_LVTPC); - apic_write_around(APIC_LVTPC, v | APIC_LVT_MASKED); - } - - /* - * Clean APIC state for other OSs: - */ - apic_write_around(APIC_LVTT, APIC_LVT_MASKED); - apic_write_around(APIC_LVT0, APIC_LVT_MASKED); - apic_write_around(APIC_LVT1, APIC_LVT_MASKED); - if (maxlvt >= 3) - apic_write_around(APIC_LVTERR, APIC_LVT_MASKED); - if (maxlvt >= 4) - apic_write_around(APIC_LVTPC, APIC_LVT_MASKED); - v = GET_APIC_VERSION(apic_read(APIC_LVR)); - if (APIC_INTEGRATED(v)) { /* !82489DX */ - if (maxlvt > 3) /* Due to Pentium errata 3AP and 11AP. */ - apic_write(APIC_ESR, 0); - apic_read(APIC_ESR); - } -} - -void __init connect_bsp_APIC(void) -{ - if (pic_mode) { - /* - * Do not trust the local APIC being empty at bootup. - */ - clear_local_APIC(); - /* - * PIC mode, enable APIC mode in the IMCR, i.e. - * connect BSP's local APIC to INT and NMI lines. - */ - apic_printk(APIC_VERBOSE, "leaving PIC mode, enabling APIC mode.\n"); - outb(0x70, 0x22); - outb(0x01, 0x23); - } -} - -void disconnect_bsp_APIC(int virt_wire_setup) -{ - if (pic_mode) { - /* - * Put the board back into PIC mode (has an effect - * only on certain older boards). Note that APIC - * interrupts, including IPIs, won't work beyond - * this point! The only exception are INIT IPIs. - */ - apic_printk(APIC_QUIET, "disabling APIC mode, entering PIC mode.\n"); - outb(0x70, 0x22); - outb(0x00, 0x23); - } - else { - /* Go back to Virtual Wire compatibility mode */ - unsigned long value; - - /* For the spurious interrupt use vector F, and enable it */ - value = apic_read(APIC_SPIV); - value &= ~APIC_VECTOR_MASK; - value |= APIC_SPIV_APIC_ENABLED; - value |= 0xf; - apic_write_around(APIC_SPIV, value); - - if (!virt_wire_setup) { - /* For LVT0 make it edge triggered, active high, external and enabled */ - value = apic_read(APIC_LVT0); - value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | - APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | - APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED ); - value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; - value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); - apic_write_around(APIC_LVT0, value); - } - else { - /* Disable LVT0 */ - apic_write_around(APIC_LVT0, APIC_LVT_MASKED); - } - - /* For LVT1 make it edge triggered, active high, nmi and enabled */ - value = apic_read(APIC_LVT1); - value &= ~( - APIC_MODE_MASK | APIC_SEND_PENDING | - APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | - APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); - value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; - value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); - apic_write_around(APIC_LVT1, value); - } -} - -void disable_local_APIC(void) -{ - unsigned int value; - - clear_local_APIC(); - - /* - * Disable APIC (implies clearing of registers - * for 82489DX!). - */ - value = apic_read(APIC_SPIV); - value &= ~APIC_SPIV_APIC_ENABLED; - apic_write_around(APIC_SPIV, value); -} - -/* - * This is to verify that we're looking at a real local APIC. - * Check these against your board if the CPUs aren't getting - * started for no apparent reason. - */ -int __init verify_local_APIC(void) -{ - unsigned int reg0, reg1; - - /* - * The version register is read-only in a real APIC. - */ - reg0 = apic_read(APIC_LVR); - apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg0); - apic_write(APIC_LVR, reg0 ^ APIC_LVR_MASK); - reg1 = apic_read(APIC_LVR); - apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg1); - - /* - * The two version reads above should print the same - * numbers. If the second one is different, then we - * poke at a non-APIC. - */ - if (reg1 != reg0) - return 0; - - /* - * Check if the version looks reasonably. - */ - reg1 = GET_APIC_VERSION(reg0); - if (reg1 == 0x00 || reg1 == 0xff) - return 0; - reg1 = get_maxlvt(); - if (reg1 < 0x02 || reg1 == 0xff) - return 0; - - /* - * The ID register is read/write in a real APIC. - */ - reg0 = apic_read(APIC_ID); - apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0); - apic_write(APIC_ID, reg0 ^ APIC_ID_MASK); - reg1 = apic_read(APIC_ID); - apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1); - apic_write(APIC_ID, reg0); - if (reg1 != (reg0 ^ APIC_ID_MASK)) - return 0; - - /* - * The next two are just to see if we have sane values. - * They're only really relevant if we're in Virtual Wire - * compatibility mode, but most boxes are anymore. - */ - reg0 = apic_read(APIC_LVT0); - apic_printk(APIC_DEBUG,"Getting LVT0: %x\n", reg0); - reg1 = apic_read(APIC_LVT1); - apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1); - - return 1; -} - -void __init sync_Arb_IDs(void) -{ - /* Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 */ - unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR)); - if (ver >= 0x14) /* P4 or higher */ - return; - - /* - * Wait for idle. - */ - apic_wait_icr_idle(); - - apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n"); - apic_write_around(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG - | APIC_DM_INIT); -} - -extern void __error_in_apic_c (void); - -/* - * An initial setup of the virtual wire mode. - */ -void __init init_bsp_APIC(void) -{ - unsigned int value, ver; - - /* - * Don't do the setup now if we have a SMP BIOS as the - * through-I/O-APIC virtual wire mode might be active. - */ - if (smp_found_config || !cpu_has_apic) - return; - - value = apic_read(APIC_LVR); - ver = GET_APIC_VERSION(value); - - /* - * Do not trust the local APIC being empty at bootup. - */ - clear_local_APIC(); - - /* - * Enable APIC. - */ - value = apic_read(APIC_SPIV); - value &= ~APIC_VECTOR_MASK; - value |= APIC_SPIV_APIC_ENABLED; - value |= APIC_SPIV_FOCUS_DISABLED; - value |= SPURIOUS_APIC_VECTOR; - apic_write_around(APIC_SPIV, value); - - /* - * Set up the virtual wire mode. - */ - apic_write_around(APIC_LVT0, APIC_DM_EXTINT); - value = APIC_DM_NMI; - if (!APIC_INTEGRATED(ver)) /* 82489DX */ - value |= APIC_LVT_LEVEL_TRIGGER; - apic_write_around(APIC_LVT1, value); -} - -void __cpuinit setup_local_APIC (void) -{ - unsigned int value, ver, maxlvt; - - /* Pound the ESR really hard over the head with a big hammer - mbligh */ - if (esr_disable) { - apic_write(APIC_ESR, 0); - apic_write(APIC_ESR, 0); - apic_write(APIC_ESR, 0); - apic_write(APIC_ESR, 0); - } - - value = apic_read(APIC_LVR); - ver = GET_APIC_VERSION(value); - - if ((SPURIOUS_APIC_VECTOR & 0x0f) != 0x0f) - __error_in_apic_c(); - - /* - * Double-check whether this APIC is really registered. - * This is meaningless in clustered apic mode, so we skip it. - */ - if (!apic_id_registered()) - BUG(); - - /* - * Intel recommends to set DFR, LDR and TPR before enabling - * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel - * document number 292116). So here it goes... - */ - init_apic_ldr(); - - /* - * Set Task Priority to 'accept all'. We never change this - * later on. - */ - value = apic_read(APIC_TASKPRI); - value &= ~APIC_TPRI_MASK; - apic_write_around(APIC_TASKPRI, value); - - /* - * Now that we are all set up, enable the APIC - */ - value = apic_read(APIC_SPIV); - value &= ~APIC_VECTOR_MASK; - /* - * Enable APIC - */ - value |= APIC_SPIV_APIC_ENABLED; - - /* - * Some unknown Intel IO/APIC (or APIC) errata is biting us with - * certain networking cards. If high frequency interrupts are - * happening on a particular IOAPIC pin, plus the IOAPIC routing - * entry is masked/unmasked at a high rate as well then sooner or - * later IOAPIC line gets 'stuck', no more interrupts are received - * from the device. If focus CPU is disabled then the hang goes - * away, oh well :-( - * - * [ This bug can be reproduced easily with a level-triggered - * PCI Ne2000 networking cards and PII/PIII processors, dual - * BX chipset. ] - */ - /* - * Actually disabling the focus CPU check just makes the hang less - * frequent as it makes the interrupt distributon model be more - * like LRU than MRU (the short-term load is more even across CPUs). - * See also the comment in end_level_ioapic_irq(). --macro - */ -#if 1 - /* Enable focus processor (bit==0) */ - value &= ~APIC_SPIV_FOCUS_DISABLED; -#else - /* Disable focus processor (bit==1) */ - value |= APIC_SPIV_FOCUS_DISABLED; -#endif - /* - * Set spurious IRQ vector - */ - value |= SPURIOUS_APIC_VECTOR; - apic_write_around(APIC_SPIV, value); - - /* - * Set up LVT0, LVT1: - * - * set up through-local-APIC on the BP's LINT0. This is not - * strictly necessary in pure symmetric-IO mode, but sometimes - * we delegate interrupts to the 8259A. - */ - /* - * TODO: set up through-local-APIC from through-I/O-APIC? --macro - */ - value = apic_read(APIC_LVT0) & APIC_LVT_MASKED; - if (!smp_processor_id() && (pic_mode || !value)) { - value = APIC_DM_EXTINT; - apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n", smp_processor_id()); - } else { - value = APIC_DM_EXTINT | APIC_LVT_MASKED; - apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n", smp_processor_id()); - } - apic_write_around(APIC_LVT0, value); - - /* - * only the BP should see the LINT1 NMI signal, obviously. - */ - if (!smp_processor_id()) - value = APIC_DM_NMI; - else - value = APIC_DM_NMI | APIC_LVT_MASKED; - if (!APIC_INTEGRATED(ver)) /* 82489DX */ - value |= APIC_LVT_LEVEL_TRIGGER; - apic_write_around(APIC_LVT1, value); - - if (APIC_INTEGRATED(ver) && !esr_disable) { /* !82489DX */ - unsigned oldvalue; - maxlvt = get_maxlvt(); - if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ - apic_write(APIC_ESR, 0); - oldvalue = apic_read(APIC_ESR); - value = ERROR_APIC_VECTOR; // enables sending errors - apic_write_around(APIC_LVTERR, value); - /* - * spec says clear errors after enabling vector. - */ - if (maxlvt > 3) - apic_write(APIC_ESR, 0); - value = apic_read(APIC_ESR); - if (value != oldvalue) - apic_printk(APIC_VERBOSE, - "ESR value after enabling vector: %08x, after %08x\n", - oldvalue, value); - } else { - if (esr_disable) - /* - * Something untraceble is creating bad interrupts on - * secondary quads ... for the moment, just leave the - * ESR disabled - we can't do anything useful with the - * errors anyway - mbligh - */ - apic_printk(APIC_DEBUG, "Leaving ESR disabled.\n"); - else - apic_printk(APIC_DEBUG, "No ESR for 82489DX.\n"); - } - - nmi_watchdog_default(); - if (nmi_watchdog == NMI_LOCAL_APIC) - setup_apic_nmi_watchdog(); - apic_pm_activate(); -} - -#ifdef CONFIG_PM - -static struct { - /* 'active' is true if the local APIC was enabled by us and - not the BIOS; this signifies that we are also responsible - for disabling it before entering apm/acpi suspend */ - int active; - /* r/w apic fields */ - unsigned int apic_id; - unsigned int apic_taskpri; - unsigned int apic_ldr; - unsigned int apic_dfr; - unsigned int apic_spiv; - unsigned int apic_lvtt; - unsigned int apic_lvtpc; - unsigned int apic_lvt0; - unsigned int apic_lvt1; - unsigned int apic_lvterr; - unsigned int apic_tmict; - unsigned int apic_tdcr; - unsigned int apic_thmr; -} apic_pm_state; - -static int lapic_suspend(struct sys_device *dev, pm_message_t state) -{ - unsigned long flags; - - if (!apic_pm_state.active) - return 0; - - apic_pm_state.apic_id = apic_read(APIC_ID); - apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI); - apic_pm_state.apic_ldr = apic_read(APIC_LDR); - apic_pm_state.apic_dfr = apic_read(APIC_DFR); - apic_pm_state.apic_spiv = apic_read(APIC_SPIV); - apic_pm_state.apic_lvtt = apic_read(APIC_LVTT); - apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC); - apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0); - apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1); - apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); - apic_pm_state.apic_tmict = apic_read(APIC_TMICT); - apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); - apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); - local_save_flags(flags); - local_irq_disable(); - disable_local_APIC(); - local_irq_restore(flags); - return 0; -} - -static int lapic_resume(struct sys_device *dev) -{ - unsigned int l, h; - unsigned long flags; - - if (!apic_pm_state.active) - return 0; - - /* XXX: Pavel needs this for S3 resume, but can't explain why */ - set_fixmap_nocache(FIX_APIC_BASE, APIC_DEFAULT_PHYS_BASE); - - local_irq_save(flags); - rdmsr(MSR_IA32_APICBASE, l, h); - l &= ~MSR_IA32_APICBASE_BASE; - l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE; - wrmsr(MSR_IA32_APICBASE, l, h); - apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED); - apic_write(APIC_ID, apic_pm_state.apic_id); - apic_write(APIC_DFR, apic_pm_state.apic_dfr); - apic_write(APIC_LDR, apic_pm_state.apic_ldr); - apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri); - apic_write(APIC_SPIV, apic_pm_state.apic_spiv); - apic_write(APIC_LVT0, apic_pm_state.apic_lvt0); - apic_write(APIC_LVT1, apic_pm_state.apic_lvt1); - apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); - apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc); - apic_write(APIC_LVTT, apic_pm_state.apic_lvtt); - apic_write(APIC_TDCR, apic_pm_state.apic_tdcr); - apic_write(APIC_TMICT, apic_pm_state.apic_tmict); - apic_write(APIC_ESR, 0); - apic_read(APIC_ESR); - apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr); - apic_write(APIC_ESR, 0); - apic_read(APIC_ESR); - local_irq_restore(flags); - return 0; -} - -static struct sysdev_class lapic_sysclass = { - set_kset_name("lapic"), - .resume = lapic_resume, - .suspend = lapic_suspend, -}; - -static struct sys_device device_lapic = { - .id = 0, - .cls = &lapic_sysclass, -}; - -static void __cpuinit apic_pm_activate(void) -{ - apic_pm_state.active = 1; -} - -static int __init init_lapic_sysfs(void) -{ - int error; - if (!cpu_has_apic) - return 0; - /* XXX: remove suspend/resume procs if !apic_pm_state.active? */ - error = sysdev_class_register(&lapic_sysclass); - if (!error) - error = sysdev_register(&device_lapic); - return error; -} -device_initcall(init_lapic_sysfs); - -#else /* CONFIG_PM */ - -static void apic_pm_activate(void) { } - -#endif /* CONFIG_PM */ - -static int __init apic_set_verbosity(char *str) -{ - if (strcmp("debug", str) == 0) - apic_verbosity = APIC_DEBUG; - else if (strcmp("verbose", str) == 0) - apic_verbosity = APIC_VERBOSE; - else - printk(KERN_WARNING "APIC Verbosity level %s not recognised" - " use apic=verbose or apic=debug", str); - - return 0; -} - -__setup("apic=", apic_set_verbosity); - -/* - * Detect and enable local APICs on non-SMP boards. - * Original code written by Keir Fraser. - * On AMD64 we trust the BIOS - if it says no APIC it is likely - * not correctly set up (usually the APIC timer won't work etc.) - */ - -static int __init detect_init_APIC (void) -{ - if (!cpu_has_apic) { - printk(KERN_INFO "No local APIC present\n"); - return -1; - } - - mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; - boot_cpu_id = 0; - return 0; -} - -void __init init_apic_mappings(void) -{ - unsigned long apic_phys; - - /* - * If no local APIC can be found then set up a fake all - * zeroes page to simulate the local APIC and another - * one for the IO-APIC. - */ - if (!smp_found_config && detect_init_APIC()) { - apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE); - apic_phys = __pa(apic_phys); - } else - apic_phys = mp_lapic_addr; - - set_fixmap_nocache(FIX_APIC_BASE, apic_phys); - apic_printk(APIC_VERBOSE,"mapped APIC to %16lx (%16lx)\n", APIC_BASE, apic_phys); - - /* - * Fetch the APIC ID of the BSP in case we have a - * default configuration (or the MP table is broken). - */ - if (boot_cpu_id == -1U) - boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID)); - -#ifdef CONFIG_X86_IO_APIC - { - unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; - int i; - - for (i = 0; i < nr_ioapics; i++) { - if (smp_found_config) { - ioapic_phys = mp_ioapics[i].mpc_apicaddr; - } else { - ioapic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE); - ioapic_phys = __pa(ioapic_phys); - } - set_fixmap_nocache(idx, ioapic_phys); - apic_printk(APIC_VERBOSE,"mapped IOAPIC to %016lx (%016lx)\n", - __fix_to_virt(idx), ioapic_phys); - idx++; - } - } -#endif -} - -/* - * This function sets up the local APIC timer, with a timeout of - * 'clocks' APIC bus clock. During calibration we actually call - * this function twice on the boot CPU, once with a bogus timeout - * value, second time for real. The other (noncalibrating) CPUs - * call this function only once, with the real, calibrated value. - * - * We do reads before writes even if unnecessary, to get around the - * P5 APIC double write bug. - */ - -#define APIC_DIVISOR 16 - -static void __setup_APIC_LVTT(unsigned int clocks) -{ - unsigned int lvtt_value, tmp_value, ver; - - ver = GET_APIC_VERSION(apic_read(APIC_LVR)); - lvtt_value = APIC_LVT_TIMER_PERIODIC | LOCAL_TIMER_VECTOR; - if (!APIC_INTEGRATED(ver)) - lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV); - apic_write_around(APIC_LVTT, lvtt_value); - - /* - * Divide PICLK by 16 - */ - tmp_value = apic_read(APIC_TDCR); - apic_write_around(APIC_TDCR, (tmp_value - & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) - | APIC_TDR_DIV_16); - - apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR); -} - -static void setup_APIC_timer(unsigned int clocks) -{ - unsigned long flags; - - local_irq_save(flags); - - /* For some reasons this doesn't work on Simics, so fake it for now */ - if (!strstr(boot_cpu_data.x86_model_id, "Screwdriver")) { - __setup_APIC_LVTT(clocks); - return; - } - - /* wait for irq slice */ - if (vxtime.hpet_address) { - int trigger = hpet_readl(HPET_T0_CMP); - while (hpet_readl(HPET_COUNTER) >= trigger) - /* do nothing */ ; - while (hpet_readl(HPET_COUNTER) < trigger) - /* do nothing */ ; - } else { - int c1, c2; - outb_p(0x00, 0x43); - c2 = inb_p(0x40); - c2 |= inb_p(0x40) << 8; - do { - c1 = c2; - outb_p(0x00, 0x43); - c2 = inb_p(0x40); - c2 |= inb_p(0x40) << 8; - } while (c2 - c1 < 300); - } - - __setup_APIC_LVTT(clocks); - - local_irq_restore(flags); -} - -/* - * In this function we calibrate APIC bus clocks to the external - * timer. Unfortunately we cannot use jiffies and the timer irq - * to calibrate, since some later bootup code depends on getting - * the first irq? Ugh. - * - * We want to do the calibration only once since we - * want to have local timer irqs syncron. CPUs connected - * by the same APIC bus have the very same bus frequency. - * And we want to have irqs off anyways, no accidental - * APIC irq that way. - */ - -#define TICK_COUNT 100000000 - -static int __init calibrate_APIC_clock(void) -{ - int apic, apic_start, tsc, tsc_start; - int result; - /* - * Put whatever arbitrary (but long enough) timeout - * value into the APIC clock, we just want to get the - * counter running for calibration. - */ - __setup_APIC_LVTT(1000000000); - - apic_start = apic_read(APIC_TMCCT); - rdtscl(tsc_start); - - do { - apic = apic_read(APIC_TMCCT); - rdtscl(tsc); - } while ((tsc - tsc_start) < TICK_COUNT && (apic - apic_start) < TICK_COUNT); - - result = (apic_start - apic) * 1000L * cpu_khz / (tsc - tsc_start); - - printk(KERN_INFO "Detected %d.%03d MHz APIC timer.\n", - result / 1000 / 1000, result / 1000 % 1000); - - return result * APIC_DIVISOR / HZ; -} - -static unsigned int calibration_result; - -void __init setup_boot_APIC_clock (void) -{ - if (disable_apic_timer) { - printk(KERN_INFO "Disabling APIC timer\n"); - return; - } - - printk(KERN_INFO "Using local APIC timer interrupts.\n"); - using_apic_timer = 1; - - local_irq_disable(); - - calibration_result = calibrate_APIC_clock(); - /* - * Now set up the timer for real. - */ - setup_APIC_timer(calibration_result); - - local_irq_enable(); -} - -void __cpuinit setup_secondary_APIC_clock(void) -{ - local_irq_disable(); /* FIXME: Do we need this? --RR */ - setup_APIC_timer(calibration_result); - local_irq_enable(); -} - -void __cpuinit disable_APIC_timer(void) -{ - if (using_apic_timer) { - unsigned long v; - - v = apic_read(APIC_LVTT); - apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED); - } -} - -void enable_APIC_timer(void) -{ - if (using_apic_timer) { - unsigned long v; - - v = apic_read(APIC_LVTT); - apic_write_around(APIC_LVTT, v & ~APIC_LVT_MASKED); - } -} - -/* - * the frequency of the profiling timer can be changed - * by writing a multiplier value into /proc/profile. - */ -int setup_profiling_timer(unsigned int multiplier) -{ - int i; - - /* - * Sanity check. [at least 500 APIC cycles should be - * between APIC interrupts as a rule of thumb, to avoid - * irqs flooding us] - */ - if ( (!multiplier) || (calibration_result/multiplier < 500)) - return -EINVAL; - - /* - * Set the new multiplier for each CPU. CPUs don't start using the - * new values until the next timer interrupt in which they do process - * accounting. At that time they also adjust their APIC timers - * accordingly. - */ - for (i = 0; i < NR_CPUS; ++i) - per_cpu(prof_multiplier, i) = multiplier; - - return 0; -} - -#undef APIC_DIVISOR - -/* - * Local timer interrupt handler. It does both profiling and - * process statistics/rescheduling. - * - * We do profiling in every local tick, statistics/rescheduling - * happen only every 'profiling multiplier' ticks. The default - * multiplier is 1 and it can be changed by writing the new multiplier - * value into /proc/profile. - */ - -void smp_local_timer_interrupt(struct pt_regs *regs) -{ - int cpu = smp_processor_id(); - - profile_tick(CPU_PROFILING, regs); - if (--per_cpu(prof_counter, cpu) <= 0) { - /* - * The multiplier may have changed since the last time we got - * to this point as a result of the user writing to - * /proc/profile. In this case we need to adjust the APIC - * timer accordingly. - * - * Interrupts are already masked off at this point. - */ - per_cpu(prof_counter, cpu) = per_cpu(prof_multiplier, cpu); - if (per_cpu(prof_counter, cpu) != - per_cpu(prof_old_multiplier, cpu)) { - __setup_APIC_LVTT(calibration_result/ - per_cpu(prof_counter, cpu)); - per_cpu(prof_old_multiplier, cpu) = - per_cpu(prof_counter, cpu); - } - -#ifdef CONFIG_SMP - update_process_times(user_mode(regs)); -#endif - } - - /* - * We take the 'long' return path, and there every subsystem - * grabs the appropriate locks (kernel lock/ irq lock). - * - * we might want to decouple profiling from the 'long path', - * and do the profiling totally in assembly. - * - * Currently this isn't too much of an issue (performance wise), - * we can take more than 100K local irqs per second on a 100 MHz P5. - */ -} - -/* - * Local APIC timer interrupt. This is the most natural way for doing - * local interrupts, but local timer interrupts can be emulated by - * broadcast interrupts too. [in case the hw doesn't support APIC timers] - * - * [ if a single-CPU system runs an SMP kernel then we call the local - * interrupt as well. Thus we cannot inline the local irq ... ] - */ -void smp_apic_timer_interrupt(struct pt_regs *regs) -{ - /* - * the NMI deadlock-detector uses this. - */ - add_pda(apic_timer_irqs, 1); - - /* - * NOTE! We'd better ACK the irq immediately, - * because timer handling can be slow. - */ - ack_APIC_irq(); - /* - * update_process_times() expects us to have done irq_enter(). - * Besides, if we don't timer interrupts ignore the global - * interrupt lock, which is the WrongThing (tm) to do. - */ - irq_enter(); - smp_local_timer_interrupt(regs); - irq_exit(); -} - -/* - * oem_force_hpet_timer -- force HPET mode for some boxes. - * - * Thus far, the major user of this is IBM's Summit2 series: - * - * Clustered boxes may have unsynced TSC problems if they are - * multi-chassis. Use available data to take a good guess. - * If in doubt, go HPET. - */ -__init int oem_force_hpet_timer(void) -{ - int i, clusters, zeros; - unsigned id; - DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS); - - bitmap_zero(clustermap, NUM_APIC_CLUSTERS); - - for (i = 0; i < NR_CPUS; i++) { - id = bios_cpu_apicid[i]; - if (id != BAD_APICID) - __set_bit(APIC_CLUSTERID(id), clustermap); - } - - /* Problem: Partially populated chassis may not have CPUs in some of - * the APIC clusters they have been allocated. Only present CPUs have - * bios_cpu_apicid entries, thus causing zeroes in the bitmap. Since - * clusters are allocated sequentially, count zeros only if they are - * bounded by ones. - */ - clusters = 0; - zeros = 0; - for (i = 0; i < NUM_APIC_CLUSTERS; i++) { - if (test_bit(i, clustermap)) { - clusters += 1 + zeros; - zeros = 0; - } else - ++zeros; - } - - /* - * If clusters > 2, then should be multi-chassis. Return 1 for HPET. - * Else return 0 to use TSC. - * May have to revisit this when multi-core + hyperthreaded CPUs come - * out, but AFAIK this will work even for them. - */ - return (clusters > 2); -} - -/* - * This interrupt should _never_ happen with our APIC/SMP architecture - */ -asmlinkage void smp_spurious_interrupt(void) -{ - unsigned int v; - irq_enter(); - /* - * Check if this really is a spurious interrupt and ACK it - * if it is a vectored one. Just in case... - * Spurious interrupts should not be ACKed. - */ - v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1)); - if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f))) - ack_APIC_irq(); - -#if 0 - static unsigned long last_warning; - static unsigned long skipped; - - /* see sw-dev-man vol 3, chapter 7.4.13.5 */ - if (time_before(last_warning+30*HZ,jiffies)) { - printk(KERN_INFO "spurious APIC interrupt on CPU#%d, %ld skipped.\n", - smp_processor_id(), skipped); - last_warning = jiffies; - skipped = 0; - } else { - skipped++; - } -#endif - irq_exit(); -} - -/* - * This interrupt should never happen with our APIC/SMP architecture - */ - -asmlinkage void smp_error_interrupt(void) -{ - unsigned int v, v1; - - irq_enter(); - /* First tickle the hardware, only then report what went on. -- REW */ - v = apic_read(APIC_ESR); - apic_write(APIC_ESR, 0); - v1 = apic_read(APIC_ESR); - ack_APIC_irq(); - atomic_inc(&irq_err_count); - - /* Here is what the APIC error bits mean: - 0: Send CS error - 1: Receive CS error - 2: Send accept error - 3: Receive accept error - 4: Reserved - 5: Send illegal vector - 6: Received illegal vector - 7: Illegal register address - */ - printk (KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n", - smp_processor_id(), v , v1); - irq_exit(); -} - -int disable_apic; - -/* - * This initializes the IO-APIC and APIC hardware if this is - * a UP kernel. - */ -int __init APIC_init_uniprocessor (void) -{ - if (disable_apic) { - printk(KERN_INFO "Apic disabled\n"); - return -1; - } - if (!cpu_has_apic) { - disable_apic = 1; - printk(KERN_INFO "Apic disabled by BIOS\n"); - return -1; - } - - verify_local_APIC(); - - connect_bsp_APIC(); - - phys_cpu_present_map = physid_mask_of_physid(0); - apic_write_around(APIC_ID, boot_cpu_id); - - setup_local_APIC(); - -#ifdef CONFIG_X86_IO_APIC - if (smp_found_config && !skip_ioapic_setup && nr_ioapics) - setup_IO_APIC(); - else - nr_ioapics = 0; -#endif - setup_boot_APIC_clock(); - check_nmi_watchdog(); - return 0; -} - -static __init int setup_disableapic(char *str) -{ - disable_apic = 1; - return 0; -} - -static __init int setup_nolapic(char *str) -{ - disable_apic = 1; - return 0; -} - -static __init int setup_noapictimer(char *str) -{ - disable_apic_timer = 1; - return 0; -} - -/* dummy parsing: see setup.c */ - -__setup("disableapic", setup_disableapic); -__setup("nolapic", setup_nolapic); /* same as disableapic, for compatibility */ - -__setup("noapictimer", setup_noapictimer); - -/* no "lapic" flag - we only use the lapic when the BIOS tells us so. */ diff --git a/arch/x86_64/kernel/asm-offsets.c b/arch/x86_64/kernel/asm-offsets.c deleted file mode 100644 index 35b4c3fcbb3..00000000000 --- a/arch/x86_64/kernel/asm-offsets.c +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Generate definitions needed by assembly language modules. - * This code generates raw asm output which is post-processed to extract - * and format the required data. - */ - -#include <linux/sched.h> -#include <linux/stddef.h> -#include <linux/errno.h> -#include <linux/hardirq.h> -#include <linux/suspend.h> -#include <asm/pda.h> -#include <asm/processor.h> -#include <asm/segment.h> -#include <asm/thread_info.h> -#include <asm/ia32.h> - -#define DEFINE(sym, val) \ - asm volatile("\n->" #sym " %0 " #val : : "i" (val)) - -#define BLANK() asm volatile("\n->" : : ) - -int main(void) -{ -#define ENTRY(entry) DEFINE(tsk_ ## entry, offsetof(struct task_struct, entry)) - ENTRY(state); - ENTRY(flags); - ENTRY(thread); - ENTRY(pid); - BLANK(); -#undef ENTRY -#define ENTRY(entry) DEFINE(threadinfo_ ## entry, offsetof(struct thread_info, entry)) - ENTRY(flags); - ENTRY(addr_limit); - ENTRY(preempt_count); - BLANK(); -#undef ENTRY -#define ENTRY(entry) DEFINE(pda_ ## entry, offsetof(struct x8664_pda, entry)) - ENTRY(kernelstack); - ENTRY(oldrsp); - ENTRY(pcurrent); - ENTRY(irqrsp); - ENTRY(irqcount); - ENTRY(cpunumber); - ENTRY(irqstackptr); - BLANK(); -#undef ENTRY -#ifdef CONFIG_IA32_EMULATION -#define ENTRY(entry) DEFINE(IA32_SIGCONTEXT_ ## entry, offsetof(struct sigcontext_ia32, entry)) - ENTRY(eax); - ENTRY(ebx); - ENTRY(ecx); - ENTRY(edx); - ENTRY(esi); - ENTRY(edi); - ENTRY(ebp); - ENTRY(esp); - ENTRY(eip); - BLANK(); -#undef ENTRY - DEFINE(IA32_RT_SIGFRAME_sigcontext, - offsetof (struct rt_sigframe32, uc.uc_mcontext)); - BLANK(); -#endif - DEFINE(pbe_address, offsetof(struct pbe, address)); - DEFINE(pbe_orig_address, offsetof(struct pbe, orig_address)); - DEFINE(pbe_next, offsetof(struct pbe, next)); - return 0; -} diff --git a/arch/x86_64/kernel/cpufreq/Kconfig b/arch/x86_64/kernel/cpufreq/Kconfig deleted file mode 100644 index 81f1562e539..00000000000 --- a/arch/x86_64/kernel/cpufreq/Kconfig +++ /dev/null @@ -1,96 +0,0 @@ -# -# CPU Frequency scaling -# - -menu "CPU Frequency scaling" - -source "drivers/cpufreq/Kconfig" - -if CPU_FREQ - -comment "CPUFreq processor drivers" - -config X86_POWERNOW_K8 - tristate "AMD Opteron/Athlon64 PowerNow!" - select CPU_FREQ_TABLE - help - This adds the CPUFreq driver for mobile AMD Opteron/Athlon64 processors. - - For details, take a look at <file:Documentation/cpu-freq/>. - - If in doubt, say N. - -config X86_POWERNOW_K8_ACPI - bool - depends on X86_POWERNOW_K8 && ACPI_PROCESSOR - depends on !(X86_POWERNOW_K8 = y && ACPI_PROCESSOR = m) - default y - -config X86_SPEEDSTEP_CENTRINO - tristate "Intel Enhanced SpeedStep" - select CPU_FREQ_TABLE - depends on ACPI_PROCESSOR - help - This adds the CPUFreq driver for Enhanced SpeedStep enabled - mobile CPUs. This means Intel Pentium M (Centrino) CPUs - or 64bit enabled Intel Xeons. - - For details, take a look at <file:Documentation/cpu-freq/>. - - If in doubt, say N. - -config X86_SPEEDSTEP_CENTRINO_ACPI - bool - depends on X86_SPEEDSTEP_CENTRINO - default y - -config X86_ACPI_CPUFREQ - tristate "ACPI Processor P-States driver" - depends on ACPI_PROCESSOR - help - This driver adds a CPUFreq driver which utilizes the ACPI - Processor Performance States. - - For details, take a look at <file:Documentation/cpu-freq/>. - - If in doubt, say N. - -comment "shared options" - -config X86_ACPI_CPUFREQ_PROC_INTF - bool "/proc/acpi/processor/../performance interface (deprecated)" - depends on PROC_FS - depends on X86_ACPI_CPUFREQ || X86_SPEEDSTEP_CENTRINO_ACPI || X86_POWERNOW_K8_ACPI - help - This enables the deprecated /proc/acpi/processor/../performance - interface. While it is helpful for debugging, the generic, - cross-architecture cpufreq interfaces should be used. - - If in doubt, say N. - -config X86_P4_CLOCKMOD - tristate "Intel Pentium 4 clock modulation" - depends on EMBEDDED - help - This adds the clock modulation driver for Intel Pentium 4 / XEON - processors. When enabled it will lower CPU temperature by skipping - clocks. - - This driver should be only used in exceptional - circumstances when very low power is needed because it causes severe - slowdowns and noticeable latencies. Normally Speedstep should be used - instead. - - For details, take a look at <file:Documentation/cpu-freq/>. - - Unless you are absolutely sure say N. - - -config X86_SPEEDSTEP_LIB - tristate - default X86_P4_CLOCKMOD - -endif - -endmenu - diff --git a/arch/x86_64/kernel/cpufreq/Makefile b/arch/x86_64/kernel/cpufreq/Makefile deleted file mode 100644 index d8b59387922..00000000000 --- a/arch/x86_64/kernel/cpufreq/Makefile +++ /dev/null @@ -1,17 +0,0 @@ -# -# Reuse the i386 cpufreq drivers -# - -SRCDIR := ../../../i386/kernel/cpu/cpufreq - -obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o -obj-$(CONFIG_X86_SPEEDSTEP_CENTRINO) += speedstep-centrino.o -obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o -obj-$(CONFIG_X86_P4_CLOCKMOD) += p4-clockmod.o -obj-$(CONFIG_X86_SPEEDSTEP_LIB) += speedstep-lib.o - -powernow-k8-objs := ${SRCDIR}/powernow-k8.o -speedstep-centrino-objs := ${SRCDIR}/speedstep-centrino.o -acpi-cpufreq-objs := ${SRCDIR}/acpi-cpufreq.o -p4-clockmod-objs := ${SRCDIR}/p4-clockmod.o -speedstep-lib-objs := ${SRCDIR}/speedstep-lib.o diff --git a/arch/x86_64/kernel/crash.c b/arch/x86_64/kernel/crash.c deleted file mode 100644 index d7fa4248501..00000000000 --- a/arch/x86_64/kernel/crash.c +++ /dev/null @@ -1,35 +0,0 @@ -/* - * Architecture specific (x86_64) functions for kexec based crash dumps. - * - * Created by: Hariprasad Nellitheertha (hari@in.ibm.com) - * - * Copyright (C) IBM Corporation, 2004. All rights reserved. - * - */ - -#include <linux/init.h> -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/smp.h> -#include <linux/irq.h> -#include <linux/reboot.h> -#include <linux/kexec.h> - -#include <asm/processor.h> -#include <asm/hardirq.h> -#include <asm/nmi.h> -#include <asm/hw_irq.h> - -note_buf_t crash_notes[NR_CPUS]; - -void machine_crash_shutdown(struct pt_regs *regs) -{ - /* This function is only called after the system - * has paniced or is otherwise in a critical state. - * The minimum amount of code to allow a kexec'd kernel - * to run successfully needs to happen here. - * - * In practice this means shooting down the other cpus in - * an SMP system. - */ -} diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c deleted file mode 100644 index 6ded3a50dfe..00000000000 --- a/arch/x86_64/kernel/e820.c +++ /dev/null @@ -1,584 +0,0 @@ -/* - * Handle the memory map. - * The functions here do the job until bootmem takes over. - * $Id: e820.c,v 1.4 2002/09/19 19:25:32 ak Exp $ - * - * Getting sanitize_e820_map() in sync with i386 version by applying change: - * - Provisions for empty E820 memory regions (reported by certain BIOSes). - * Alex Achenbach <xela@slit.de>, December 2002. - * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> - * - */ -#include <linux/config.h> -#include <linux/kernel.h> -#include <linux/types.h> -#include <linux/init.h> -#include <linux/bootmem.h> -#include <linux/ioport.h> -#include <linux/string.h> -#include <linux/kexec.h> -#include <asm/page.h> -#include <asm/e820.h> -#include <asm/proto.h> -#include <asm/bootsetup.h> - -extern char _end[]; - -/* - * PFN of last memory page. - */ -unsigned long end_pfn; - -/* - * end_pfn only includes RAM, while end_pfn_map includes all e820 entries. - * The direct mapping extends to end_pfn_map, so that we can directly access - * apertures, ACPI and other tables without having to play with fixmaps. - */ -unsigned long end_pfn_map; - -/* - * Last pfn which the user wants to use. - */ -unsigned long end_user_pfn = MAXMEM>>PAGE_SHIFT; - -extern struct resource code_resource, data_resource; - -/* Check for some hardcoded bad areas that early boot is not allowed to touch */ -static inline int bad_addr(unsigned long *addrp, unsigned long size) -{ - unsigned long addr = *addrp, last = addr + size; - - /* various gunk below that needed for SMP startup */ - if (addr < 0x8000) { - *addrp = 0x8000; - return 1; - } - - /* direct mapping tables of the kernel */ - if (last >= table_start<<PAGE_SHIFT && addr < table_end<<PAGE_SHIFT) { - *addrp = table_end << PAGE_SHIFT; - return 1; - } - - /* initrd */ -#ifdef CONFIG_BLK_DEV_INITRD - if (LOADER_TYPE && INITRD_START && last >= INITRD_START && - addr < INITRD_START+INITRD_SIZE) { - *addrp = INITRD_START + INITRD_SIZE; - return 1; - } -#endif - /* kernel code + 640k memory hole (later should not be needed, but - be paranoid for now) */ - if (last >= 640*1024 && addr < __pa_symbol(&_end)) { - *addrp = __pa_symbol(&_end); - return 1; - } - /* XXX ramdisk image here? */ - return 0; -} - -int __init e820_mapped(unsigned long start, unsigned long end, unsigned type) -{ - int i; - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - if (type && ei->type != type) - continue; - if (ei->addr >= end || ei->addr + ei->size < start) - continue; - return 1; - } - return 0; -} - -/* - * Find a free area in a specific range. - */ -unsigned long __init find_e820_area(unsigned long start, unsigned long end, unsigned size) -{ - int i; - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - unsigned long addr = ei->addr, last; - if (ei->type != E820_RAM) - continue; - if (addr < start) - addr = start; - if (addr > ei->addr + ei->size) - continue; - while (bad_addr(&addr, size) && addr+size < ei->addr + ei->size) - ; - last = addr + size; - if (last > ei->addr + ei->size) - continue; - if (last > end) - continue; - return addr; - } - return -1UL; -} - -/* - * Free bootmem based on the e820 table for a node. - */ -void __init e820_bootmem_free(pg_data_t *pgdat, unsigned long start,unsigned long end) -{ - int i; - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - unsigned long last, addr; - - if (ei->type != E820_RAM || - ei->addr+ei->size <= start || - ei->addr > end) - continue; - - addr = round_up(ei->addr, PAGE_SIZE); - if (addr < start) - addr = start; - - last = round_down(ei->addr + ei->size, PAGE_SIZE); - if (last >= end) - last = end; - - if (last > addr && last-addr >= PAGE_SIZE) - free_bootmem_node(pgdat, addr, last-addr); - } -} - -/* - * Find the highest page frame number we have available - */ -unsigned long __init e820_end_of_ram(void) -{ - int i; - unsigned long end_pfn = 0; - - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - unsigned long start, end; - - start = round_up(ei->addr, PAGE_SIZE); - end = round_down(ei->addr + ei->size, PAGE_SIZE); - if (start >= end) - continue; - if (ei->type == E820_RAM) { - if (end > end_pfn<<PAGE_SHIFT) - end_pfn = end>>PAGE_SHIFT; - } else { - if (end > end_pfn_map<<PAGE_SHIFT) - end_pfn_map = end>>PAGE_SHIFT; - } - } - - if (end_pfn > end_pfn_map) - end_pfn_map = end_pfn; - if (end_pfn_map > MAXMEM>>PAGE_SHIFT) - end_pfn_map = MAXMEM>>PAGE_SHIFT; - if (end_pfn > end_user_pfn) - end_pfn = end_user_pfn; - if (end_pfn > end_pfn_map) - end_pfn = end_pfn_map; - - return end_pfn; -} - -/* - * Mark e820 reserved areas as busy for the resource manager. - */ -void __init e820_reserve_resources(void) -{ - int i; - for (i = 0; i < e820.nr_map; i++) { - struct resource *res; - res = alloc_bootmem_low(sizeof(struct resource)); - switch (e820.map[i].type) { - case E820_RAM: res->name = "System RAM"; break; - case E820_ACPI: res->name = "ACPI Tables"; break; - case E820_NVS: res->name = "ACPI Non-volatile Storage"; break; - default: res->name = "reserved"; - } - res->start = e820.map[i].addr; - res->end = res->start + e820.map[i].size - 1; - res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; - request_resource(&iomem_resource, res); - if (e820.map[i].type == E820_RAM) { - /* - * We don't know which RAM region contains kernel data, - * so we try it repeatedly and let the resource manager - * test it. - */ - request_resource(res, &code_resource); - request_resource(res, &data_resource); -#ifdef CONFIG_KEXEC - request_resource(res, &crashk_res); -#endif - } - } -} - -/* - * Add a memory region to the kernel e820 map. - */ -void __init add_memory_region(unsigned long start, unsigned long size, int type) -{ - int x = e820.nr_map; - - if (x == E820MAX) { - printk(KERN_ERR "Ooops! Too many entries in the memory map!\n"); - return; - } - - e820.map[x].addr = start; - e820.map[x].size = size; - e820.map[x].type = type; - e820.nr_map++; -} - -void __init e820_print_map(char *who) -{ - int i; - - for (i = 0; i < e820.nr_map; i++) { - printk(" %s: %016Lx - %016Lx ", who, - (unsigned long long) e820.map[i].addr, - (unsigned long long) (e820.map[i].addr + e820.map[i].size)); - switch (e820.map[i].type) { - case E820_RAM: printk("(usable)\n"); - break; - case E820_RESERVED: - printk("(reserved)\n"); - break; - case E820_ACPI: - printk("(ACPI data)\n"); - break; - case E820_NVS: - printk("(ACPI NVS)\n"); - break; - default: printk("type %u\n", e820.map[i].type); - break; - } - } -} - -/* - * Sanitize the BIOS e820 map. - * - * Some e820 responses include overlapping entries. The following - * replaces the original e820 map with a new one, removing overlaps. - * - */ -static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map) -{ - struct change_member { - struct e820entry *pbios; /* pointer to original bios entry */ - unsigned long long addr; /* address for this change point */ - }; - static struct change_member change_point_list[2*E820MAX] __initdata; - static struct change_member *change_point[2*E820MAX] __initdata; - static struct e820entry *overlap_list[E820MAX] __initdata; - static struct e820entry new_bios[E820MAX] __initdata; - struct change_member *change_tmp; - unsigned long current_type, last_type; - unsigned long long last_addr; - int chgidx, still_changing; - int overlap_entries; - int new_bios_entry; - int old_nr, new_nr, chg_nr; - int i; - - /* - Visually we're performing the following (1,2,3,4 = memory types)... - - Sample memory map (w/overlaps): - ____22__________________ - ______________________4_ - ____1111________________ - _44_____________________ - 11111111________________ - ____________________33__ - ___________44___________ - __________33333_________ - ______________22________ - ___________________2222_ - _________111111111______ - _____________________11_ - _________________4______ - - Sanitized equivalent (no overlap): - 1_______________________ - _44_____________________ - ___1____________________ - ____22__________________ - ______11________________ - _________1______________ - __________3_____________ - ___________44___________ - _____________33_________ - _______________2________ - ________________1_______ - _________________4______ - ___________________2____ - ____________________33__ - ______________________4_ - */ - - /* if there's only one memory region, don't bother */ - if (*pnr_map < 2) - return -1; - - old_nr = *pnr_map; - - /* bail out if we find any unreasonable addresses in bios map */ - for (i=0; i<old_nr; i++) - if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) - return -1; - - /* create pointers for initial change-point information (for sorting) */ - for (i=0; i < 2*old_nr; i++) - change_point[i] = &change_point_list[i]; - - /* record all known change-points (starting and ending addresses), - omitting those that are for empty memory regions */ - chgidx = 0; - for (i=0; i < old_nr; i++) { - if (biosmap[i].size != 0) { - change_point[chgidx]->addr = biosmap[i].addr; - change_point[chgidx++]->pbios = &biosmap[i]; - change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size; - change_point[chgidx++]->pbios = &biosmap[i]; - } - } - chg_nr = chgidx; - - /* sort change-point list by memory addresses (low -> high) */ - still_changing = 1; - while (still_changing) { - still_changing = 0; - for (i=1; i < chg_nr; i++) { - /* if <current_addr> > <last_addr>, swap */ - /* or, if current=<start_addr> & last=<end_addr>, swap */ - if ((change_point[i]->addr < change_point[i-1]->addr) || - ((change_point[i]->addr == change_point[i-1]->addr) && - (change_point[i]->addr == change_point[i]->pbios->addr) && - (change_point[i-1]->addr != change_point[i-1]->pbios->addr)) - ) - { - change_tmp = change_point[i]; - change_point[i] = change_point[i-1]; - change_point[i-1] = change_tmp; - still_changing=1; - } - } - } - - /* create a new bios memory map, removing overlaps */ - overlap_entries=0; /* number of entries in the overlap table */ - new_bios_entry=0; /* index for creating new bios map entries */ - last_type = 0; /* start with undefined memory type */ - last_addr = 0; /* start with 0 as last starting address */ - /* loop through change-points, determining affect on the new bios map */ - for (chgidx=0; chgidx < chg_nr; chgidx++) - { - /* keep track of all overlapping bios entries */ - if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr) - { - /* add map entry to overlap list (> 1 entry implies an overlap) */ - overlap_list[overlap_entries++]=change_point[chgidx]->pbios; - } - else - { - /* remove entry from list (order independent, so swap with last) */ - for (i=0; i<overlap_entries; i++) - { - if (overlap_list[i] == change_point[chgidx]->pbios) - overlap_list[i] = overlap_list[overlap_entries-1]; - } - overlap_entries--; - } - /* if there are overlapping entries, decide which "type" to use */ - /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */ - current_type = 0; - for (i=0; i<overlap_entries; i++) - if (overlap_list[i]->type > current_type) - current_type = overlap_list[i]->type; - /* continue building up new bios map based on this information */ - if (current_type != last_type) { - if (last_type != 0) { - new_bios[new_bios_entry].size = - change_point[chgidx]->addr - last_addr; - /* move forward only if the new size was non-zero */ - if (new_bios[new_bios_entry].size != 0) - if (++new_bios_entry >= E820MAX) - break; /* no more space left for new bios entries */ - } - if (current_type != 0) { - new_bios[new_bios_entry].addr = change_point[chgidx]->addr; - new_bios[new_bios_entry].type = current_type; - last_addr=change_point[chgidx]->addr; - } - last_type = current_type; - } - } - new_nr = new_bios_entry; /* retain count for new bios entries */ - - /* copy new bios mapping into original location */ - memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry)); - *pnr_map = new_nr; - - return 0; -} - -/* - * Copy the BIOS e820 map into a safe place. - * - * Sanity-check it while we're at it.. - * - * If we're lucky and live on a modern system, the setup code - * will have given us a memory map that we can use to properly - * set up memory. If we aren't, we'll fake a memory map. - * - * We check to see that the memory map contains at least 2 elements - * before we'll use it, because the detection code in setup.S may - * not be perfect and most every PC known to man has two memory - * regions: one from 0 to 640k, and one from 1mb up. (The IBM - * thinkpad 560x, for example, does not cooperate with the memory - * detection code.) - */ -static int __init copy_e820_map(struct e820entry * biosmap, int nr_map) -{ - /* Only one memory region (or negative)? Ignore it */ - if (nr_map < 2) - return -1; - - do { - unsigned long start = biosmap->addr; - unsigned long size = biosmap->size; - unsigned long end = start + size; - unsigned long type = biosmap->type; - - /* Overflow in 64 bits? Ignore the memory map. */ - if (start > end) - return -1; - - /* - * Some BIOSes claim RAM in the 640k - 1M region. - * Not right. Fix it up. - * - * This should be removed on Hammer which is supposed to not - * have non e820 covered ISA mappings there, but I had some strange - * problems so it stays for now. -AK - */ - if (type == E820_RAM) { - if (start < 0x100000ULL && end > 0xA0000ULL) { - if (start < 0xA0000ULL) - add_memory_region(start, 0xA0000ULL-start, type); - if (end <= 0x100000ULL) - continue; - start = 0x100000ULL; - size = end - start; - } - } - - add_memory_region(start, size, type); - } while (biosmap++,--nr_map); - return 0; -} - -void __init setup_memory_region(void) -{ - char *who = "BIOS-e820"; - - /* - * Try to copy the BIOS-supplied E820-map. - * - * Otherwise fake a memory map; one section from 0k->640k, - * the next section from 1mb->appropriate_mem_k - */ - sanitize_e820_map(E820_MAP, &E820_MAP_NR); - if (copy_e820_map(E820_MAP, E820_MAP_NR) < 0) { - unsigned long mem_size; - - /* compare results from other methods and take the greater */ - if (ALT_MEM_K < EXT_MEM_K) { - mem_size = EXT_MEM_K; - who = "BIOS-88"; - } else { - mem_size = ALT_MEM_K; - who = "BIOS-e801"; - } - - e820.nr_map = 0; - add_memory_region(0, LOWMEMSIZE(), E820_RAM); - add_memory_region(HIGH_MEMORY, mem_size << 10, E820_RAM); - } - printk(KERN_INFO "BIOS-provided physical RAM map:\n"); - e820_print_map(who); -} - -void __init parse_memopt(char *p, char **from) -{ - end_user_pfn = memparse(p, from); - end_user_pfn >>= PAGE_SHIFT; -} - -unsigned long pci_mem_start = 0xaeedbabe; - -/* - * Search for the biggest gap in the low 32 bits of the e820 - * memory space. We pass this space to PCI to assign MMIO resources - * for hotplug or unconfigured devices in. - * Hopefully the BIOS let enough space left. - */ -__init void e820_setup_gap(void) -{ - unsigned long gapstart, gapsize; - unsigned long last; - int i; - int found = 0; - - last = 0x100000000ull; - gapstart = 0x10000000; - gapsize = 0x400000; - i = e820.nr_map; - while (--i >= 0) { - unsigned long long start = e820.map[i].addr; - unsigned long long end = start + e820.map[i].size; - - /* - * Since "last" is at most 4GB, we know we'll - * fit in 32 bits if this condition is true - */ - if (last > end) { - unsigned long gap = last - end; - - if (gap > gapsize) { - gapsize = gap; - gapstart = end; - found = 1; - } - } - if (start < last) - last = start; - } - - if (!found) { - gapstart = (end_pfn << PAGE_SHIFT) + 1024*1024; - printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit address range\n" - KERN_ERR "PCI: Unassigned devices with 32bit resource registers may break!\n"); - } - - /* - * Start allocating dynamic PCI memory a bit into the gap, - * aligned up to the nearest megabyte. - * - * Question: should we try to pad it up a bit (do something - * like " + (gapsize >> 3)" in there too?). We now have the - * technology. - */ - pci_mem_start = (gapstart + 0xfffff) & ~0xfffff; - - printk(KERN_INFO "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n", - pci_mem_start, gapstart, gapsize); -} diff --git a/arch/x86_64/kernel/early_printk.c b/arch/x86_64/kernel/early_printk.c deleted file mode 100644 index 9631c747c5e..00000000000 --- a/arch/x86_64/kernel/early_printk.c +++ /dev/null @@ -1,227 +0,0 @@ -#include <linux/console.h> -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/string.h> -#include <linux/tty.h> -#include <asm/io.h> -#include <asm/processor.h> - -/* Simple VGA output */ - -#ifdef __i386__ -#include <asm/setup.h> -#define VGABASE (__ISA_IO_base + 0xb8000) -#else -#include <asm/bootsetup.h> -#define VGABASE ((void __iomem *)0xffffffff800b8000UL) -#endif - -#define MAX_YPOS max_ypos -#define MAX_XPOS max_xpos - -static int max_ypos = 25, max_xpos = 80; -static int current_ypos = 1, current_xpos = 0; - -static void early_vga_write(struct console *con, const char *str, unsigned n) -{ - char c; - int i, k, j; - - while ((c = *str++) != '\0' && n-- > 0) { - if (current_ypos >= MAX_YPOS) { - /* scroll 1 line up */ - for (k = 1, j = 0; k < MAX_YPOS; k++, j++) { - for (i = 0; i < MAX_XPOS; i++) { - writew(readw(VGABASE + 2*(MAX_XPOS*k + i)), - VGABASE + 2*(MAX_XPOS*j + i)); - } - } - for (i = 0; i < MAX_XPOS; i++) - writew(0x720, VGABASE + 2*(MAX_XPOS*j + i)); - current_ypos = MAX_YPOS-1; - } - if (c == '\n') { - current_xpos = 0; - current_ypos++; - } else if (c != '\r') { - writew(((0x7 << 8) | (unsigned short) c), - VGABASE + 2*(MAX_XPOS*current_ypos + - current_xpos++)); - if (current_xpos >= MAX_XPOS) { - current_xpos = 0; - current_ypos++; - } - } - } -} - -static struct console early_vga_console = { - .name = "earlyvga", - .write = early_vga_write, - .flags = CON_PRINTBUFFER, - .index = -1, -}; - -/* Serial functions loosely based on a similar package from Klaus P. Gerlicher */ - -static int early_serial_base = 0x3f8; /* ttyS0 */ - -#define XMTRDY 0x20 - -#define DLAB 0x80 - -#define TXR 0 /* Transmit register (WRITE) */ -#define RXR 0 /* Receive register (READ) */ -#define IER 1 /* Interrupt Enable */ -#define IIR 2 /* Interrupt ID */ -#define FCR 2 /* FIFO control */ -#define LCR 3 /* Line control */ -#define MCR 4 /* Modem control */ -#define LSR 5 /* Line Status */ -#define MSR 6 /* Modem Status */ -#define DLL 0 /* Divisor Latch Low */ -#define DLH 1 /* Divisor latch High */ - -static int early_serial_putc(unsigned char ch) -{ - unsigned timeout = 0xffff; - while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout) - cpu_relax(); - outb(ch, early_serial_base + TXR); - return timeout ? 0 : -1; -} - -static void early_serial_write(struct console *con, const char *s, unsigned n) -{ - while (*s && n-- > 0) { - early_serial_putc(*s); - if (*s == '\n') - early_serial_putc('\r'); - s++; - } -} - -#define DEFAULT_BAUD 9600 - -static __init void early_serial_init(char *s) -{ - unsigned char c; - unsigned divisor; - unsigned baud = DEFAULT_BAUD; - char *e; - - if (*s == ',') - ++s; - - if (*s) { - unsigned port; - if (!strncmp(s,"0x",2)) { - early_serial_base = simple_strtoul(s, &e, 16); - } else { - static int bases[] = { 0x3f8, 0x2f8 }; - - if (!strncmp(s,"ttyS",4)) - s += 4; - port = simple_strtoul(s, &e, 10); - if (port > 1 || s == e) - port = 0; - early_serial_base = bases[port]; - } - s += strcspn(s, ","); - if (*s == ',') - s++; - } - - outb(0x3, early_serial_base + LCR); /* 8n1 */ - outb(0, early_serial_base + IER); /* no interrupt */ - outb(0, early_serial_base + FCR); /* no fifo */ - outb(0x3, early_serial_base + MCR); /* DTR + RTS */ - - if (*s) { - baud = simple_strtoul(s, &e, 0); - if (baud == 0 || s == e) - baud = DEFAULT_BAUD; - } - - divisor = 115200 / baud; - c = inb(early_serial_base + LCR); - outb(c | DLAB, early_serial_base + LCR); - outb(divisor & 0xff, early_serial_base + DLL); - outb((divisor >> 8) & 0xff, early_serial_base + DLH); - outb(c & ~DLAB, early_serial_base + LCR); -} - -static struct console early_serial_console = { - .name = "earlyser", - .write = early_serial_write, - .flags = CON_PRINTBUFFER, - .index = -1, -}; - -/* Direct interface for emergencies */ -struct console *early_console = &early_vga_console; -static int early_console_initialized = 0; - -void early_printk(const char *fmt, ...) -{ - char buf[512]; - int n; - va_list ap; - - va_start(ap,fmt); - n = vscnprintf(buf,512,fmt,ap); - early_console->write(early_console,buf,n); - va_end(ap); -} - -static int keep_early; - -int __init setup_early_printk(char *opt) -{ - char *space; - char buf[256]; - - if (early_console_initialized) - return -1; - - opt = strchr(opt, '=') + 1; - - strlcpy(buf,opt,sizeof(buf)); - space = strchr(buf, ' '); - if (space) - *space = 0; - - if (strstr(buf,"keep")) - keep_early = 1; - - if (!strncmp(buf, "serial", 6)) { - early_serial_init(buf + 6); - early_console = &early_serial_console; - } else if (!strncmp(buf, "ttyS", 4)) { - early_serial_init(buf); - early_console = &early_serial_console; - } else if (!strncmp(buf, "vga", 3) - && SCREEN_INFO.orig_video_isVGA == 1) { - max_xpos = SCREEN_INFO.orig_video_cols; - max_ypos = SCREEN_INFO.orig_video_lines; - early_console = &early_vga_console; - } - early_console_initialized = 1; - register_console(early_console); - return 0; -} - -void __init disable_early_printk(void) -{ - if (!early_console_initialized || !early_console) - return; - if (!keep_early) { - printk("disabling early console\n"); - unregister_console(early_console); - early_console_initialized = 0; - } else { - printk("keeping early console\n"); - } -} - -__setup("earlyprintk=", setup_early_printk); diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S deleted file mode 100644 index 096d470e280..00000000000 --- a/arch/x86_64/kernel/entry.S +++ /dev/null @@ -1,932 +0,0 @@ -/* - * linux/arch/x86_64/entry.S - * - * Copyright (C) 1991, 1992 Linus Torvalds - * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs - * Copyright (C) 2000 Pavel Machek <pavel@suse.cz> - * - * $Id$ - */ - -/* - * entry.S contains the system-call and fault low-level handling routines. - * - * NOTE: This code handles signal-recognition, which happens every time - * after an interrupt and after each system call. - * - * Normal syscalls and interrupts don't save a full stack frame, this is - * only done for syscall tracing, signals or fork/exec et.al. - * - * A note on terminology: - * - top of stack: Architecture defined interrupt frame from SS to RIP - * at the top of the kernel process stack. - * - partial stack frame: partially saved registers upto R11. - * - full stack frame: Like partial stack frame, but all register saved. - * - * TODO: - * - schedule it carefully for the final hardware. - */ - -#define ASSEMBLY 1 -#include <linux/config.h> -#include <linux/linkage.h> -#include <asm/segment.h> -#include <asm/smp.h> -#include <asm/cache.h> -#include <asm/errno.h> -#include <asm/dwarf2.h> -#include <asm/calling.h> -#include <asm/offset.h> -#include <asm/msr.h> -#include <asm/unistd.h> -#include <asm/thread_info.h> -#include <asm/hw_irq.h> - - .code64 - -#ifndef CONFIG_PREEMPT -#define retint_kernel retint_restore_args -#endif - -/* - * C code is not supposed to know about undefined top of stack. Every time - * a C function with an pt_regs argument is called from the SYSCALL based - * fast path FIXUP_TOP_OF_STACK is needed. - * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs - * manipulation. - */ - - /* %rsp:at FRAMEEND */ - .macro FIXUP_TOP_OF_STACK tmp - movq %gs:pda_oldrsp,\tmp - movq \tmp,RSP(%rsp) - movq $__USER_DS,SS(%rsp) - movq $__USER_CS,CS(%rsp) - movq $-1,RCX(%rsp) - movq R11(%rsp),\tmp /* get eflags */ - movq \tmp,EFLAGS(%rsp) - .endm - - .macro RESTORE_TOP_OF_STACK tmp,offset=0 - movq RSP-\offset(%rsp),\tmp - movq \tmp,%gs:pda_oldrsp - movq EFLAGS-\offset(%rsp),\tmp - movq \tmp,R11-\offset(%rsp) - .endm - - .macro FAKE_STACK_FRAME child_rip - /* push in order ss, rsp, eflags, cs, rip */ - xorl %eax, %eax - pushq %rax /* ss */ - CFI_ADJUST_CFA_OFFSET 8 - pushq %rax /* rsp */ - CFI_ADJUST_CFA_OFFSET 8 - CFI_OFFSET rip,0 - pushq $(1<<9) /* eflags - interrupts on */ - CFI_ADJUST_CFA_OFFSET 8 - pushq $__KERNEL_CS /* cs */ - CFI_ADJUST_CFA_OFFSET 8 - pushq \child_rip /* rip */ - CFI_ADJUST_CFA_OFFSET 8 - CFI_OFFSET rip,0 - pushq %rax /* orig rax */ - CFI_ADJUST_CFA_OFFSET 8 - .endm - - .macro UNFAKE_STACK_FRAME - addq $8*6, %rsp - CFI_ADJUST_CFA_OFFSET -(6*8) - .endm - - .macro CFI_DEFAULT_STACK - CFI_ADJUST_CFA_OFFSET (SS) - CFI_OFFSET r15,R15-SS - CFI_OFFSET r14,R14-SS - CFI_OFFSET r13,R13-SS - CFI_OFFSET r12,R12-SS - CFI_OFFSET rbp,RBP-SS - CFI_OFFSET rbx,RBX-SS - CFI_OFFSET r11,R11-SS - CFI_OFFSET r10,R10-SS - CFI_OFFSET r9,R9-SS - CFI_OFFSET r8,R8-SS - CFI_OFFSET rax,RAX-SS - CFI_OFFSET rcx,RCX-SS - CFI_OFFSET rdx,RDX-SS - CFI_OFFSET rsi,RSI-SS - CFI_OFFSET rdi,RDI-SS - CFI_OFFSET rsp,RSP-SS - CFI_OFFSET rip,RIP-SS - .endm -/* - * A newly forked process directly context switches into this. - */ -/* rdi: prev */ -ENTRY(ret_from_fork) - CFI_STARTPROC - CFI_DEFAULT_STACK - call schedule_tail - GET_THREAD_INFO(%rcx) - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx) - jnz rff_trace -rff_action: - RESTORE_REST - testl $3,CS-ARGOFFSET(%rsp) # from kernel_thread? - je int_ret_from_sys_call - testl $_TIF_IA32,threadinfo_flags(%rcx) - jnz int_ret_from_sys_call - RESTORE_TOP_OF_STACK %rdi,ARGOFFSET - jmp ret_from_sys_call -rff_trace: - movq %rsp,%rdi - call syscall_trace_leave - GET_THREAD_INFO(%rcx) - jmp rff_action - CFI_ENDPROC - -/* - * System call entry. Upto 6 arguments in registers are supported. - * - * SYSCALL does not save anything on the stack and does not change the - * stack pointer. - */ - -/* - * Register setup: - * rax system call number - * rdi arg0 - * rcx return address for syscall/sysret, C arg3 - * rsi arg1 - * rdx arg2 - * r10 arg3 (--> moved to rcx for C) - * r8 arg4 - * r9 arg5 - * r11 eflags for syscall/sysret, temporary for C - * r12-r15,rbp,rbx saved by C code, not touched. - * - * Interrupts are off on entry. - * Only called from user space. - * - * XXX if we had a free scratch register we could save the RSP into the stack frame - * and report it properly in ps. Unfortunately we haven't. - */ - -ENTRY(system_call) - CFI_STARTPROC - swapgs - movq %rsp,%gs:pda_oldrsp - movq %gs:pda_kernelstack,%rsp - sti - SAVE_ARGS 8,1 - movq %rax,ORIG_RAX-ARGOFFSET(%rsp) - movq %rcx,RIP-ARGOFFSET(%rsp) - GET_THREAD_INFO(%rcx) - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx) - jnz tracesys - cmpq $__NR_syscall_max,%rax - ja badsys - movq %r10,%rcx - call *sys_call_table(,%rax,8) # XXX: rip relative - movq %rax,RAX-ARGOFFSET(%rsp) -/* - * Syscall return path ending with SYSRET (fast path) - * Has incomplete stack frame and undefined top of stack. - */ - .globl ret_from_sys_call -ret_from_sys_call: - movl $_TIF_ALLWORK_MASK,%edi - /* edi: flagmask */ -sysret_check: - GET_THREAD_INFO(%rcx) - cli - movl threadinfo_flags(%rcx),%edx - andl %edi,%edx - jnz sysret_careful - movq RIP-ARGOFFSET(%rsp),%rcx - RESTORE_ARGS 0,-ARG_SKIP,1 - movq %gs:pda_oldrsp,%rsp - swapgs - sysretq - - /* Handle reschedules */ - /* edx: work, edi: workmask */ -sysret_careful: - bt $TIF_NEED_RESCHED,%edx - jnc sysret_signal - sti - pushq %rdi - call schedule - popq %rdi - jmp sysret_check - - /* Handle a signal */ -sysret_signal: - sti - testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx - jz 1f - - /* Really a signal */ - /* edx: work flags (arg3) */ - leaq do_notify_resume(%rip),%rax - leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1 - xorl %esi,%esi # oldset -> arg2 - call ptregscall_common -1: movl $_TIF_NEED_RESCHED,%edi - jmp sysret_check - - /* Do syscall tracing */ -tracesys: - SAVE_REST - movq $-ENOSYS,RAX(%rsp) - FIXUP_TOP_OF_STACK %rdi - movq %rsp,%rdi - call syscall_trace_enter - LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */ - RESTORE_REST - cmpq $__NR_syscall_max,%rax - ja 1f - movq %r10,%rcx /* fixup for C */ - call *sys_call_table(,%rax,8) - movq %rax,RAX-ARGOFFSET(%rsp) -1: SAVE_REST - movq %rsp,%rdi - call syscall_trace_leave - RESTORE_TOP_OF_STACK %rbx - RESTORE_REST - jmp ret_from_sys_call - -badsys: - movq $-ENOSYS,RAX-ARGOFFSET(%rsp) - jmp ret_from_sys_call - -/* - * Syscall return path ending with IRET. - * Has correct top of stack, but partial stack frame. - */ -ENTRY(int_ret_from_sys_call) - cli - testl $3,CS-ARGOFFSET(%rsp) - je retint_restore_args - movl $_TIF_ALLWORK_MASK,%edi - /* edi: mask to check */ -int_with_check: - GET_THREAD_INFO(%rcx) - movl threadinfo_flags(%rcx),%edx - andl %edi,%edx - jnz int_careful - jmp retint_swapgs - - /* Either reschedule or signal or syscall exit tracking needed. */ - /* First do a reschedule test. */ - /* edx: work, edi: workmask */ -int_careful: - bt $TIF_NEED_RESCHED,%edx - jnc int_very_careful - sti - pushq %rdi - call schedule - popq %rdi - cli - jmp int_with_check - - /* handle signals and tracing -- both require a full stack frame */ -int_very_careful: - sti - SAVE_REST - /* Check for syscall exit trace */ - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx - jz int_signal - pushq %rdi - leaq 8(%rsp),%rdi # &ptregs -> arg1 - call syscall_trace_leave - popq %rdi - andl $~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edi - cli - jmp int_restore_rest - -int_signal: - testl $(_TIF_NOTIFY_RESUME|_TIF_SIGPENDING|_TIF_SINGLESTEP),%edx - jz 1f - movq %rsp,%rdi # &ptregs -> arg1 - xorl %esi,%esi # oldset -> arg2 - call do_notify_resume -1: movl $_TIF_NEED_RESCHED,%edi -int_restore_rest: - RESTORE_REST - cli - jmp int_with_check - CFI_ENDPROC - -/* - * Certain special system calls that need to save a complete full stack frame. - */ - - .macro PTREGSCALL label,func,arg - .globl \label -\label: - leaq \func(%rip),%rax - leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */ - jmp ptregscall_common - .endm - - PTREGSCALL stub_clone, sys_clone, %r8 - PTREGSCALL stub_fork, sys_fork, %rdi - PTREGSCALL stub_vfork, sys_vfork, %rdi - PTREGSCALL stub_rt_sigsuspend, sys_rt_sigsuspend, %rdx - PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx - PTREGSCALL stub_iopl, sys_iopl, %rsi - -ENTRY(ptregscall_common) - CFI_STARTPROC - popq %r11 - CFI_ADJUST_CFA_OFFSET -8 - SAVE_REST - movq %r11, %r15 - FIXUP_TOP_OF_STACK %r11 - call *%rax - RESTORE_TOP_OF_STACK %r11 - movq %r15, %r11 - RESTORE_REST - pushq %r11 - CFI_ADJUST_CFA_OFFSET 8 - ret - CFI_ENDPROC - -ENTRY(stub_execve) - CFI_STARTPROC - popq %r11 - CFI_ADJUST_CFA_OFFSET -8 - SAVE_REST - movq %r11, %r15 - FIXUP_TOP_OF_STACK %r11 - call sys_execve - GET_THREAD_INFO(%rcx) - bt $TIF_IA32,threadinfo_flags(%rcx) - jc exec_32bit - RESTORE_TOP_OF_STACK %r11 - movq %r15, %r11 - RESTORE_REST - push %r11 - ret - -exec_32bit: - CFI_ADJUST_CFA_OFFSET REST_SKIP - movq %rax,RAX(%rsp) - RESTORE_REST - jmp int_ret_from_sys_call - CFI_ENDPROC - -/* - * sigreturn is special because it needs to restore all registers on return. - * This cannot be done with SYSRET, so use the IRET return path instead. - */ -ENTRY(stub_rt_sigreturn) - CFI_STARTPROC - addq $8, %rsp - SAVE_REST - movq %rsp,%rdi - FIXUP_TOP_OF_STACK %r11 - call sys_rt_sigreturn - movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer - RESTORE_REST - jmp int_ret_from_sys_call - CFI_ENDPROC - -/* - * Interrupt entry/exit. - * - * Interrupt entry points save only callee clobbered registers in fast path. - * - * Entry runs with interrupts off. - */ - -/* 0(%rsp): interrupt number */ - .macro interrupt func - CFI_STARTPROC simple - CFI_DEF_CFA rsp,(SS-RDI) - CFI_REL_OFFSET rsp,(RSP-ORIG_RAX) - CFI_REL_OFFSET rip,(RIP-ORIG_RAX) - cld -#ifdef CONFIG_DEBUG_INFO - SAVE_ALL - movq %rsp,%rdi - /* - * Setup a stack frame pointer. This allows gdb to trace - * back to the original stack. - */ - movq %rsp,%rbp - CFI_DEF_CFA_REGISTER rbp -#else - SAVE_ARGS - leaq -ARGOFFSET(%rsp),%rdi # arg1 for handler -#endif - testl $3,CS(%rdi) - je 1f - swapgs -1: incl %gs:pda_irqcount # RED-PEN should check preempt count - movq %gs:pda_irqstackptr,%rax - cmoveq %rax,%rsp - pushq %rdi # save old stack - call \func - .endm - -ENTRY(common_interrupt) - interrupt do_IRQ - /* 0(%rsp): oldrsp-ARGOFFSET */ -ret_from_intr: - popq %rdi - cli - decl %gs:pda_irqcount -#ifdef CONFIG_DEBUG_INFO - movq RBP(%rdi),%rbp -#endif - leaq ARGOFFSET(%rdi),%rsp -exit_intr: - GET_THREAD_INFO(%rcx) - testl $3,CS-ARGOFFSET(%rsp) - je retint_kernel - - /* Interrupt came from user space */ - /* - * Has a correct top of stack, but a partial stack frame - * %rcx: thread info. Interrupts off. - */ -retint_with_reschedule: - movl $_TIF_WORK_MASK,%edi -retint_check: - movl threadinfo_flags(%rcx),%edx - andl %edi,%edx - jnz retint_careful -retint_swapgs: - swapgs -retint_restore_args: - cli - RESTORE_ARGS 0,8,0 -iret_label: - iretq - - .section __ex_table,"a" - .quad iret_label,bad_iret - .previous - .section .fixup,"ax" - /* force a signal here? this matches i386 behaviour */ - /* running with kernel gs */ -bad_iret: - movq $-9999,%rdi /* better code? */ - jmp do_exit - .previous - - /* edi: workmask, edx: work */ -retint_careful: - bt $TIF_NEED_RESCHED,%edx - jnc retint_signal - sti - pushq %rdi - call schedule - popq %rdi - GET_THREAD_INFO(%rcx) - cli - jmp retint_check - -retint_signal: - testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx - jz retint_swapgs - sti - SAVE_REST - movq $-1,ORIG_RAX(%rsp) - xorl %esi,%esi # oldset - movq %rsp,%rdi # &pt_regs - call do_notify_resume - RESTORE_REST - cli - movl $_TIF_NEED_RESCHED,%edi - GET_THREAD_INFO(%rcx) - jmp retint_check - -#ifdef CONFIG_PREEMPT - /* Returning to kernel space. Check if we need preemption */ - /* rcx: threadinfo. interrupts off. */ - .p2align -retint_kernel: - cmpl $0,threadinfo_preempt_count(%rcx) - jnz retint_restore_args - bt $TIF_NEED_RESCHED,threadinfo_flags(%rcx) - jnc retint_restore_args - bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */ - jnc retint_restore_args - call preempt_schedule_irq - jmp exit_intr -#endif - CFI_ENDPROC - -/* - * APIC interrupts. - */ - .macro apicinterrupt num,func - pushq $\num-256 - interrupt \func - jmp ret_from_intr - CFI_ENDPROC - .endm - -ENTRY(thermal_interrupt) - apicinterrupt THERMAL_APIC_VECTOR,smp_thermal_interrupt - -#ifdef CONFIG_SMP -ENTRY(reschedule_interrupt) - apicinterrupt RESCHEDULE_VECTOR,smp_reschedule_interrupt - -ENTRY(invalidate_interrupt) - apicinterrupt INVALIDATE_TLB_VECTOR,smp_invalidate_interrupt - -ENTRY(call_function_interrupt) - apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt -#endif - -#ifdef CONFIG_X86_LOCAL_APIC -ENTRY(apic_timer_interrupt) - apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt - -ENTRY(error_interrupt) - apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt - -ENTRY(spurious_interrupt) - apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt -#endif - -/* - * Exception entry points. - */ - .macro zeroentry sym - pushq $0 /* push error code/oldrax */ - pushq %rax /* push real oldrax to the rdi slot */ - leaq \sym(%rip),%rax - jmp error_entry - .endm - - .macro errorentry sym - pushq %rax - leaq \sym(%rip),%rax - jmp error_entry - .endm - - /* error code is on the stack already */ - /* handle NMI like exceptions that can happen everywhere */ - .macro paranoidentry sym - SAVE_ALL - cld - movl $1,%ebx - movl $MSR_GS_BASE,%ecx - rdmsr - testl %edx,%edx - js 1f - swapgs - xorl %ebx,%ebx -1: movq %rsp,%rdi - movq ORIG_RAX(%rsp),%rsi - movq $-1,ORIG_RAX(%rsp) - call \sym - cli - .endm - -/* - * Exception entry point. This expects an error code/orig_rax on the stack - * and the exception handler in %rax. - */ -ENTRY(error_entry) - CFI_STARTPROC simple - CFI_DEF_CFA rsp,(SS-RDI) - CFI_REL_OFFSET rsp,(RSP-RDI) - CFI_REL_OFFSET rip,(RIP-RDI) - /* rdi slot contains rax, oldrax contains error code */ - cld - subq $14*8,%rsp - CFI_ADJUST_CFA_OFFSET (14*8) - movq %rsi,13*8(%rsp) - CFI_REL_OFFSET rsi,RSI - movq 14*8(%rsp),%rsi /* load rax from rdi slot */ - movq %rdx,12*8(%rsp) - CFI_REL_OFFSET rdx,RDX - movq %rcx,11*8(%rsp) - CFI_REL_OFFSET rcx,RCX - movq %rsi,10*8(%rsp) /* store rax */ - CFI_REL_OFFSET rax,RAX - movq %r8, 9*8(%rsp) - CFI_REL_OFFSET r8,R8 - movq %r9, 8*8(%rsp) - CFI_REL_OFFSET r9,R9 - movq %r10,7*8(%rsp) - CFI_REL_OFFSET r10,R10 - movq %r11,6*8(%rsp) - CFI_REL_OFFSET r11,R11 - movq %rbx,5*8(%rsp) - CFI_REL_OFFSET rbx,RBX - movq %rbp,4*8(%rsp) - CFI_REL_OFFSET rbp,RBP - movq %r12,3*8(%rsp) - CFI_REL_OFFSET r12,R12 - movq %r13,2*8(%rsp) - CFI_REL_OFFSET r13,R13 - movq %r14,1*8(%rsp) - CFI_REL_OFFSET r14,R14 - movq %r15,(%rsp) - CFI_REL_OFFSET r15,R15 - xorl %ebx,%ebx - testl $3,CS(%rsp) - je error_kernelspace -error_swapgs: - swapgs -error_sti: - movq %rdi,RDI(%rsp) - movq %rsp,%rdi - movq ORIG_RAX(%rsp),%rsi /* get error code */ - movq $-1,ORIG_RAX(%rsp) - call *%rax - /* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */ -error_exit: - movl %ebx,%eax - RESTORE_REST - cli - GET_THREAD_INFO(%rcx) - testl %eax,%eax - jne retint_kernel - movl threadinfo_flags(%rcx),%edx - movl $_TIF_WORK_MASK,%edi - andl %edi,%edx - jnz retint_careful - swapgs - RESTORE_ARGS 0,8,0 - iretq - CFI_ENDPROC - -error_kernelspace: - incl %ebx - /* There are two places in the kernel that can potentially fault with - usergs. Handle them here. The exception handlers after - iret run with kernel gs again, so don't set the user space flag. - B stepping K8s sometimes report an truncated RIP for IRET - exceptions returning to compat mode. Check for these here too. */ - leaq iret_label(%rip),%rbp - cmpq %rbp,RIP(%rsp) - je error_swapgs - movl %ebp,%ebp /* zero extend */ - cmpq %rbp,RIP(%rsp) - je error_swapgs - cmpq $gs_change,RIP(%rsp) - je error_swapgs - jmp error_sti - - /* Reload gs selector with exception handling */ - /* edi: new selector */ -ENTRY(load_gs_index) - pushf - cli - swapgs -gs_change: - movl %edi,%gs -2: mfence /* workaround */ - swapgs - popf - ret - - .section __ex_table,"a" - .align 8 - .quad gs_change,bad_gs - .previous - .section .fixup,"ax" - /* running with kernelgs */ -bad_gs: - swapgs /* switch back to user gs */ - xorl %eax,%eax - movl %eax,%gs - jmp 2b - .previous - -/* - * Create a kernel thread. - * - * C extern interface: - * extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) - * - * asm input arguments: - * rdi: fn, rsi: arg, rdx: flags - */ -ENTRY(kernel_thread) - CFI_STARTPROC - FAKE_STACK_FRAME $child_rip - SAVE_ALL - - # rdi: flags, rsi: usp, rdx: will be &pt_regs - movq %rdx,%rdi - orq kernel_thread_flags(%rip),%rdi - movq $-1, %rsi - movq %rsp, %rdx - - xorl %r8d,%r8d - xorl %r9d,%r9d - - # clone now - call do_fork - movq %rax,RAX(%rsp) - xorl %edi,%edi - - /* - * It isn't worth to check for reschedule here, - * so internally to the x86_64 port you can rely on kernel_thread() - * not to reschedule the child before returning, this avoids the need - * of hacks for example to fork off the per-CPU idle tasks. - * [Hopefully no generic code relies on the reschedule -AK] - */ - RESTORE_ALL - UNFAKE_STACK_FRAME - ret - CFI_ENDPROC - - -child_rip: - /* - * Here we are in the child and the registers are set as they were - * at kernel_thread() invocation in the parent. - */ - movq %rdi, %rax - movq %rsi, %rdi - call *%rax - # exit - xorl %edi, %edi - call do_exit - -/* - * execve(). This function needs to use IRET, not SYSRET, to set up all state properly. - * - * C extern interface: - * extern long execve(char *name, char **argv, char **envp) - * - * asm input arguments: - * rdi: name, rsi: argv, rdx: envp - * - * We want to fallback into: - * extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs regs) - * - * do_sys_execve asm fallback arguments: - * rdi: name, rsi: argv, rdx: envp, fake frame on the stack - */ -ENTRY(execve) - CFI_STARTPROC - FAKE_STACK_FRAME $0 - SAVE_ALL - call sys_execve - movq %rax, RAX(%rsp) - RESTORE_REST - testq %rax,%rax - je int_ret_from_sys_call - RESTORE_ARGS - UNFAKE_STACK_FRAME - ret - CFI_ENDPROC - -ENTRY(page_fault) - errorentry do_page_fault - -ENTRY(coprocessor_error) - zeroentry do_coprocessor_error - -ENTRY(simd_coprocessor_error) - zeroentry do_simd_coprocessor_error - -ENTRY(device_not_available) - zeroentry math_state_restore - - /* runs on exception stack */ -ENTRY(debug) - CFI_STARTPROC - pushq $0 - CFI_ADJUST_CFA_OFFSET 8 - paranoidentry do_debug - jmp paranoid_exit - CFI_ENDPROC - - /* runs on exception stack */ -ENTRY(nmi) - CFI_STARTPROC - pushq $-1 - CFI_ADJUST_CFA_OFFSET 8 - paranoidentry do_nmi - /* - * "Paranoid" exit path from exception stack. - * Paranoid because this is used by NMIs and cannot take - * any kernel state for granted. - * We don't do kernel preemption checks here, because only - * NMI should be common and it does not enable IRQs and - * cannot get reschedule ticks. - */ - /* ebx: no swapgs flag */ -paranoid_exit: - testl %ebx,%ebx /* swapgs needed? */ - jnz paranoid_restore - testl $3,CS(%rsp) - jnz paranoid_userspace -paranoid_swapgs: - swapgs -paranoid_restore: - RESTORE_ALL 8 - iretq -paranoid_userspace: - GET_THREAD_INFO(%rcx) - movl threadinfo_flags(%rcx),%ebx - andl $_TIF_WORK_MASK,%ebx - jz paranoid_swapgs - movq %rsp,%rdi /* &pt_regs */ - call sync_regs - movq %rax,%rsp /* switch stack for scheduling */ - testl $_TIF_NEED_RESCHED,%ebx - jnz paranoid_schedule - movl %ebx,%edx /* arg3: thread flags */ - sti - xorl %esi,%esi /* arg2: oldset */ - movq %rsp,%rdi /* arg1: &pt_regs */ - call do_notify_resume - cli - jmp paranoid_userspace -paranoid_schedule: - sti - call schedule - cli - jmp paranoid_userspace - CFI_ENDPROC - -ENTRY(int3) - zeroentry do_int3 - -ENTRY(overflow) - zeroentry do_overflow - -ENTRY(bounds) - zeroentry do_bounds - -ENTRY(invalid_op) - zeroentry do_invalid_op - -ENTRY(coprocessor_segment_overrun) - zeroentry do_coprocessor_segment_overrun - -ENTRY(reserved) - zeroentry do_reserved - - /* runs on exception stack */ -ENTRY(double_fault) - CFI_STARTPROC - paranoidentry do_double_fault - jmp paranoid_exit - CFI_ENDPROC - -ENTRY(invalid_TSS) - errorentry do_invalid_TSS - -ENTRY(segment_not_present) - errorentry do_segment_not_present - - /* runs on exception stack */ -ENTRY(stack_segment) - CFI_STARTPROC - paranoidentry do_stack_segment - jmp paranoid_exit - CFI_ENDPROC - -ENTRY(general_protection) - errorentry do_general_protection - -ENTRY(alignment_check) - errorentry do_alignment_check - -ENTRY(divide_error) - zeroentry do_divide_error - -ENTRY(spurious_interrupt_bug) - zeroentry do_spurious_interrupt_bug - -#ifdef CONFIG_X86_MCE - /* runs on exception stack */ -ENTRY(machine_check) - CFI_STARTPROC - pushq $0 - CFI_ADJUST_CFA_OFFSET 8 - paranoidentry do_machine_check - jmp paranoid_exit - CFI_ENDPROC -#endif - -ENTRY(call_debug) - zeroentry do_call_debug - -ENTRY(call_softirq) - movq %gs:pda_irqstackptr,%rax - pushq %r15 - movq %rsp,%r15 - incl %gs:pda_irqcount - cmove %rax,%rsp - call __do_softirq - movq %r15,%rsp - decl %gs:pda_irqcount - popq %r15 - ret - diff --git a/arch/x86_64/kernel/genapic.c b/arch/x86_64/kernel/genapic.c deleted file mode 100644 index 30c843a5efd..00000000000 --- a/arch/x86_64/kernel/genapic.c +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Copyright 2004 James Cleverdon, IBM. - * Subject to the GNU Public License, v.2 - * - * Generic APIC sub-arch probe layer. - * - * Hacked for x86-64 by James Cleverdon from i386 architecture code by - * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and - * James Cleverdon. - */ -#include <linux/config.h> -#include <linux/threads.h> -#include <linux/cpumask.h> -#include <linux/string.h> -#include <linux/kernel.h> -#include <linux/ctype.h> -#include <linux/init.h> -#include <linux/module.h> - -#include <asm/smp.h> -#include <asm/ipi.h> - -#if defined(CONFIG_ACPI_BUS) -#include <acpi/acpi_bus.h> -#endif - -/* which logical CPU number maps to which CPU (physical APIC ID) */ -u8 x86_cpu_to_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; -EXPORT_SYMBOL(x86_cpu_to_apicid); -u8 x86_cpu_to_log_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; - -extern struct genapic apic_cluster; -extern struct genapic apic_flat; -extern struct genapic apic_physflat; - -struct genapic *genapic = &apic_flat; - - -/* - * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode. - */ -void __init clustered_apic_check(void) -{ - long i; - u8 clusters, max_cluster; - u8 id; - u8 cluster_cnt[NUM_APIC_CLUSTERS]; - int num_cpus = 0; - -#if defined(CONFIG_ACPI_BUS) - /* - * Some x86_64 machines use physical APIC mode regardless of how many - * procs/clusters are present (x86_64 ES7000 is an example). - */ - if (acpi_fadt.revision > FADT2_REVISION_ID) - if (acpi_fadt.force_apic_physical_destination_mode) { - genapic = &apic_cluster; - goto print; - } -#endif - - memset(cluster_cnt, 0, sizeof(cluster_cnt)); - for (i = 0; i < NR_CPUS; i++) { - id = bios_cpu_apicid[i]; - if (id == BAD_APICID) - continue; - num_cpus++; - cluster_cnt[APIC_CLUSTERID(id)]++; - } - - /* Don't use clustered mode on AMD platforms. */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { - genapic = &apic_physflat; -#ifndef CONFIG_CPU_HOTPLUG - /* In the CPU hotplug case we cannot use broadcast mode - because that opens a race when a CPU is removed. - Stay at physflat mode in this case. - It is bad to do this unconditionally though. Once - we have ACPI platform support for CPU hotplug - we should detect hotplug capablity from ACPI tables and - only do this when really needed. -AK */ - if (num_cpus <= 8) - genapic = &apic_flat; -#endif - goto print; - } - - clusters = 0; - max_cluster = 0; - - for (i = 0; i < NUM_APIC_CLUSTERS; i++) { - if (cluster_cnt[i] > 0) { - ++clusters; - if (cluster_cnt[i] > max_cluster) - max_cluster = cluster_cnt[i]; - } - } - - /* - * If we have clusters <= 1 and CPUs <= 8 in cluster 0, then flat mode, - * else if max_cluster <= 4 and cluster_cnt[15] == 0, clustered logical - * else physical mode. - * (We don't use lowest priority delivery + HW APIC IRQ steering, so - * can ignore the clustered logical case and go straight to physical.) - */ - if (clusters <= 1 && max_cluster <= 8 && cluster_cnt[0] == max_cluster) - genapic = &apic_flat; - else - genapic = &apic_cluster; - -print: - printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name); -} - -/* Same for both flat and clustered. */ - -void send_IPI_self(int vector) -{ - __send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL); -} diff --git a/arch/x86_64/kernel/genapic_cluster.c b/arch/x86_64/kernel/genapic_cluster.c deleted file mode 100644 index 9703da7202e..00000000000 --- a/arch/x86_64/kernel/genapic_cluster.c +++ /dev/null @@ -1,130 +0,0 @@ -/* - * Copyright 2004 James Cleverdon, IBM. - * Subject to the GNU Public License, v.2 - * - * Clustered APIC subarch code. Up to 255 CPUs, physical delivery. - * (A more realistic maximum is around 230 CPUs.) - * - * Hacked for x86-64 by James Cleverdon from i386 architecture code by - * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and - * James Cleverdon. - */ -#include <linux/config.h> -#include <linux/threads.h> -#include <linux/cpumask.h> -#include <linux/string.h> -#include <linux/kernel.h> -#include <linux/ctype.h> -#include <linux/init.h> -#include <asm/smp.h> -#include <asm/ipi.h> - - -/* - * Set up the logical destination ID. - * - * Intel recommends to set DFR, LDR and TPR before enabling - * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel - * document number 292116). So here it goes... - */ -static void cluster_init_apic_ldr(void) -{ - unsigned long val, id; - long i, count; - u8 lid; - u8 my_id = hard_smp_processor_id(); - u8 my_cluster = APIC_CLUSTER(my_id); - - /* Create logical APIC IDs by counting CPUs already in cluster. */ - for (count = 0, i = NR_CPUS; --i >= 0; ) { - lid = x86_cpu_to_log_apicid[i]; - if (lid != BAD_APICID && APIC_CLUSTER(lid) == my_cluster) - ++count; - } - /* - * We only have a 4 wide bitmap in cluster mode. There's no way - * to get above 60 CPUs and still give each one it's own bit. - * But, we're using physical IRQ delivery, so we don't care. - * Use bit 3 for the 4th through Nth CPU in each cluster. - */ - if (count >= XAPIC_DEST_CPUS_SHIFT) - count = 3; - id = my_cluster | (1UL << count); - x86_cpu_to_log_apicid[smp_processor_id()] = id; - apic_write_around(APIC_DFR, APIC_DFR_CLUSTER); - val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; - val |= SET_APIC_LOGICAL_ID(id); - apic_write_around(APIC_LDR, val); -} - -/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ - -static cpumask_t cluster_target_cpus(void) -{ - return cpumask_of_cpu(0); -} - -static void cluster_send_IPI_mask(cpumask_t mask, int vector) -{ - send_IPI_mask_sequence(mask, vector); -} - -static void cluster_send_IPI_allbutself(int vector) -{ - cpumask_t mask = cpu_online_map; - cpu_clear(smp_processor_id(), mask); - - if (!cpus_empty(mask)) - cluster_send_IPI_mask(mask, vector); -} - -static void cluster_send_IPI_all(int vector) -{ - cluster_send_IPI_mask(cpu_online_map, vector); -} - -static int cluster_apic_id_registered(void) -{ - return 1; -} - -static unsigned int cluster_cpu_mask_to_apicid(cpumask_t cpumask) -{ - int cpu; - - /* - * We're using fixed IRQ delivery, can only return one phys APIC ID. - * May as well be the first. - */ - cpu = first_cpu(cpumask); - if ((unsigned)cpu < NR_CPUS) - return x86_cpu_to_apicid[cpu]; - else - return BAD_APICID; -} - -/* cpuid returns the value latched in the HW at reset, not the APIC ID - * register's value. For any box whose BIOS changes APIC IDs, like - * clustered APIC systems, we must use hard_smp_processor_id. - * - * See Intel's IA-32 SW Dev's Manual Vol2 under CPUID. - */ -static unsigned int phys_pkg_id(int index_msb) -{ - return hard_smp_processor_id() >> index_msb; -} - -struct genapic apic_cluster = { - .name = "clustered", - .int_delivery_mode = dest_Fixed, - .int_dest_mode = (APIC_DEST_PHYSICAL != 0), - .int_delivery_dest = APIC_DEST_PHYSICAL | APIC_DM_FIXED, - .target_cpus = cluster_target_cpus, - .apic_id_registered = cluster_apic_id_registered, - .init_apic_ldr = cluster_init_apic_ldr, - .send_IPI_all = cluster_send_IPI_all, - .send_IPI_allbutself = cluster_send_IPI_allbutself, - .send_IPI_mask = cluster_send_IPI_mask, - .cpu_mask_to_apicid = cluster_cpu_mask_to_apicid, - .phys_pkg_id = phys_pkg_id, -}; diff --git a/arch/x86_64/kernel/genapic_flat.c b/arch/x86_64/kernel/genapic_flat.c deleted file mode 100644 index adc96282a9e..00000000000 --- a/arch/x86_64/kernel/genapic_flat.c +++ /dev/null @@ -1,181 +0,0 @@ -/* - * Copyright 2004 James Cleverdon, IBM. - * Subject to the GNU Public License, v.2 - * - * Flat APIC subarch code. - * - * Hacked for x86-64 by James Cleverdon from i386 architecture code by - * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and - * James Cleverdon. - */ -#include <linux/config.h> -#include <linux/threads.h> -#include <linux/cpumask.h> -#include <linux/string.h> -#include <linux/kernel.h> -#include <linux/ctype.h> -#include <linux/init.h> -#include <asm/smp.h> -#include <asm/ipi.h> - -static cpumask_t flat_target_cpus(void) -{ - return cpu_online_map; -} - -/* - * Set up the logical destination ID. - * - * Intel recommends to set DFR, LDR and TPR before enabling - * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel - * document number 292116). So here it goes... - */ -static void flat_init_apic_ldr(void) -{ - unsigned long val; - unsigned long num, id; - - num = smp_processor_id(); - id = 1UL << num; - x86_cpu_to_log_apicid[num] = id; - apic_write_around(APIC_DFR, APIC_DFR_FLAT); - val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; - val |= SET_APIC_LOGICAL_ID(id); - apic_write_around(APIC_LDR, val); -} - -static void flat_send_IPI_mask(cpumask_t cpumask, int vector) -{ - unsigned long mask = cpus_addr(cpumask)[0]; - unsigned long cfg; - unsigned long flags; - - local_save_flags(flags); - local_irq_disable(); - - /* - * Wait for idle. - */ - apic_wait_icr_idle(); - - /* - * prepare target chip field - */ - cfg = __prepare_ICR2(mask); - apic_write_around(APIC_ICR2, cfg); - - /* - * program the ICR - */ - cfg = __prepare_ICR(0, vector, APIC_DEST_LOGICAL); - - /* - * Send the IPI. The write to APIC_ICR fires this off. - */ - apic_write_around(APIC_ICR, cfg); - local_irq_restore(flags); -} - -static void flat_send_IPI_allbutself(int vector) -{ - if (((num_online_cpus()) - 1) >= 1) - __send_IPI_shortcut(APIC_DEST_ALLBUT, vector,APIC_DEST_LOGICAL); -} - -static void flat_send_IPI_all(int vector) -{ - __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL); -} - -static int flat_apic_id_registered(void) -{ - return physid_isset(GET_APIC_ID(apic_read(APIC_ID)), phys_cpu_present_map); -} - -static unsigned int flat_cpu_mask_to_apicid(cpumask_t cpumask) -{ - return cpus_addr(cpumask)[0] & APIC_ALL_CPUS; -} - -static unsigned int phys_pkg_id(int index_msb) -{ - u32 ebx; - - ebx = cpuid_ebx(1); - return ((ebx >> 24) & 0xFF) >> index_msb; -} - -struct genapic apic_flat = { - .name = "flat", - .int_delivery_mode = dest_LowestPrio, - .int_dest_mode = (APIC_DEST_LOGICAL != 0), - .int_delivery_dest = APIC_DEST_LOGICAL | APIC_DM_LOWEST, - .target_cpus = flat_target_cpus, - .apic_id_registered = flat_apic_id_registered, - .init_apic_ldr = flat_init_apic_ldr, - .send_IPI_all = flat_send_IPI_all, - .send_IPI_allbutself = flat_send_IPI_allbutself, - .send_IPI_mask = flat_send_IPI_mask, - .cpu_mask_to_apicid = flat_cpu_mask_to_apicid, - .phys_pkg_id = phys_pkg_id, -}; - -/* - * Physflat mode is used when there are more than 8 CPUs on a AMD system. - * We cannot use logical delivery in this case because the mask - * overflows, so use physical mode. - */ - -static cpumask_t physflat_target_cpus(void) -{ - return cpumask_of_cpu(0); -} - -static void physflat_send_IPI_mask(cpumask_t cpumask, int vector) -{ - send_IPI_mask_sequence(cpumask, vector); -} - -static void physflat_send_IPI_allbutself(int vector) -{ - cpumask_t allbutme = cpu_online_map; - int me = get_cpu(); - cpu_clear(me, allbutme); - physflat_send_IPI_mask(allbutme, vector); - put_cpu(); -} - -static void physflat_send_IPI_all(int vector) -{ - physflat_send_IPI_mask(cpu_online_map, vector); -} - -static unsigned int physflat_cpu_mask_to_apicid(cpumask_t cpumask) -{ - int cpu; - - /* - * We're using fixed IRQ delivery, can only return one phys APIC ID. - * May as well be the first. - */ - cpu = first_cpu(cpumask); - if ((unsigned)cpu < NR_CPUS) - return x86_cpu_to_apicid[cpu]; - else - return BAD_APICID; -} - -struct genapic apic_physflat = { - .name = "physical flat", - .int_delivery_mode = dest_LowestPrio, - .int_dest_mode = (APIC_DEST_PHYSICAL != 0), - .int_delivery_dest = APIC_DEST_PHYSICAL | APIC_DM_LOWEST, - .target_cpus = physflat_target_cpus, - .apic_id_registered = flat_apic_id_registered, - .init_apic_ldr = flat_init_apic_ldr,/*not needed, but shouldn't hurt*/ - .send_IPI_all = physflat_send_IPI_all, - .send_IPI_allbutself = physflat_send_IPI_allbutself, - .send_IPI_mask = physflat_send_IPI_mask, - .cpu_mask_to_apicid = physflat_cpu_mask_to_apicid, - .phys_pkg_id = phys_pkg_id, -}; diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S deleted file mode 100644 index 98ff5eb32b9..00000000000 --- a/arch/x86_64/kernel/head.S +++ /dev/null @@ -1,404 +0,0 @@ -/* - * linux/arch/x86_64/kernel/head.S -- start in 32bit and switch to 64bit - * - * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE - * Copyright (C) 2000 Pavel Machek <pavel@suse.cz> - * Copyright (C) 2000 Karsten Keil <kkeil@suse.de> - * Copyright (C) 2001,2002 Andi Kleen <ak@suse.de> - * - * $Id: head.S,v 1.49 2002/03/19 17:39:25 ak Exp $ - */ - - -#include <linux/linkage.h> -#include <linux/threads.h> -#include <asm/desc.h> -#include <asm/segment.h> -#include <asm/page.h> -#include <asm/msr.h> -#include <asm/cache.h> - -/* we are not able to switch in one step to the final KERNEL ADRESS SPACE - * because we need identity-mapped pages on setup so define __START_KERNEL to - * 0x100000 for this stage - * - */ - - .text - .code32 - .globl startup_32 -/* %bx: 1 if coming from smp trampoline on secondary cpu */ -startup_32: - - /* - * At this point the CPU runs in 32bit protected mode (CS.D = 1) with - * paging disabled and the point of this file is to switch to 64bit - * long mode with a kernel mapping for kerneland to jump into the - * kernel virtual addresses. - * There is no stack until we set one up. - */ - - /* Initialize the %ds segment register */ - movl $__KERNEL_DS,%eax - movl %eax,%ds - - /* Load new GDT with the 64bit segments using 32bit descriptor */ - lgdt pGDT32 - __START_KERNEL_map - - /* If the CPU doesn't support CPUID this will double fault. - * Unfortunately it is hard to check for CPUID without a stack. - */ - - /* Check if extended functions are implemented */ - movl $0x80000000, %eax - cpuid - cmpl $0x80000000, %eax - jbe no_long_mode - /* Check if long mode is implemented */ - mov $0x80000001, %eax - cpuid - btl $29, %edx - jnc no_long_mode - - /* - * Prepare for entering 64bits mode - */ - - /* Enable PAE mode */ - xorl %eax, %eax - btsl $5, %eax - movl %eax, %cr4 - - /* Setup early boot stage 4 level pagetables */ - movl $(init_level4_pgt - __START_KERNEL_map), %eax - movl %eax, %cr3 - - /* Setup EFER (Extended Feature Enable Register) */ - movl $MSR_EFER, %ecx - rdmsr - - /* Enable Long Mode */ - btsl $_EFER_LME, %eax - - /* Make changes effective */ - wrmsr - - xorl %eax, %eax - btsl $31, %eax /* Enable paging and in turn activate Long Mode */ - btsl $0, %eax /* Enable protected mode */ - /* Make changes effective */ - movl %eax, %cr0 - /* - * At this point we're in long mode but in 32bit compatibility mode - * with EFER.LME = 1, CS.L = 0, CS.D = 1 (and in turn - * EFER.LMA = 1). Now we want to jump in 64bit mode, to do that we use - * the new gdt/idt that has __KERNEL_CS with CS.L = 1. - */ - ljmp $__KERNEL_CS, $(startup_64 - __START_KERNEL_map) - - .code64 - .org 0x100 - .globl startup_64 -startup_64: - /* We come here either from startup_32 - * or directly from a 64bit bootloader. - * Since we may have come directly from a bootloader we - * reload the page tables here. - */ - - /* Enable PAE mode and PGE */ - xorq %rax, %rax - btsq $5, %rax - btsq $7, %rax - movq %rax, %cr4 - - /* Setup early boot stage 4 level pagetables. */ - movq $(init_level4_pgt - __START_KERNEL_map), %rax - movq %rax, %cr3 - - /* Check if nx is implemented */ - movl $0x80000001, %eax - cpuid - movl %edx,%edi - - /* Setup EFER (Extended Feature Enable Register) */ - movl $MSR_EFER, %ecx - rdmsr - - /* Enable System Call */ - btsl $_EFER_SCE, %eax - - /* No Execute supported? */ - btl $20,%edi - jnc 1f - btsl $_EFER_NX, %eax -1: - /* Make changes effective */ - wrmsr - - /* Setup cr0 */ -#define CR0_PM 1 /* protected mode */ -#define CR0_MP (1<<1) -#define CR0_ET (1<<4) -#define CR0_NE (1<<5) -#define CR0_WP (1<<16) -#define CR0_AM (1<<18) -#define CR0_PAGING (1<<31) - movl $CR0_PM|CR0_MP|CR0_ET|CR0_NE|CR0_WP|CR0_AM|CR0_PAGING,%eax - /* Make changes effective */ - movq %rax, %cr0 - - /* Setup a boot time stack */ - movq init_rsp(%rip),%rsp - - /* zero EFLAGS after setting rsp */ - pushq $0 - popfq - - /* - * We must switch to a new descriptor in kernel space for the GDT - * because soon the kernel won't have access anymore to the userspace - * addresses where we're currently running on. We have to do that here - * because in 32bit we couldn't load a 64bit linear address. - */ - lgdt cpu_gdt_descr - - /* - * Setup up a dummy PDA. this is just for some early bootup code - * that does in_interrupt() - */ - movl $MSR_GS_BASE,%ecx - movq $empty_zero_page,%rax - movq %rax,%rdx - shrq $32,%rdx - wrmsr - - /* set up data segments. actually 0 would do too */ - movl $__KERNEL_DS,%eax - movl %eax,%ds - movl %eax,%ss - movl %eax,%es - - /* esi is pointer to real mode structure with interesting info. - pass it to C */ - movl %esi, %edi - - /* Finally jump to run C code and to be on real kernel address - * Since we are running on identity-mapped space we have to jump - * to the full 64bit address , this is only possible as indirect - * jump - */ - movq initial_code(%rip),%rax - jmp *%rax - - /* SMP bootup changes these two */ - .globl initial_code -initial_code: - .quad x86_64_start_kernel - .globl init_rsp -init_rsp: - .quad init_thread_union+THREAD_SIZE-8 - -ENTRY(early_idt_handler) - cmpl $2,early_recursion_flag(%rip) - jz 1f - incl early_recursion_flag(%rip) - xorl %eax,%eax - movq 8(%rsp),%rsi # get rip - movq (%rsp),%rdx - movq %cr2,%rcx - leaq early_idt_msg(%rip),%rdi - call early_printk - cmpl $2,early_recursion_flag(%rip) - jz 1f - call dump_stack -1: hlt - jmp 1b -early_recursion_flag: - .long 0 - -early_idt_msg: - .asciz "PANIC: early exception rip %lx error %lx cr2 %lx\n" - -.code32 -ENTRY(no_long_mode) - /* This isn't an x86-64 CPU so hang */ -1: - jmp 1b - -.org 0xf00 - .globl pGDT32 -pGDT32: - .word gdt_end-cpu_gdt_table - .long cpu_gdt_table-__START_KERNEL_map - -.org 0xf10 -ljumpvector: - .long startup_64-__START_KERNEL_map - .word __KERNEL_CS - -ENTRY(stext) -ENTRY(_stext) - - /* - * This default setting generates an ident mapping at address 0x100000 - * and a mapping for the kernel that precisely maps virtual address - * 0xffffffff80000000 to physical address 0x000000. (always using - * 2Mbyte large pages provided by PAE mode) - */ -.org 0x1000 -ENTRY(init_level4_pgt) - .quad 0x0000000000002007 + __PHYSICAL_START /* -> level3_ident_pgt */ - .fill 255,8,0 - .quad 0x000000000000a007 + __PHYSICAL_START - .fill 254,8,0 - /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ - .quad 0x0000000000003007 + __PHYSICAL_START /* -> level3_kernel_pgt */ - -.org 0x2000 -ENTRY(level3_ident_pgt) - .quad 0x0000000000004007 + __PHYSICAL_START - .fill 511,8,0 - -.org 0x3000 -ENTRY(level3_kernel_pgt) - .fill 510,8,0 - /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */ - .quad 0x0000000000005007 + __PHYSICAL_START /* -> level2_kernel_pgt */ - .fill 1,8,0 - -.org 0x4000 -ENTRY(level2_ident_pgt) - /* 40MB for bootup. */ - .quad 0x0000000000000283 - .quad 0x0000000000200183 - .quad 0x0000000000400183 - .quad 0x0000000000600183 - .quad 0x0000000000800183 - .quad 0x0000000000A00183 - .quad 0x0000000000C00183 - .quad 0x0000000000E00183 - .quad 0x0000000001000183 - .quad 0x0000000001200183 - .quad 0x0000000001400183 - .quad 0x0000000001600183 - .quad 0x0000000001800183 - .quad 0x0000000001A00183 - .quad 0x0000000001C00183 - .quad 0x0000000001E00183 - .quad 0x0000000002000183 - .quad 0x0000000002200183 - .quad 0x0000000002400183 - .quad 0x0000000002600183 - /* Temporary mappings for the super early allocator in arch/x86_64/mm/init.c */ - .globl temp_boot_pmds -temp_boot_pmds: - .fill 492,8,0 - -.org 0x5000 -ENTRY(level2_kernel_pgt) - /* 40MB kernel mapping. The kernel code cannot be bigger than that. - When you change this change KERNEL_TEXT_SIZE in page.h too. */ - /* (2^48-(2*1024*1024*1024)-((2^39)*511)-((2^30)*510)) = 0 */ - .quad 0x0000000000000183 - .quad 0x0000000000200183 - .quad 0x0000000000400183 - .quad 0x0000000000600183 - .quad 0x0000000000800183 - .quad 0x0000000000A00183 - .quad 0x0000000000C00183 - .quad 0x0000000000E00183 - .quad 0x0000000001000183 - .quad 0x0000000001200183 - .quad 0x0000000001400183 - .quad 0x0000000001600183 - .quad 0x0000000001800183 - .quad 0x0000000001A00183 - .quad 0x0000000001C00183 - .quad 0x0000000001E00183 - .quad 0x0000000002000183 - .quad 0x0000000002200183 - .quad 0x0000000002400183 - .quad 0x0000000002600183 - /* Module mapping starts here */ - .fill 492,8,0 - -.org 0x6000 -ENTRY(empty_zero_page) - -.org 0x7000 -ENTRY(empty_bad_page) - -.org 0x8000 -ENTRY(empty_bad_pte_table) - -.org 0x9000 -ENTRY(empty_bad_pmd_table) - -.org 0xa000 -ENTRY(level3_physmem_pgt) - .quad 0x0000000000005007 + __PHYSICAL_START /* -> level2_kernel_pgt (so that __va works even before pagetable_init) */ - - .org 0xb000 -#ifdef CONFIG_ACPI_SLEEP -ENTRY(wakeup_level4_pgt) - .quad 0x0000000000002007 + __PHYSICAL_START /* -> level3_ident_pgt */ - .fill 255,8,0 - .quad 0x000000000000a007 + __PHYSICAL_START - .fill 254,8,0 - /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ - .quad 0x0000000000003007 + __PHYSICAL_START /* -> level3_kernel_pgt */ -#endif - - .data - - .align 16 - .globl cpu_gdt_descr -cpu_gdt_descr: - .word gdt_end-cpu_gdt_table -gdt: - .quad cpu_gdt_table -#ifdef CONFIG_SMP - .rept NR_CPUS-1 - .word 0 - .quad 0 - .endr -#endif - -/* We need valid kernel segments for data and code in long mode too - * IRET will check the segment types kkeil 2000/10/28 - * Also sysret mandates a special GDT layout - */ - -.align L1_CACHE_BYTES - -/* The TLS descriptors are currently at a different place compared to i386. - Hopefully nobody expects them at a fixed place (Wine?) */ - -ENTRY(cpu_gdt_table) - .quad 0x0000000000000000 /* NULL descriptor */ - .quad 0x008f9a000000ffff /* __KERNEL_COMPAT32_CS */ - .quad 0x00af9a000000ffff /* __KERNEL_CS */ - .quad 0x00cf92000000ffff /* __KERNEL_DS */ - .quad 0x00cffa000000ffff /* __USER32_CS */ - .quad 0x00cff2000000ffff /* __USER_DS, __USER32_DS */ - .quad 0x00affa000000ffff /* __USER_CS */ - .quad 0x00cf9a000000ffff /* __KERNEL32_CS */ - .quad 0,0 /* TSS */ - .quad 0,0 /* LDT */ - .quad 0,0,0 /* three TLS descriptors */ - .quad 0x00009a000000ffff /* __KERNEL16_CS - 16bit PM for S3 wakeup. */ - /* base must be patched for real base address. */ -gdt_end: - /* asm/segment.h:GDT_ENTRIES must match this */ - /* This should be a multiple of the cache line size */ - /* GDTs of other CPUs: */ - .fill (GDT_SIZE * NR_CPUS) - (gdt_end - cpu_gdt_table) - - .align L1_CACHE_BYTES -ENTRY(idt_table) - .rept 256 - .quad 0 - .quad 0 - .endr - diff --git a/arch/x86_64/kernel/head64.c b/arch/x86_64/kernel/head64.c deleted file mode 100644 index cf6ab147a2a..00000000000 --- a/arch/x86_64/kernel/head64.c +++ /dev/null @@ -1,112 +0,0 @@ -/* - * linux/arch/x86_64/kernel/head64.c -- prepare to run common code - * - * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE - * - * $Id: head64.c,v 1.22 2001/07/06 14:28:20 ak Exp $ - */ - -#include <linux/init.h> -#include <linux/linkage.h> -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/string.h> -#include <linux/percpu.h> - -#include <asm/processor.h> -#include <asm/proto.h> -#include <asm/smp.h> -#include <asm/bootsetup.h> -#include <asm/setup.h> -#include <asm/desc.h> - -/* Don't add a printk in there. printk relies on the PDA which is not initialized - yet. */ -static void __init clear_bss(void) -{ - extern char __bss_start[], __bss_end[]; - memset(__bss_start, 0, - (unsigned long) __bss_end - (unsigned long) __bss_start); -} - -#define NEW_CL_POINTER 0x228 /* Relative to real mode data */ -#define OLD_CL_MAGIC_ADDR 0x90020 -#define OLD_CL_MAGIC 0xA33F -#define OLD_CL_BASE_ADDR 0x90000 -#define OLD_CL_OFFSET 0x90022 - -extern char saved_command_line[]; - -static void __init copy_bootdata(char *real_mode_data) -{ - int new_data; - char * command_line; - - memcpy(x86_boot_params, real_mode_data, BOOT_PARAM_SIZE); - new_data = *(int *) (x86_boot_params + NEW_CL_POINTER); - if (!new_data) { - if (OLD_CL_MAGIC != * (u16 *) OLD_CL_MAGIC_ADDR) { - printk("so old bootloader that it does not support commandline?!\n"); - return; - } - new_data = OLD_CL_BASE_ADDR + * (u16 *) OLD_CL_OFFSET; - printk("old bootloader convention, maybe loadlin?\n"); - } - command_line = (char *) ((u64)(new_data)); - memcpy(saved_command_line, command_line, COMMAND_LINE_SIZE); - printk("Bootdata ok (command line is %s)\n", saved_command_line); -} - -static void __init setup_boot_cpu_data(void) -{ - unsigned int dummy, eax; - - /* get vendor info */ - cpuid(0, (unsigned int *)&boot_cpu_data.cpuid_level, - (unsigned int *)&boot_cpu_data.x86_vendor_id[0], - (unsigned int *)&boot_cpu_data.x86_vendor_id[8], - (unsigned int *)&boot_cpu_data.x86_vendor_id[4]); - - /* get cpu type */ - cpuid(1, &eax, &dummy, &dummy, - (unsigned int *) &boot_cpu_data.x86_capability); - boot_cpu_data.x86 = (eax >> 8) & 0xf; - boot_cpu_data.x86_model = (eax >> 4) & 0xf; - boot_cpu_data.x86_mask = eax & 0xf; -} - -extern char _end[]; - -void __init x86_64_start_kernel(char * real_mode_data) -{ - char *s; - int i; - - for (i = 0; i < 256; i++) - set_intr_gate(i, early_idt_handler); - asm volatile("lidt %0" :: "m" (idt_descr)); - clear_bss(); - pda_init(0); - copy_bootdata(real_mode_data); -#ifdef CONFIG_SMP - cpu_set(0, cpu_online_map); -#endif - s = strstr(saved_command_line, "earlyprintk="); - if (s != NULL) - setup_early_printk(s); -#ifdef CONFIG_NUMA - s = strstr(saved_command_line, "numa="); - if (s != NULL) - numa_setup(s+5); -#endif -#ifdef CONFIG_X86_IO_APIC - if (strstr(saved_command_line, "disableapic")) - disable_apic = 1; -#endif - /* You need early console to see that */ - if (__pa_symbol(&_end) >= KERNEL_TEXT_SIZE) - panic("Kernel too big for kernel mapping\n"); - - setup_boot_cpu_data(); - start_kernel(); -} diff --git a/arch/x86_64/kernel/i387.c b/arch/x86_64/kernel/i387.c deleted file mode 100644 index d9b22b633e3..00000000000 --- a/arch/x86_64/kernel/i387.c +++ /dev/null @@ -1,155 +0,0 @@ -/* - * linux/arch/x86_64/kernel/i387.c - * - * Copyright (C) 1994 Linus Torvalds - * Copyright (C) 2002 Andi Kleen, SuSE Labs - * - * Pentium III FXSR, SSE support - * General FPU state handling cleanups - * Gareth Hughes <gareth@valinux.com>, May 2000 - * - * x86-64 rework 2002 Andi Kleen. - * Does direct fxsave in and out of user space now for signal handlers. - * All the FSAVE<->FXSAVE conversion code has been moved to the 32bit emulation, - * the 64bit user space sees a FXSAVE frame directly. - */ - -#include <linux/config.h> -#include <linux/sched.h> -#include <linux/init.h> -#include <asm/processor.h> -#include <asm/i387.h> -#include <asm/sigcontext.h> -#include <asm/user.h> -#include <asm/ptrace.h> -#include <asm/uaccess.h> - -unsigned int mxcsr_feature_mask = 0xffffffff; - -void mxcsr_feature_mask_init(void) -{ - unsigned int mask; - clts(); - memset(¤t->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct)); - asm volatile("fxsave %0" : : "m" (current->thread.i387.fxsave)); - mask = current->thread.i387.fxsave.mxcsr_mask; - if (mask == 0) mask = 0x0000ffbf; - mxcsr_feature_mask &= mask; - stts(); -} - -/* - * Called at bootup to set up the initial FPU state that is later cloned - * into all processes. - */ -void __cpuinit fpu_init(void) -{ - unsigned long oldcr0 = read_cr0(); - extern void __bad_fxsave_alignment(void); - - if (offsetof(struct task_struct, thread.i387.fxsave) & 15) - __bad_fxsave_alignment(); - set_in_cr4(X86_CR4_OSFXSR); - set_in_cr4(X86_CR4_OSXMMEXCPT); - - write_cr0(oldcr0 & ~((1UL<<3)|(1UL<<2))); /* clear TS and EM */ - - mxcsr_feature_mask_init(); - /* clean state in init */ - current_thread_info()->status = 0; - clear_used_math(); -} - -void init_fpu(struct task_struct *child) -{ - if (tsk_used_math(child)) { - if (child == current) - unlazy_fpu(child); - return; - } - memset(&child->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct)); - child->thread.i387.fxsave.cwd = 0x37f; - child->thread.i387.fxsave.mxcsr = 0x1f80; - /* only the device not available exception or ptrace can call init_fpu */ - set_stopped_child_used_math(child); -} - -/* - * Signal frame handlers. - */ - -int save_i387(struct _fpstate __user *buf) -{ - struct task_struct *tsk = current; - int err = 0; - - { - extern void bad_user_i387_struct(void); - if (sizeof(struct user_i387_struct) != sizeof(tsk->thread.i387.fxsave)) - bad_user_i387_struct(); - } - - if ((unsigned long)buf % 16) - printk("save_i387: bad fpstate %p\n",buf); - - if (!used_math()) - return 0; - clear_used_math(); /* trigger finit */ - if (tsk->thread_info->status & TS_USEDFPU) { - err = save_i387_checking((struct i387_fxsave_struct __user *)buf); - if (err) return err; - stts(); - } else { - if (__copy_to_user(buf, &tsk->thread.i387.fxsave, - sizeof(struct i387_fxsave_struct))) - return -1; - } - return 1; -} - -/* - * ptrace request handlers. - */ - -int get_fpregs(struct user_i387_struct __user *buf, struct task_struct *tsk) -{ - init_fpu(tsk); - return __copy_to_user(buf, &tsk->thread.i387.fxsave, - sizeof(struct user_i387_struct)) ? -EFAULT : 0; -} - -int set_fpregs(struct task_struct *tsk, struct user_i387_struct __user *buf) -{ - if (__copy_from_user(&tsk->thread.i387.fxsave, buf, - sizeof(struct user_i387_struct))) - return -EFAULT; - return 0; -} - -/* - * FPU state for core dumps. - */ - -int dump_fpu( struct pt_regs *regs, struct user_i387_struct *fpu ) -{ - struct task_struct *tsk = current; - - if (!used_math()) - return 0; - - unlazy_fpu(tsk); - memcpy(fpu, &tsk->thread.i387.fxsave, sizeof(struct user_i387_struct)); - return 1; -} - -int dump_task_fpu(struct task_struct *tsk, struct user_i387_struct *fpu) -{ - int fpvalid = !!tsk_used_math(tsk); - - if (fpvalid) { - if (tsk == current) - unlazy_fpu(tsk); - memcpy(fpu, &tsk->thread.i387.fxsave, sizeof(struct user_i387_struct)); -} - return fpvalid; -} diff --git a/arch/x86_64/kernel/i8259.c b/arch/x86_64/kernel/i8259.c deleted file mode 100644 index a8916909512..00000000000 --- a/arch/x86_64/kernel/i8259.c +++ /dev/null @@ -1,590 +0,0 @@ -#include <linux/linkage.h> -#include <linux/config.h> -#include <linux/errno.h> -#include <linux/signal.h> -#include <linux/sched.h> -#include <linux/ioport.h> -#include <linux/interrupt.h> -#include <linux/timex.h> -#include <linux/slab.h> -#include <linux/random.h> -#include <linux/smp_lock.h> -#include <linux/init.h> -#include <linux/kernel_stat.h> -#include <linux/sysdev.h> -#include <linux/bitops.h> - -#include <asm/acpi.h> -#include <asm/atomic.h> -#include <asm/system.h> -#include <asm/io.h> -#include <asm/irq.h> -#include <asm/hw_irq.h> -#include <asm/pgtable.h> -#include <asm/delay.h> -#include <asm/desc.h> -#include <asm/apic.h> - -#include <linux/irq.h> - -/* - * Common place to define all x86 IRQ vectors - * - * This builds up the IRQ handler stubs using some ugly macros in irq.h - * - * These macros create the low-level assembly IRQ routines that save - * register context and call do_IRQ(). do_IRQ() then does all the - * operations that are needed to keep the AT (or SMP IOAPIC) - * interrupt-controller happy. - */ - -#define BI(x,y) \ - BUILD_IRQ(x##y) - -#define BUILD_16_IRQS(x) \ - BI(x,0) BI(x,1) BI(x,2) BI(x,3) \ - BI(x,4) BI(x,5) BI(x,6) BI(x,7) \ - BI(x,8) BI(x,9) BI(x,a) BI(x,b) \ - BI(x,c) BI(x,d) BI(x,e) BI(x,f) - -#define BUILD_14_IRQS(x) \ - BI(x,0) BI(x,1) BI(x,2) BI(x,3) \ - BI(x,4) BI(x,5) BI(x,6) BI(x,7) \ - BI(x,8) BI(x,9) BI(x,a) BI(x,b) \ - BI(x,c) BI(x,d) - -/* - * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: - * (these are usually mapped to vectors 0x20-0x2f) - */ -BUILD_16_IRQS(0x0) - -#ifdef CONFIG_X86_LOCAL_APIC -/* - * The IO-APIC gives us many more interrupt sources. Most of these - * are unused but an SMP system is supposed to have enough memory ... - * sometimes (mostly wrt. hw bugs) we get corrupted vectors all - * across the spectrum, so we really want to be prepared to get all - * of these. Plus, more powerful systems might have more than 64 - * IO-APIC registers. - * - * (these are usually mapped into the 0x30-0xff vector range) - */ - BUILD_16_IRQS(0x1) BUILD_16_IRQS(0x2) BUILD_16_IRQS(0x3) -BUILD_16_IRQS(0x4) BUILD_16_IRQS(0x5) BUILD_16_IRQS(0x6) BUILD_16_IRQS(0x7) -BUILD_16_IRQS(0x8) BUILD_16_IRQS(0x9) BUILD_16_IRQS(0xa) BUILD_16_IRQS(0xb) -BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) - -#ifdef CONFIG_PCI_MSI - BUILD_14_IRQS(0xe) -#endif - -#endif - -#undef BUILD_16_IRQS -#undef BUILD_14_IRQS -#undef BI - - -#define IRQ(x,y) \ - IRQ##x##y##_interrupt - -#define IRQLIST_16(x) \ - IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \ - IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \ - IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \ - IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f) - -#define IRQLIST_14(x) \ - IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \ - IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \ - IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \ - IRQ(x,c), IRQ(x,d) - -void (*interrupt[NR_IRQS])(void) = { - IRQLIST_16(0x0), - -#ifdef CONFIG_X86_IO_APIC - IRQLIST_16(0x1), IRQLIST_16(0x2), IRQLIST_16(0x3), - IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7), - IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb), - IRQLIST_16(0xc), IRQLIST_16(0xd) - -#ifdef CONFIG_PCI_MSI - , IRQLIST_14(0xe) -#endif - -#endif -}; - -#undef IRQ -#undef IRQLIST_16 -#undef IRQLIST_14 - -/* - * This is the 'legacy' 8259A Programmable Interrupt Controller, - * present in the majority of PC/AT boxes. - * plus some generic x86 specific things if generic specifics makes - * any sense at all. - * this file should become arch/i386/kernel/irq.c when the old irq.c - * moves to arch independent land - */ - -DEFINE_SPINLOCK(i8259A_lock); - -static void end_8259A_irq (unsigned int irq) -{ - if (irq > 256) { - char var; - printk("return %p stack %p ti %p\n", __builtin_return_address(0), &var, current->thread_info); - - BUG(); - } - - if (!(irq_desc[irq].status & (IRQ_DISABLED|IRQ_INPROGRESS)) && - irq_desc[irq].action) - enable_8259A_irq(irq); -} - -#define shutdown_8259A_irq disable_8259A_irq - -static void mask_and_ack_8259A(unsigned int); - -static unsigned int startup_8259A_irq(unsigned int irq) -{ - enable_8259A_irq(irq); - return 0; /* never anything pending */ -} - -static struct hw_interrupt_type i8259A_irq_type = { - .typename = "XT-PIC", - .startup = startup_8259A_irq, - .shutdown = shutdown_8259A_irq, - .enable = enable_8259A_irq, - .disable = disable_8259A_irq, - .ack = mask_and_ack_8259A, - .end = end_8259A_irq, -}; - -/* - * 8259A PIC functions to handle ISA devices: - */ - -/* - * This contains the irq mask for both 8259A irq controllers, - */ -static unsigned int cached_irq_mask = 0xffff; - -#define __byte(x,y) (((unsigned char *)&(y))[x]) -#define cached_21 (__byte(0,cached_irq_mask)) -#define cached_A1 (__byte(1,cached_irq_mask)) - -/* - * Not all IRQs can be routed through the IO-APIC, eg. on certain (older) - * boards the timer interrupt is not really connected to any IO-APIC pin, - * it's fed to the master 8259A's IR0 line only. - * - * Any '1' bit in this mask means the IRQ is routed through the IO-APIC. - * this 'mixed mode' IRQ handling costs nothing because it's only used - * at IRQ setup time. - */ -unsigned long io_apic_irqs; - -void disable_8259A_irq(unsigned int irq) -{ - unsigned int mask = 1 << irq; - unsigned long flags; - - spin_lock_irqsave(&i8259A_lock, flags); - cached_irq_mask |= mask; - if (irq & 8) - outb(cached_A1,0xA1); - else - outb(cached_21,0x21); - spin_unlock_irqrestore(&i8259A_lock, flags); -} - -void enable_8259A_irq(unsigned int irq) -{ - unsigned int mask = ~(1 << irq); - unsigned long flags; - - spin_lock_irqsave(&i8259A_lock, flags); - cached_irq_mask &= mask; - if (irq & 8) - outb(cached_A1,0xA1); - else - outb(cached_21,0x21); - spin_unlock_irqrestore(&i8259A_lock, flags); -} - -int i8259A_irq_pending(unsigned int irq) -{ - unsigned int mask = 1<<irq; - unsigned long flags; - int ret; - - spin_lock_irqsave(&i8259A_lock, flags); - if (irq < 8) - ret = inb(0x20) & mask; - else - ret = inb(0xA0) & (mask >> 8); - spin_unlock_irqrestore(&i8259A_lock, flags); - - return ret; -} - -void make_8259A_irq(unsigned int irq) -{ - disable_irq_nosync(irq); - io_apic_irqs &= ~(1<<irq); - irq_desc[irq].handler = &i8259A_irq_type; - enable_irq(irq); -} - -/* - * This function assumes to be called rarely. Switching between - * 8259A registers is slow. - * This has to be protected by the irq controller spinlock - * before being called. - */ -static inline int i8259A_irq_real(unsigned int irq) -{ - int value; - int irqmask = 1<<irq; - - if (irq < 8) { - outb(0x0B,0x20); /* ISR register */ - value = inb(0x20) & irqmask; - outb(0x0A,0x20); /* back to the IRR register */ - return value; - } - outb(0x0B,0xA0); /* ISR register */ - value = inb(0xA0) & (irqmask >> 8); - outb(0x0A,0xA0); /* back to the IRR register */ - return value; -} - -/* - * Careful! The 8259A is a fragile beast, it pretty - * much _has_ to be done exactly like this (mask it - * first, _then_ send the EOI, and the order of EOI - * to the two 8259s is important! - */ -static void mask_and_ack_8259A(unsigned int irq) -{ - unsigned int irqmask = 1 << irq; - unsigned long flags; - - spin_lock_irqsave(&i8259A_lock, flags); - /* - * Lightweight spurious IRQ detection. We do not want - * to overdo spurious IRQ handling - it's usually a sign - * of hardware problems, so we only do the checks we can - * do without slowing down good hardware unnecesserily. - * - * Note that IRQ7 and IRQ15 (the two spurious IRQs - * usually resulting from the 8259A-1|2 PICs) occur - * even if the IRQ is masked in the 8259A. Thus we - * can check spurious 8259A IRQs without doing the - * quite slow i8259A_irq_real() call for every IRQ. - * This does not cover 100% of spurious interrupts, - * but should be enough to warn the user that there - * is something bad going on ... - */ - if (cached_irq_mask & irqmask) - goto spurious_8259A_irq; - cached_irq_mask |= irqmask; - -handle_real_irq: - if (irq & 8) { - inb(0xA1); /* DUMMY - (do we need this?) */ - outb(cached_A1,0xA1); - outb(0x60+(irq&7),0xA0);/* 'Specific EOI' to slave */ - outb(0x62,0x20); /* 'Specific EOI' to master-IRQ2 */ - } else { - inb(0x21); /* DUMMY - (do we need this?) */ - outb(cached_21,0x21); - outb(0x60+irq,0x20); /* 'Specific EOI' to master */ - } - spin_unlock_irqrestore(&i8259A_lock, flags); - return; - -spurious_8259A_irq: - /* - * this is the slow path - should happen rarely. - */ - if (i8259A_irq_real(irq)) - /* - * oops, the IRQ _is_ in service according to the - * 8259A - not spurious, go handle it. - */ - goto handle_real_irq; - - { - static int spurious_irq_mask; - /* - * At this point we can be sure the IRQ is spurious, - * lets ACK and report it. [once per IRQ] - */ - if (!(spurious_irq_mask & irqmask)) { - printk(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq); - spurious_irq_mask |= irqmask; - } - atomic_inc(&irq_err_count); - /* - * Theoretically we do not have to handle this IRQ, - * but in Linux this does not cause problems and is - * simpler for us. - */ - goto handle_real_irq; - } -} - -void init_8259A(int auto_eoi) -{ - unsigned long flags; - - spin_lock_irqsave(&i8259A_lock, flags); - - outb(0xff, 0x21); /* mask all of 8259A-1 */ - outb(0xff, 0xA1); /* mask all of 8259A-2 */ - - /* - * outb_p - this has to work on a wide range of PC hardware. - */ - outb_p(0x11, 0x20); /* ICW1: select 8259A-1 init */ - outb_p(0x20 + 0, 0x21); /* ICW2: 8259A-1 IR0-7 mapped to 0x20-0x27 */ - outb_p(0x04, 0x21); /* 8259A-1 (the master) has a slave on IR2 */ - if (auto_eoi) - outb_p(0x03, 0x21); /* master does Auto EOI */ - else - outb_p(0x01, 0x21); /* master expects normal EOI */ - - outb_p(0x11, 0xA0); /* ICW1: select 8259A-2 init */ - outb_p(0x20 + 8, 0xA1); /* ICW2: 8259A-2 IR0-7 mapped to 0x28-0x2f */ - outb_p(0x02, 0xA1); /* 8259A-2 is a slave on master's IR2 */ - outb_p(0x01, 0xA1); /* (slave's support for AEOI in flat mode - is to be investigated) */ - - if (auto_eoi) - /* - * in AEOI mode we just have to mask the interrupt - * when acking. - */ - i8259A_irq_type.ack = disable_8259A_irq; - else - i8259A_irq_type.ack = mask_and_ack_8259A; - - udelay(100); /* wait for 8259A to initialize */ - - outb(cached_21, 0x21); /* restore master IRQ mask */ - outb(cached_A1, 0xA1); /* restore slave IRQ mask */ - - spin_unlock_irqrestore(&i8259A_lock, flags); -} - -static char irq_trigger[2]; -/** - * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ - */ -static void restore_ELCR(char *trigger) -{ - outb(trigger[0], 0x4d0); - outb(trigger[1], 0x4d1); -} - -static void save_ELCR(char *trigger) -{ - /* IRQ 0,1,2,8,13 are marked as reserved */ - trigger[0] = inb(0x4d0) & 0xF8; - trigger[1] = inb(0x4d1) & 0xDE; -} - -static int i8259A_resume(struct sys_device *dev) -{ - init_8259A(0); - restore_ELCR(irq_trigger); - return 0; -} - -static int i8259A_suspend(struct sys_device *dev, pm_message_t state) -{ - save_ELCR(irq_trigger); - return 0; -} - -static int i8259A_shutdown(struct sys_device *dev) -{ - /* Put the i8259A into a quiescent state that - * the kernel initialization code can get it - * out of. - */ - outb(0xff, 0x21); /* mask all of 8259A-1 */ - outb(0xff, 0xA1); /* mask all of 8259A-1 */ - return 0; -} - -static struct sysdev_class i8259_sysdev_class = { - set_kset_name("i8259"), - .suspend = i8259A_suspend, - .resume = i8259A_resume, - .shutdown = i8259A_shutdown, -}; - -static struct sys_device device_i8259A = { - .id = 0, - .cls = &i8259_sysdev_class, -}; - -static int __init i8259A_init_sysfs(void) -{ - int error = sysdev_class_register(&i8259_sysdev_class); - if (!error) - error = sysdev_register(&device_i8259A); - return error; -} - -device_initcall(i8259A_init_sysfs); - -/* - * IRQ2 is cascade interrupt to second interrupt controller - */ - -static struct irqaction irq2 = { no_action, 0, CPU_MASK_NONE, "cascade", NULL, NULL}; - -void __init init_ISA_irqs (void) -{ - int i; - -#ifdef CONFIG_X86_LOCAL_APIC - init_bsp_APIC(); -#endif - init_8259A(0); - - for (i = 0; i < NR_IRQS; i++) { - irq_desc[i].status = IRQ_DISABLED; - irq_desc[i].action = NULL; - irq_desc[i].depth = 1; - - if (i < 16) { - /* - * 16 old-style INTA-cycle interrupts: - */ - irq_desc[i].handler = &i8259A_irq_type; - } else { - /* - * 'high' PCI IRQs filled in on demand - */ - irq_desc[i].handler = &no_irq_type; - } - } -} - -void apic_timer_interrupt(void); -void spurious_interrupt(void); -void error_interrupt(void); -void reschedule_interrupt(void); -void call_function_interrupt(void); -void invalidate_interrupt(void); -void thermal_interrupt(void); -void i8254_timer_resume(void); - -static void setup_timer(void) -{ - outb_p(0x34,0x43); /* binary, mode 2, LSB/MSB, ch 0 */ - udelay(10); - outb_p(LATCH & 0xff , 0x40); /* LSB */ - udelay(10); - outb(LATCH >> 8 , 0x40); /* MSB */ -} - -static int timer_resume(struct sys_device *dev) -{ - setup_timer(); - return 0; -} - -void i8254_timer_resume(void) -{ - setup_timer(); -} - -static struct sysdev_class timer_sysclass = { - set_kset_name("timer"), - .resume = timer_resume, -}; - -static struct sys_device device_timer = { - .id = 0, - .cls = &timer_sysclass, -}; - -static int __init init_timer_sysfs(void) -{ - int error = sysdev_class_register(&timer_sysclass); - if (!error) - error = sysdev_register(&device_timer); - return error; -} - -device_initcall(init_timer_sysfs); - -void __init init_IRQ(void) -{ - int i; - - init_ISA_irqs(); - /* - * Cover the whole vector space, no vector can escape - * us. (some of these will be overridden and become - * 'special' SMP interrupts) - */ - for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { - int vector = FIRST_EXTERNAL_VECTOR + i; - if (i >= NR_IRQS) - break; - if (vector != IA32_SYSCALL_VECTOR && vector != KDB_VECTOR) { - set_intr_gate(vector, interrupt[i]); - } - } - -#ifdef CONFIG_SMP - /* - * IRQ0 must be given a fixed assignment and initialized, - * because it's used before the IO-APIC is set up. - */ - set_intr_gate(FIRST_DEVICE_VECTOR, interrupt[0]); - - /* - * The reschedule interrupt is a CPU-to-CPU reschedule-helper - * IPI, driven by wakeup. - */ - set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); - - /* IPI for invalidation */ - set_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt); - - /* IPI for generic function call */ - set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); -#endif - set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); - -#ifdef CONFIG_X86_LOCAL_APIC - /* self generated IPI for local APIC timer */ - set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); - - /* IPI vectors for APIC spurious and error interrupts */ - set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); - set_intr_gate(ERROR_APIC_VECTOR, error_interrupt); -#endif - - /* - * Set the clock to HZ Hz, we already have a valid - * vector now: - */ - setup_timer(); - - if (!acpi_ioapic) - setup_irq(2, &irq2); -} diff --git a/arch/x86_64/kernel/init_task.c b/arch/x86_64/kernel/init_task.c deleted file mode 100644 index c4dc9149115..00000000000 --- a/arch/x86_64/kernel/init_task.c +++ /dev/null @@ -1,49 +0,0 @@ -#include <linux/mm.h> -#include <linux/module.h> -#include <linux/sched.h> -#include <linux/init.h> -#include <linux/init_task.h> -#include <linux/fs.h> -#include <linux/mqueue.h> - -#include <asm/uaccess.h> -#include <asm/pgtable.h> -#include <asm/desc.h> - -static struct fs_struct init_fs = INIT_FS; -static struct files_struct init_files = INIT_FILES; -static struct signal_struct init_signals = INIT_SIGNALS(init_signals); -static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -struct mm_struct init_mm = INIT_MM(init_mm); - -EXPORT_SYMBOL(init_mm); - -/* - * Initial task structure. - * - * We need to make sure that this is 8192-byte aligned due to the - * way process stacks are handled. This is done by having a special - * "init_task" linker map entry.. - */ -union thread_union init_thread_union - __attribute__((__section__(".data.init_task"))) = - { INIT_THREAD_INFO(init_task) }; - -/* - * Initial task structure. - * - * All other task structs will be allocated on slabs in fork.c - */ -struct task_struct init_task = INIT_TASK(init_task); - -EXPORT_SYMBOL(init_task); -/* - * per-CPU TSS segments. Threads are completely 'soft' on Linux, - * no more per-task TSS's. The TSS size is kept cacheline-aligned - * so they are allowed to end up in the .data.cacheline_aligned - * section. Since TSS's are completely CPU-local, we want them - * on exact cacheline boundaries, to eliminate cacheline ping-pong. - */ -DEFINE_PER_CPU(struct tss_struct, init_tss) ____cacheline_maxaligned_in_smp; - -#define ALIGN_TO_4K __attribute__((section(".data.init_task"))) diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c deleted file mode 100644 index d206d7e49cf..00000000000 --- a/arch/x86_64/kernel/io_apic.c +++ /dev/null @@ -1,1951 +0,0 @@ -/* - * Intel IO-APIC support for multi-Pentium hosts. - * - * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo - * - * Many thanks to Stig Venaas for trying out countless experimental - * patches and reporting/debugging problems patiently! - * - * (c) 1999, Multiple IO-APIC support, developed by - * Ken-ichi Yaku <yaku@css1.kbnes.nec.co.jp> and - * Hidemi Kishimoto <kisimoto@css1.kbnes.nec.co.jp>, - * further tested and cleaned up by Zach Brown <zab@redhat.com> - * and Ingo Molnar <mingo@redhat.com> - * - * Fixes - * Maciej W. Rozycki : Bits for genuine 82489DX APICs; - * thanks to Eric Gilmore - * and Rolf G. Tews - * for testing these extensively - * Paul Diefenbaugh : Added full ACPI support - */ - -#include <linux/mm.h> -#include <linux/irq.h> -#include <linux/interrupt.h> -#include <linux/init.h> -#include <linux/delay.h> -#include <linux/sched.h> -#include <linux/config.h> -#include <linux/smp_lock.h> -#include <linux/mc146818rtc.h> -#include <linux/acpi.h> -#include <linux/sysdev.h> - -#include <asm/io.h> -#include <asm/smp.h> -#include <asm/desc.h> -#include <asm/proto.h> -#include <asm/mach_apic.h> -#include <asm/acpi.h> - -#define __apicdebuginit __init - -int sis_apic_bug; /* not actually supported, dummy for compile */ - -static int no_timer_check; - -static DEFINE_SPINLOCK(ioapic_lock); - -/* - * # of IRQ routing registers - */ -int nr_ioapic_registers[MAX_IO_APICS]; - -/* - * Rough estimation of how many shared IRQs there are, can - * be changed anytime. - */ -#define MAX_PLUS_SHARED_IRQS NR_IRQS -#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS) - -/* - * This is performance-critical, we want to do it O(1) - * - * the indexing order of this array favors 1:1 mappings - * between pins and IRQs. - */ - -static struct irq_pin_list { - short apic, pin, next; -} irq_2_pin[PIN_MAP_SIZE]; - -int vector_irq[NR_VECTORS] = { [0 ... NR_VECTORS - 1] = -1}; -#ifdef CONFIG_PCI_MSI -#define vector_to_irq(vector) \ - (platform_legacy_irq(vector) ? vector : vector_irq[vector]) -#else -#define vector_to_irq(vector) (vector) -#endif - -/* - * The common case is 1:1 IRQ<->pin mappings. Sometimes there are - * shared ISA-space IRQs, so we have to support them. We are super - * fast in the common case, and fast for shared ISA-space IRQs. - */ -static void add_pin_to_irq(unsigned int irq, int apic, int pin) -{ - static int first_free_entry = NR_IRQS; - struct irq_pin_list *entry = irq_2_pin + irq; - - while (entry->next) - entry = irq_2_pin + entry->next; - - if (entry->pin != -1) { - entry->next = first_free_entry; - entry = irq_2_pin + entry->next; - if (++first_free_entry >= PIN_MAP_SIZE) - panic("io_apic.c: whoops"); - } - entry->apic = apic; - entry->pin = pin; -} - -#define __DO_ACTION(R, ACTION, FINAL) \ - \ -{ \ - int pin; \ - struct irq_pin_list *entry = irq_2_pin + irq; \ - \ - for (;;) { \ - unsigned int reg; \ - pin = entry->pin; \ - if (pin == -1) \ - break; \ - reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \ - reg ACTION; \ - io_apic_modify(entry->apic, reg); \ - if (!entry->next) \ - break; \ - entry = irq_2_pin + entry->next; \ - } \ - FINAL; \ -} - -#define DO_ACTION(name,R,ACTION, FINAL) \ - \ - static void name##_IO_APIC_irq (unsigned int irq) \ - __DO_ACTION(R, ACTION, FINAL) - -DO_ACTION( __mask, 0, |= 0x00010000, io_apic_sync(entry->apic) ) - /* mask = 1 */ -DO_ACTION( __unmask, 0, &= 0xfffeffff, ) - /* mask = 0 */ - -static void mask_IO_APIC_irq (unsigned int irq) -{ - unsigned long flags; - - spin_lock_irqsave(&ioapic_lock, flags); - __mask_IO_APIC_irq(irq); - spin_unlock_irqrestore(&ioapic_lock, flags); -} - -static void unmask_IO_APIC_irq (unsigned int irq) -{ - unsigned long flags; - - spin_lock_irqsave(&ioapic_lock, flags); - __unmask_IO_APIC_irq(irq); - spin_unlock_irqrestore(&ioapic_lock, flags); -} - -static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) -{ - struct IO_APIC_route_entry entry; - unsigned long flags; - - /* Check delivery_mode to be sure we're not clearing an SMI pin */ - spin_lock_irqsave(&ioapic_lock, flags); - *(((int*)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin); - *(((int*)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin); - spin_unlock_irqrestore(&ioapic_lock, flags); - if (entry.delivery_mode == dest_SMI) - return; - /* - * Disable it in the IO-APIC irq-routing table: - */ - memset(&entry, 0, sizeof(entry)); - entry.mask = 1; - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry) + 0)); - io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry) + 1)); - spin_unlock_irqrestore(&ioapic_lock, flags); -} - -static void clear_IO_APIC (void) -{ - int apic, pin; - - for (apic = 0; apic < nr_ioapics; apic++) - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) - clear_IO_APIC_pin(apic, pin); -} - -/* - * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to - * specific CPU-side IRQs. - */ - -#define MAX_PIRQS 8 -static int pirq_entries [MAX_PIRQS]; -static int pirqs_enabled; -int skip_ioapic_setup; -int ioapic_force; - -/* dummy parsing: see setup.c */ - -static int __init disable_ioapic_setup(char *str) -{ - skip_ioapic_setup = 1; - return 1; -} - -static int __init enable_ioapic_setup(char *str) -{ - ioapic_force = 1; - skip_ioapic_setup = 0; - return 1; -} - -__setup("noapic", disable_ioapic_setup); -__setup("apic", enable_ioapic_setup); - -#include <asm/pci-direct.h> -#include <linux/pci_ids.h> -#include <linux/pci.h> - -/* Temporary Hack. Nvidia and VIA boards currently only work with IO-APIC - off. Check for an Nvidia or VIA PCI bridge and turn it off. - Use pci direct infrastructure because this runs before the PCI subsystem. - - Can be overwritten with "apic" - - And another hack to disable the IOMMU on VIA chipsets. - - Kludge-O-Rama. */ -void __init check_ioapic(void) -{ - int num,slot,func; - if (ioapic_force) - return; - - /* Poor man's PCI discovery */ - for (num = 0; num < 32; num++) { - for (slot = 0; slot < 32; slot++) { - for (func = 0; func < 8; func++) { - u32 class; - u32 vendor; - u8 type; - class = read_pci_config(num,slot,func, - PCI_CLASS_REVISION); - if (class == 0xffffffff) - break; - - if ((class >> 16) != PCI_CLASS_BRIDGE_PCI) - continue; - - vendor = read_pci_config(num, slot, func, - PCI_VENDOR_ID); - vendor &= 0xffff; - switch (vendor) { - case PCI_VENDOR_ID_VIA: -#ifdef CONFIG_GART_IOMMU - if ((end_pfn >= (0xffffffff>>PAGE_SHIFT) || - force_iommu) && - !iommu_aperture_allowed) { - printk(KERN_INFO - "Looks like a VIA chipset. Disabling IOMMU. Overwrite with \"iommu=allowed\"\n"); - iommu_aperture_disabled = 1; - } -#endif - return; - case PCI_VENDOR_ID_NVIDIA: -#ifdef CONFIG_ACPI - /* All timer overrides on Nvidia - seem to be wrong. Skip them. */ - acpi_skip_timer_override = 1; - printk(KERN_INFO - "Nvidia board detected. Ignoring ACPI timer override.\n"); -#endif - /* RED-PEN skip them on mptables too? */ - return; - } - - /* No multi-function device? */ - type = read_pci_config_byte(num,slot,func, - PCI_HEADER_TYPE); - if (!(type & 0x80)) - break; - } - } - } -} - -static int __init ioapic_pirq_setup(char *str) -{ - int i, max; - int ints[MAX_PIRQS+1]; - - get_options(str, ARRAY_SIZE(ints), ints); - - for (i = 0; i < MAX_PIRQS; i++) - pirq_entries[i] = -1; - - pirqs_enabled = 1; - apic_printk(APIC_VERBOSE, "PIRQ redirection, working around broken MP-BIOS.\n"); - max = MAX_PIRQS; - if (ints[0] < MAX_PIRQS) - max = ints[0]; - - for (i = 0; i < max; i++) { - apic_printk(APIC_VERBOSE, "... PIRQ%d -> IRQ %d\n", i, ints[i+1]); - /* - * PIRQs are mapped upside down, usually. - */ - pirq_entries[MAX_PIRQS-i-1] = ints[i+1]; - } - return 1; -} - -__setup("pirq=", ioapic_pirq_setup); - -/* - * Find the IRQ entry number of a certain pin. - */ -static int find_irq_entry(int apic, int pin, int type) -{ - int i; - - for (i = 0; i < mp_irq_entries; i++) - if (mp_irqs[i].mpc_irqtype == type && - (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid || - mp_irqs[i].mpc_dstapic == MP_APIC_ALL) && - mp_irqs[i].mpc_dstirq == pin) - return i; - - return -1; -} - -/* - * Find the pin to which IRQ[irq] (ISA) is connected - */ -static int find_isa_irq_pin(int irq, int type) -{ - int i; - - for (i = 0; i < mp_irq_entries; i++) { - int lbus = mp_irqs[i].mpc_srcbus; - - if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA || - mp_bus_id_to_type[lbus] == MP_BUS_EISA || - mp_bus_id_to_type[lbus] == MP_BUS_MCA) && - (mp_irqs[i].mpc_irqtype == type) && - (mp_irqs[i].mpc_srcbusirq == irq)) - - return mp_irqs[i].mpc_dstirq; - } - return -1; -} - -/* - * Find a specific PCI IRQ entry. - * Not an __init, possibly needed by modules - */ -static int pin_2_irq(int idx, int apic, int pin); - -int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) -{ - int apic, i, best_guess = -1; - - apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n", - bus, slot, pin); - if (mp_bus_id_to_pci_bus[bus] == -1) { - apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus); - return -1; - } - for (i = 0; i < mp_irq_entries; i++) { - int lbus = mp_irqs[i].mpc_srcbus; - - for (apic = 0; apic < nr_ioapics; apic++) - if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic || - mp_irqs[i].mpc_dstapic == MP_APIC_ALL) - break; - - if ((mp_bus_id_to_type[lbus] == MP_BUS_PCI) && - !mp_irqs[i].mpc_irqtype && - (bus == lbus) && - (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) { - int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq); - - if (!(apic || IO_APIC_IRQ(irq))) - continue; - - if (pin == (mp_irqs[i].mpc_srcbusirq & 3)) - return irq; - /* - * Use the first all-but-pin matching entry as a - * best-guess fuzzy result for broken mptables. - */ - if (best_guess < 0) - best_guess = irq; - } - } - return best_guess; -} - -/* - * EISA Edge/Level control register, ELCR - */ -static int EISA_ELCR(unsigned int irq) -{ - if (irq < 16) { - unsigned int port = 0x4d0 + (irq >> 3); - return (inb(port) >> (irq & 7)) & 1; - } - apic_printk(APIC_VERBOSE, "Broken MPtable reports ISA irq %d\n", irq); - return 0; -} - -/* EISA interrupts are always polarity zero and can be edge or level - * trigger depending on the ELCR value. If an interrupt is listed as - * EISA conforming in the MP table, that means its trigger type must - * be read in from the ELCR */ - -#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq)) -#define default_EISA_polarity(idx) (0) - -/* ISA interrupts are always polarity zero edge triggered, - * when listed as conforming in the MP table. */ - -#define default_ISA_trigger(idx) (0) -#define default_ISA_polarity(idx) (0) - -/* PCI interrupts are always polarity one level triggered, - * when listed as conforming in the MP table. */ - -#define default_PCI_trigger(idx) (1) -#define default_PCI_polarity(idx) (1) - -/* MCA interrupts are always polarity zero level triggered, - * when listed as conforming in the MP table. */ - -#define default_MCA_trigger(idx) (1) -#define default_MCA_polarity(idx) (0) - -static int __init MPBIOS_polarity(int idx) -{ - int bus = mp_irqs[idx].mpc_srcbus; - int polarity; - - /* - * Determine IRQ line polarity (high active or low active): - */ - switch (mp_irqs[idx].mpc_irqflag & 3) - { - case 0: /* conforms, ie. bus-type dependent polarity */ - { - switch (mp_bus_id_to_type[bus]) - { - case MP_BUS_ISA: /* ISA pin */ - { - polarity = default_ISA_polarity(idx); - break; - } - case MP_BUS_EISA: /* EISA pin */ - { - polarity = default_EISA_polarity(idx); - break; - } - case MP_BUS_PCI: /* PCI pin */ - { - polarity = default_PCI_polarity(idx); - break; - } - case MP_BUS_MCA: /* MCA pin */ - { - polarity = default_MCA_polarity(idx); - break; - } - default: - { - printk(KERN_WARNING "broken BIOS!!\n"); - polarity = 1; - break; - } - } - break; - } - case 1: /* high active */ - { - polarity = 0; - break; - } - case 2: /* reserved */ - { - printk(KERN_WARNING "broken BIOS!!\n"); - polarity = 1; - break; - } - case 3: /* low active */ - { - polarity = 1; - break; - } - default: /* invalid */ - { - printk(KERN_WARNING "broken BIOS!!\n"); - polarity = 1; - break; - } - } - return polarity; -} - -static int MPBIOS_trigger(int idx) -{ - int bus = mp_irqs[idx].mpc_srcbus; - int trigger; - - /* - * Determine IRQ trigger mode (edge or level sensitive): - */ - switch ((mp_irqs[idx].mpc_irqflag>>2) & 3) - { - case 0: /* conforms, ie. bus-type dependent */ - { - switch (mp_bus_id_to_type[bus]) - { - case MP_BUS_ISA: /* ISA pin */ - { - trigger = default_ISA_trigger(idx); - break; - } - case MP_BUS_EISA: /* EISA pin */ - { - trigger = default_EISA_trigger(idx); - break; - } - case MP_BUS_PCI: /* PCI pin */ - { - trigger = default_PCI_trigger(idx); - break; - } - case MP_BUS_MCA: /* MCA pin */ - { - trigger = default_MCA_trigger(idx); - break; - } - default: - { - printk(KERN_WARNING "broken BIOS!!\n"); - trigger = 1; - break; - } - } - break; - } - case 1: /* edge */ - { - trigger = 0; - break; - } - case 2: /* reserved */ - { - printk(KERN_WARNING "broken BIOS!!\n"); - trigger = 1; - break; - } - case 3: /* level */ - { - trigger = 1; - break; - } - default: /* invalid */ - { - printk(KERN_WARNING "broken BIOS!!\n"); - trigger = 0; - break; - } - } - return trigger; -} - -static inline int irq_polarity(int idx) -{ - return MPBIOS_polarity(idx); -} - -static inline int irq_trigger(int idx) -{ - return MPBIOS_trigger(idx); -} - -static int pin_2_irq(int idx, int apic, int pin) -{ - int irq, i; - int bus = mp_irqs[idx].mpc_srcbus; - - /* - * Debugging check, we are in big trouble if this message pops up! - */ - if (mp_irqs[idx].mpc_dstirq != pin) - printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); - - switch (mp_bus_id_to_type[bus]) - { - case MP_BUS_ISA: /* ISA pin */ - case MP_BUS_EISA: - case MP_BUS_MCA: - { - irq = mp_irqs[idx].mpc_srcbusirq; - break; - } - case MP_BUS_PCI: /* PCI pin */ - { - /* - * PCI IRQs are mapped in order - */ - i = irq = 0; - while (i < apic) - irq += nr_ioapic_registers[i++]; - irq += pin; - break; - } - default: - { - printk(KERN_ERR "unknown bus type %d.\n",bus); - irq = 0; - break; - } - } - - /* - * PCI IRQ command line redirection. Yes, limits are hardcoded. - */ - if ((pin >= 16) && (pin <= 23)) { - if (pirq_entries[pin-16] != -1) { - if (!pirq_entries[pin-16]) { - apic_printk(APIC_VERBOSE, "disabling PIRQ%d\n", pin-16); - } else { - irq = pirq_entries[pin-16]; - apic_printk(APIC_VERBOSE, "using PIRQ%d -> IRQ %d\n", - pin-16, irq); - } - } - } - return irq; -} - -static inline int IO_APIC_irq_trigger(int irq) -{ - int apic, idx, pin; - - for (apic = 0; apic < nr_ioapics; apic++) { - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { - idx = find_irq_entry(apic,pin,mp_INT); - if ((idx != -1) && (irq == pin_2_irq(idx,apic,pin))) - return irq_trigger(idx); - } - } - /* - * nonexistent IRQs are edge default - */ - return 0; -} - -/* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */ -u8 irq_vector[NR_IRQ_VECTORS] = { FIRST_DEVICE_VECTOR , 0 }; - -int assign_irq_vector(int irq) -{ - static int current_vector = FIRST_DEVICE_VECTOR, offset = 0; - - BUG_ON(irq >= NR_IRQ_VECTORS); - if (IO_APIC_VECTOR(irq) > 0) - return IO_APIC_VECTOR(irq); -next: - current_vector += 8; - if (current_vector == IA32_SYSCALL_VECTOR) - goto next; - - if (current_vector >= FIRST_SYSTEM_VECTOR) { - offset++; - if (!(offset%8)) - return -ENOSPC; - current_vector = FIRST_DEVICE_VECTOR + offset; - } - - vector_irq[current_vector] = irq; - if (irq != AUTO_ASSIGN) - IO_APIC_VECTOR(irq) = current_vector; - - return current_vector; -} - -extern void (*interrupt[NR_IRQS])(void); -static struct hw_interrupt_type ioapic_level_type; -static struct hw_interrupt_type ioapic_edge_type; - -#define IOAPIC_AUTO -1 -#define IOAPIC_EDGE 0 -#define IOAPIC_LEVEL 1 - -static inline void ioapic_register_intr(int irq, int vector, unsigned long trigger) -{ - if (use_pci_vector() && !platform_legacy_irq(irq)) { - if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || - trigger == IOAPIC_LEVEL) - irq_desc[vector].handler = &ioapic_level_type; - else - irq_desc[vector].handler = &ioapic_edge_type; - set_intr_gate(vector, interrupt[vector]); - } else { - if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || - trigger == IOAPIC_LEVEL) - irq_desc[irq].handler = &ioapic_level_type; - else - irq_desc[irq].handler = &ioapic_edge_type; - set_intr_gate(vector, interrupt[irq]); - } -} - -static void __init setup_IO_APIC_irqs(void) -{ - struct IO_APIC_route_entry entry; - int apic, pin, idx, irq, first_notcon = 1, vector; - unsigned long flags; - - apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); - - for (apic = 0; apic < nr_ioapics; apic++) { - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { - - /* - * add it to the IO-APIC irq-routing table: - */ - memset(&entry,0,sizeof(entry)); - - entry.delivery_mode = INT_DELIVERY_MODE; - entry.dest_mode = INT_DEST_MODE; - entry.mask = 0; /* enable IRQ */ - entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); - - idx = find_irq_entry(apic,pin,mp_INT); - if (idx == -1) { - if (first_notcon) { - apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mpc_apicid, pin); - first_notcon = 0; - } else - apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mpc_apicid, pin); - continue; - } - - entry.trigger = irq_trigger(idx); - entry.polarity = irq_polarity(idx); - - if (irq_trigger(idx)) { - entry.trigger = 1; - entry.mask = 1; - entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); - } - - irq = pin_2_irq(idx, apic, pin); - add_pin_to_irq(irq, apic, pin); - - if (!apic && !IO_APIC_IRQ(irq)) - continue; - - if (IO_APIC_IRQ(irq)) { - vector = assign_irq_vector(irq); - entry.vector = vector; - - ioapic_register_intr(irq, vector, IOAPIC_AUTO); - if (!apic && (irq < 16)) - disable_8259A_irq(irq); - } - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1)); - io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0)); - spin_unlock_irqrestore(&ioapic_lock, flags); - } - } - - if (!first_notcon) - apic_printk(APIC_VERBOSE," not connected.\n"); -} - -/* - * Set up the 8259A-master output pin as broadcast to all - * CPUs. - */ -static void __init setup_ExtINT_IRQ0_pin(unsigned int pin, int vector) -{ - struct IO_APIC_route_entry entry; - unsigned long flags; - - memset(&entry,0,sizeof(entry)); - - disable_8259A_irq(0); - - /* mask LVT0 */ - apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); - - /* - * We use logical delivery to get the timer IRQ - * to the first CPU. - */ - entry.dest_mode = INT_DEST_MODE; - entry.mask = 0; /* unmask IRQ now */ - entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); - entry.delivery_mode = INT_DELIVERY_MODE; - entry.polarity = 0; - entry.trigger = 0; - entry.vector = vector; - - /* - * The timer IRQ doesn't have to know that behind the - * scene we have a 8259A-master in AEOI mode ... - */ - irq_desc[0].handler = &ioapic_edge_type; - - /* - * Add it to the IO-APIC irq-routing table: - */ - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(0, 0x11+2*pin, *(((int *)&entry)+1)); - io_apic_write(0, 0x10+2*pin, *(((int *)&entry)+0)); - spin_unlock_irqrestore(&ioapic_lock, flags); - - enable_8259A_irq(0); -} - -void __init UNEXPECTED_IO_APIC(void) -{ -} - -void __apicdebuginit print_IO_APIC(void) -{ - int apic, i; - union IO_APIC_reg_00 reg_00; - union IO_APIC_reg_01 reg_01; - union IO_APIC_reg_02 reg_02; - unsigned long flags; - - if (apic_verbosity == APIC_QUIET) - return; - - printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); - for (i = 0; i < nr_ioapics; i++) - printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", - mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]); - - /* - * We are a bit conservative about what we expect. We have to - * know about every hardware change ASAP. - */ - printk(KERN_INFO "testing the IO APIC.......................\n"); - - for (apic = 0; apic < nr_ioapics; apic++) { - - spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(apic, 0); - reg_01.raw = io_apic_read(apic, 1); - if (reg_01.bits.version >= 0x10) - reg_02.raw = io_apic_read(apic, 2); - spin_unlock_irqrestore(&ioapic_lock, flags); - - printk("\n"); - printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid); - printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); - printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); - if (reg_00.bits.__reserved_1 || reg_00.bits.__reserved_2) - UNEXPECTED_IO_APIC(); - - printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)®_01); - printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries); - if ( (reg_01.bits.entries != 0x0f) && /* older (Neptune) boards */ - (reg_01.bits.entries != 0x17) && /* typical ISA+PCI boards */ - (reg_01.bits.entries != 0x1b) && /* Compaq Proliant boards */ - (reg_01.bits.entries != 0x1f) && /* dual Xeon boards */ - (reg_01.bits.entries != 0x22) && /* bigger Xeon boards */ - (reg_01.bits.entries != 0x2E) && - (reg_01.bits.entries != 0x3F) && - (reg_01.bits.entries != 0x03) - ) - UNEXPECTED_IO_APIC(); - - printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ); - printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version); - if ( (reg_01.bits.version != 0x01) && /* 82489DX IO-APICs */ - (reg_01.bits.version != 0x02) && /* 82801BA IO-APICs (ICH2) */ - (reg_01.bits.version != 0x10) && /* oldest IO-APICs */ - (reg_01.bits.version != 0x11) && /* Pentium/Pro IO-APICs */ - (reg_01.bits.version != 0x13) && /* Xeon IO-APICs */ - (reg_01.bits.version != 0x20) /* Intel P64H (82806 AA) */ - ) - UNEXPECTED_IO_APIC(); - if (reg_01.bits.__reserved_1 || reg_01.bits.__reserved_2) - UNEXPECTED_IO_APIC(); - - if (reg_01.bits.version >= 0x10) { - printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw); - printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration); - if (reg_02.bits.__reserved_1 || reg_02.bits.__reserved_2) - UNEXPECTED_IO_APIC(); - } - - printk(KERN_DEBUG ".... IRQ redirection table:\n"); - - printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol" - " Stat Dest Deli Vect: \n"); - - for (i = 0; i <= reg_01.bits.entries; i++) { - struct IO_APIC_route_entry entry; - - spin_lock_irqsave(&ioapic_lock, flags); - *(((int *)&entry)+0) = io_apic_read(apic, 0x10+i*2); - *(((int *)&entry)+1) = io_apic_read(apic, 0x11+i*2); - spin_unlock_irqrestore(&ioapic_lock, flags); - - printk(KERN_DEBUG " %02x %03X %02X ", - i, - entry.dest.logical.logical_dest, - entry.dest.physical.physical_dest - ); - - printk("%1d %1d %1d %1d %1d %1d %1d %02X\n", - entry.mask, - entry.trigger, - entry.irr, - entry.polarity, - entry.delivery_status, - entry.dest_mode, - entry.delivery_mode, - entry.vector - ); - } - } - if (use_pci_vector()) - printk(KERN_INFO "Using vector-based indexing\n"); - printk(KERN_DEBUG "IRQ to pin mappings:\n"); - for (i = 0; i < NR_IRQS; i++) { - struct irq_pin_list *entry = irq_2_pin + i; - if (entry->pin < 0) - continue; - if (use_pci_vector() && !platform_legacy_irq(i)) - printk(KERN_DEBUG "IRQ%d ", IO_APIC_VECTOR(i)); - else - printk(KERN_DEBUG "IRQ%d ", i); - for (;;) { - printk("-> %d:%d", entry->apic, entry->pin); - if (!entry->next) - break; - entry = irq_2_pin + entry->next; - } - printk("\n"); - } - - printk(KERN_INFO ".................................... done.\n"); - - return; -} - -#if 0 - -static __apicdebuginit void print_APIC_bitfield (int base) -{ - unsigned int v; - int i, j; - - if (apic_verbosity == APIC_QUIET) - return; - - printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG); - for (i = 0; i < 8; i++) { - v = apic_read(base + i*0x10); - for (j = 0; j < 32; j++) { - if (v & (1<<j)) - printk("1"); - else - printk("0"); - } - printk("\n"); - } -} - -void __apicdebuginit print_local_APIC(void * dummy) -{ - unsigned int v, ver, maxlvt; - - if (apic_verbosity == APIC_QUIET) - return; - - printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n", - smp_processor_id(), hard_smp_processor_id()); - v = apic_read(APIC_ID); - printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(v)); - v = apic_read(APIC_LVR); - printk(KERN_INFO "... APIC VERSION: %08x\n", v); - ver = GET_APIC_VERSION(v); - maxlvt = get_maxlvt(); - - v = apic_read(APIC_TASKPRI); - printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK); - - if (APIC_INTEGRATED(ver)) { /* !82489DX */ - v = apic_read(APIC_ARBPRI); - printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v, - v & APIC_ARBPRI_MASK); - v = apic_read(APIC_PROCPRI); - printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v); - } - - v = apic_read(APIC_EOI); - printk(KERN_DEBUG "... APIC EOI: %08x\n", v); - v = apic_read(APIC_RRR); - printk(KERN_DEBUG "... APIC RRR: %08x\n", v); - v = apic_read(APIC_LDR); - printk(KERN_DEBUG "... APIC LDR: %08x\n", v); - v = apic_read(APIC_DFR); - printk(KERN_DEBUG "... APIC DFR: %08x\n", v); - v = apic_read(APIC_SPIV); - printk(KERN_DEBUG "... APIC SPIV: %08x\n", v); - - printk(KERN_DEBUG "... APIC ISR field:\n"); - print_APIC_bitfield(APIC_ISR); - printk(KERN_DEBUG "... APIC TMR field:\n"); - print_APIC_bitfield(APIC_TMR); - printk(KERN_DEBUG "... APIC IRR field:\n"); - print_APIC_bitfield(APIC_IRR); - - if (APIC_INTEGRATED(ver)) { /* !82489DX */ - if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ - apic_write(APIC_ESR, 0); - v = apic_read(APIC_ESR); - printk(KERN_DEBUG "... APIC ESR: %08x\n", v); - } - - v = apic_read(APIC_ICR); - printk(KERN_DEBUG "... APIC ICR: %08x\n", v); - v = apic_read(APIC_ICR2); - printk(KERN_DEBUG "... APIC ICR2: %08x\n", v); - - v = apic_read(APIC_LVTT); - printk(KERN_DEBUG "... APIC LVTT: %08x\n", v); - - if (maxlvt > 3) { /* PC is LVT#4. */ - v = apic_read(APIC_LVTPC); - printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v); - } - v = apic_read(APIC_LVT0); - printk(KERN_DEBUG "... APIC LVT0: %08x\n", v); - v = apic_read(APIC_LVT1); - printk(KERN_DEBUG "... APIC LVT1: %08x\n", v); - - if (maxlvt > 2) { /* ERR is LVT#3. */ - v = apic_read(APIC_LVTERR); - printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v); - } - - v = apic_read(APIC_TMICT); - printk(KERN_DEBUG "... APIC TMICT: %08x\n", v); - v = apic_read(APIC_TMCCT); - printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v); - v = apic_read(APIC_TDCR); - printk(KERN_DEBUG "... APIC TDCR: %08x\n", v); - printk("\n"); -} - -void print_all_local_APICs (void) -{ - on_each_cpu(print_local_APIC, NULL, 1, 1); -} - -void __apicdebuginit print_PIC(void) -{ - unsigned int v; - unsigned long flags; - - if (apic_verbosity == APIC_QUIET) - return; - - printk(KERN_DEBUG "\nprinting PIC contents\n"); - - spin_lock_irqsave(&i8259A_lock, flags); - - v = inb(0xa1) << 8 | inb(0x21); - printk(KERN_DEBUG "... PIC IMR: %04x\n", v); - - v = inb(0xa0) << 8 | inb(0x20); - printk(KERN_DEBUG "... PIC IRR: %04x\n", v); - - outb(0x0b,0xa0); - outb(0x0b,0x20); - v = inb(0xa0) << 8 | inb(0x20); - outb(0x0a,0xa0); - outb(0x0a,0x20); - - spin_unlock_irqrestore(&i8259A_lock, flags); - - printk(KERN_DEBUG "... PIC ISR: %04x\n", v); - - v = inb(0x4d1) << 8 | inb(0x4d0); - printk(KERN_DEBUG "... PIC ELCR: %04x\n", v); -} - -#endif /* 0 */ - -static void __init enable_IO_APIC(void) -{ - union IO_APIC_reg_01 reg_01; - int i; - unsigned long flags; - - for (i = 0; i < PIN_MAP_SIZE; i++) { - irq_2_pin[i].pin = -1; - irq_2_pin[i].next = 0; - } - if (!pirqs_enabled) - for (i = 0; i < MAX_PIRQS; i++) - pirq_entries[i] = -1; - - /* - * The number of IO-APIC IRQ registers (== #pins): - */ - for (i = 0; i < nr_ioapics; i++) { - spin_lock_irqsave(&ioapic_lock, flags); - reg_01.raw = io_apic_read(i, 1); - spin_unlock_irqrestore(&ioapic_lock, flags); - nr_ioapic_registers[i] = reg_01.bits.entries+1; - } - - /* - * Do not trust the IO-APIC being empty at bootup - */ - clear_IO_APIC(); -} - -/* - * Not an __init, needed by the reboot code - */ -void disable_IO_APIC(void) -{ - int pin; - /* - * Clear the IO-APIC before rebooting: - */ - clear_IO_APIC(); - - /* - * If the i82559 is routed through an IOAPIC - * Put that IOAPIC in virtual wire mode - * so legacy interrups can be delivered. - */ - pin = find_isa_irq_pin(0, mp_ExtINT); - if (pin != -1) { - struct IO_APIC_route_entry entry; - unsigned long flags; - - memset(&entry, 0, sizeof(entry)); - entry.mask = 0; /* Enabled */ - entry.trigger = 0; /* Edge */ - entry.irr = 0; - entry.polarity = 0; /* High */ - entry.delivery_status = 0; - entry.dest_mode = 0; /* Physical */ - entry.delivery_mode = 7; /* ExtInt */ - entry.vector = 0; - entry.dest.physical.physical_dest = 0; - - - /* - * Add it to the IO-APIC irq-routing table: - */ - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(0, 0x11+2*pin, *(((int *)&entry)+1)); - io_apic_write(0, 0x10+2*pin, *(((int *)&entry)+0)); - spin_unlock_irqrestore(&ioapic_lock, flags); - } - - disconnect_bsp_APIC(pin != -1); -} - -/* - * function to set the IO-APIC physical IDs based on the - * values stored in the MPC table. - * - * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999 - */ - -static void __init setup_ioapic_ids_from_mpc (void) -{ - union IO_APIC_reg_00 reg_00; - int apic; - int i; - unsigned char old_id; - unsigned long flags; - - /* - * Set the IOAPIC ID to the value stored in the MPC table. - */ - for (apic = 0; apic < nr_ioapics; apic++) { - - /* Read the register 0 value */ - spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(apic, 0); - spin_unlock_irqrestore(&ioapic_lock, flags); - - old_id = mp_ioapics[apic].mpc_apicid; - - - printk(KERN_INFO "Using IO-APIC %d\n", mp_ioapics[apic].mpc_apicid); - - - /* - * We need to adjust the IRQ routing table - * if the ID changed. - */ - if (old_id != mp_ioapics[apic].mpc_apicid) - for (i = 0; i < mp_irq_entries; i++) - if (mp_irqs[i].mpc_dstapic == old_id) - mp_irqs[i].mpc_dstapic - = mp_ioapics[apic].mpc_apicid; - - /* - * Read the right value from the MPC table and - * write it into the ID register. - */ - apic_printk(APIC_VERBOSE,KERN_INFO "...changing IO-APIC physical APIC ID to %d ...", - mp_ioapics[apic].mpc_apicid); - - reg_00.bits.ID = mp_ioapics[apic].mpc_apicid; - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(apic, 0, reg_00.raw); - spin_unlock_irqrestore(&ioapic_lock, flags); - - /* - * Sanity check - */ - spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(apic, 0); - spin_unlock_irqrestore(&ioapic_lock, flags); - if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid) - printk("could not set ID!\n"); - else - apic_printk(APIC_VERBOSE," ok.\n"); - } -} - -/* - * There is a nasty bug in some older SMP boards, their mptable lies - * about the timer IRQ. We do the following to work around the situation: - * - * - timer IRQ defaults to IO-APIC IRQ - * - if this function detects that timer IRQs are defunct, then we fall - * back to ISA timer IRQs - */ -static int __init timer_irq_works(void) -{ - unsigned long t1 = jiffies; - - local_irq_enable(); - /* Let ten ticks pass... */ - mdelay((10 * 1000) / HZ); - - /* - * Expect a few ticks at least, to be sure some possible - * glue logic does not lock up after one or two first - * ticks in a non-ExtINT mode. Also the local APIC - * might have cached one ExtINT interrupt. Finally, at - * least one tick may be lost due to delays. - */ - - /* jiffies wrap? */ - if (jiffies - t1 > 4) - return 1; - return 0; -} - -/* - * In the SMP+IOAPIC case it might happen that there are an unspecified - * number of pending IRQ events unhandled. These cases are very rare, - * so we 'resend' these IRQs via IPIs, to the same CPU. It's much - * better to do it this way as thus we do not have to be aware of - * 'pending' interrupts in the IRQ path, except at this point. - */ -/* - * Edge triggered needs to resend any interrupt - * that was delayed but this is now handled in the device - * independent code. - */ - -/* - * Starting up a edge-triggered IO-APIC interrupt is - * nasty - we need to make sure that we get the edge. - * If it is already asserted for some reason, we need - * return 1 to indicate that is was pending. - * - * This is not complete - we should be able to fake - * an edge even if it isn't on the 8259A... - */ - -static unsigned int startup_edge_ioapic_irq(unsigned int irq) -{ - int was_pending = 0; - unsigned long flags; - - spin_lock_irqsave(&ioapic_lock, flags); - if (irq < 16) { - disable_8259A_irq(irq); - if (i8259A_irq_pending(irq)) - was_pending = 1; - } - __unmask_IO_APIC_irq(irq); - spin_unlock_irqrestore(&ioapic_lock, flags); - - return was_pending; -} - -/* - * Once we have recorded IRQ_PENDING already, we can mask the - * interrupt for real. This prevents IRQ storms from unhandled - * devices. - */ -static void ack_edge_ioapic_irq(unsigned int irq) -{ - if ((irq_desc[irq].status & (IRQ_PENDING | IRQ_DISABLED)) - == (IRQ_PENDING | IRQ_DISABLED)) - mask_IO_APIC_irq(irq); - ack_APIC_irq(); -} - -/* - * Level triggered interrupts can just be masked, - * and shutting down and starting up the interrupt - * is the same as enabling and disabling them -- except - * with a startup need to return a "was pending" value. - * - * Level triggered interrupts are special because we - * do not touch any IO-APIC register while handling - * them. We ack the APIC in the end-IRQ handler, not - * in the start-IRQ-handler. Protection against reentrance - * from the same interrupt is still provided, both by the - * generic IRQ layer and by the fact that an unacked local - * APIC does not accept IRQs. - */ -static unsigned int startup_level_ioapic_irq (unsigned int irq) -{ - unmask_IO_APIC_irq(irq); - - return 0; /* don't check for pending */ -} - -static void end_level_ioapic_irq (unsigned int irq) -{ - ack_APIC_irq(); -} - -static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) -{ - unsigned long flags; - unsigned int dest; - - dest = cpu_mask_to_apicid(mask); - - /* - * Only the high 8 bits are valid. - */ - dest = SET_APIC_LOGICAL_ID(dest); - - spin_lock_irqsave(&ioapic_lock, flags); - __DO_ACTION(1, = dest, ) - spin_unlock_irqrestore(&ioapic_lock, flags); -} - -#ifdef CONFIG_PCI_MSI -static unsigned int startup_edge_ioapic_vector(unsigned int vector) -{ - int irq = vector_to_irq(vector); - - return startup_edge_ioapic_irq(irq); -} - -static void ack_edge_ioapic_vector(unsigned int vector) -{ - int irq = vector_to_irq(vector); - - ack_edge_ioapic_irq(irq); -} - -static unsigned int startup_level_ioapic_vector (unsigned int vector) -{ - int irq = vector_to_irq(vector); - - return startup_level_ioapic_irq (irq); -} - -static void end_level_ioapic_vector (unsigned int vector) -{ - int irq = vector_to_irq(vector); - - end_level_ioapic_irq(irq); -} - -static void mask_IO_APIC_vector (unsigned int vector) -{ - int irq = vector_to_irq(vector); - - mask_IO_APIC_irq(irq); -} - -static void unmask_IO_APIC_vector (unsigned int vector) -{ - int irq = vector_to_irq(vector); - - unmask_IO_APIC_irq(irq); -} - -static void set_ioapic_affinity_vector (unsigned int vector, - cpumask_t cpu_mask) -{ - int irq = vector_to_irq(vector); - - set_ioapic_affinity_irq(irq, cpu_mask); -} -#endif - -/* - * Level and edge triggered IO-APIC interrupts need different handling, - * so we use two separate IRQ descriptors. Edge triggered IRQs can be - * handled with the level-triggered descriptor, but that one has slightly - * more overhead. Level-triggered interrupts cannot be handled with the - * edge-triggered handler, without risking IRQ storms and other ugly - * races. - */ - -static struct hw_interrupt_type ioapic_edge_type = { - .typename = "IO-APIC-edge", - .startup = startup_edge_ioapic, - .shutdown = shutdown_edge_ioapic, - .enable = enable_edge_ioapic, - .disable = disable_edge_ioapic, - .ack = ack_edge_ioapic, - .end = end_edge_ioapic, - .set_affinity = set_ioapic_affinity, -}; - -static struct hw_interrupt_type ioapic_level_type = { - .typename = "IO-APIC-level", - .startup = startup_level_ioapic, - .shutdown = shutdown_level_ioapic, - .enable = enable_level_ioapic, - .disable = disable_level_ioapic, - .ack = mask_and_ack_level_ioapic, - .end = end_level_ioapic, - .set_affinity = set_ioapic_affinity, -}; - -static inline void init_IO_APIC_traps(void) -{ - int irq; - - /* - * NOTE! The local APIC isn't very good at handling - * multiple interrupts at the same interrupt level. - * As the interrupt level is determined by taking the - * vector number and shifting that right by 4, we - * want to spread these out a bit so that they don't - * all fall in the same interrupt level. - * - * Also, we've got to be careful not to trash gate - * 0x80, because int 0x80 is hm, kind of importantish. ;) - */ - for (irq = 0; irq < NR_IRQS ; irq++) { - int tmp = irq; - if (use_pci_vector()) { - if (!platform_legacy_irq(tmp)) - if ((tmp = vector_to_irq(tmp)) == -1) - continue; - } - if (IO_APIC_IRQ(tmp) && !IO_APIC_VECTOR(tmp)) { - /* - * Hmm.. We don't have an entry for this, - * so default to an old-fashioned 8259 - * interrupt if we can.. - */ - if (irq < 16) - make_8259A_irq(irq); - else - /* Strange. Oh, well.. */ - irq_desc[irq].handler = &no_irq_type; - } - } -} - -static void enable_lapic_irq (unsigned int irq) -{ - unsigned long v; - - v = apic_read(APIC_LVT0); - apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED); -} - -static void disable_lapic_irq (unsigned int irq) -{ - unsigned long v; - - v = apic_read(APIC_LVT0); - apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED); -} - -static void ack_lapic_irq (unsigned int irq) -{ - ack_APIC_irq(); -} - -static void end_lapic_irq (unsigned int i) { /* nothing */ } - -static struct hw_interrupt_type lapic_irq_type = { - .typename = "local-APIC-edge", - .startup = NULL, /* startup_irq() not used for IRQ0 */ - .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */ - .enable = enable_lapic_irq, - .disable = disable_lapic_irq, - .ack = ack_lapic_irq, - .end = end_lapic_irq, -}; - -static void setup_nmi (void) -{ - /* - * Dirty trick to enable the NMI watchdog ... - * We put the 8259A master into AEOI mode and - * unmask on all local APICs LVT0 as NMI. - * - * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire') - * is from Maciej W. Rozycki - so we do not have to EOI from - * the NMI handler or the timer interrupt. - */ - printk(KERN_INFO "activating NMI Watchdog ..."); - - enable_NMI_through_LVT0(NULL); - - printk(" done.\n"); -} - -/* - * This looks a bit hackish but it's about the only one way of sending - * a few INTA cycles to 8259As and any associated glue logic. ICR does - * not support the ExtINT mode, unfortunately. We need to send these - * cycles as some i82489DX-based boards have glue logic that keeps the - * 8259A interrupt line asserted until INTA. --macro - */ -static inline void unlock_ExtINT_logic(void) -{ - int pin, i; - struct IO_APIC_route_entry entry0, entry1; - unsigned char save_control, save_freq_select; - unsigned long flags; - - pin = find_isa_irq_pin(8, mp_INT); - if (pin == -1) - return; - - spin_lock_irqsave(&ioapic_lock, flags); - *(((int *)&entry0) + 1) = io_apic_read(0, 0x11 + 2 * pin); - *(((int *)&entry0) + 0) = io_apic_read(0, 0x10 + 2 * pin); - spin_unlock_irqrestore(&ioapic_lock, flags); - clear_IO_APIC_pin(0, pin); - - memset(&entry1, 0, sizeof(entry1)); - - entry1.dest_mode = 0; /* physical delivery */ - entry1.mask = 0; /* unmask IRQ now */ - entry1.dest.physical.physical_dest = hard_smp_processor_id(); - entry1.delivery_mode = dest_ExtINT; - entry1.polarity = entry0.polarity; - entry1.trigger = 0; - entry1.vector = 0; - - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(0, 0x11 + 2 * pin, *(((int *)&entry1) + 1)); - io_apic_write(0, 0x10 + 2 * pin, *(((int *)&entry1) + 0)); - spin_unlock_irqrestore(&ioapic_lock, flags); - - save_control = CMOS_READ(RTC_CONTROL); - save_freq_select = CMOS_READ(RTC_FREQ_SELECT); - CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6, - RTC_FREQ_SELECT); - CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL); - - i = 100; - while (i-- > 0) { - mdelay(10); - if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF) - i -= 10; - } - - CMOS_WRITE(save_control, RTC_CONTROL); - CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); - clear_IO_APIC_pin(0, pin); - - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(0, 0x11 + 2 * pin, *(((int *)&entry0) + 1)); - io_apic_write(0, 0x10 + 2 * pin, *(((int *)&entry0) + 0)); - spin_unlock_irqrestore(&ioapic_lock, flags); -} - -/* - * This code may look a bit paranoid, but it's supposed to cooperate with - * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ - * is so screwy. Thanks to Brian Perkins for testing/hacking this beast - * fanatically on his truly buggy board. - */ -static inline void check_timer(void) -{ - int pin1, pin2; - int vector; - - /* - * get/set the timer IRQ vector: - */ - disable_8259A_irq(0); - vector = assign_irq_vector(0); - set_intr_gate(vector, interrupt[0]); - - /* - * Subtle, code in do_timer_interrupt() expects an AEOI - * mode for the 8259A whenever interrupts are routed - * through I/O APICs. Also IRQ0 has to be enabled in - * the 8259A which implies the virtual wire has to be - * disabled in the local APIC. - */ - apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); - init_8259A(1); - enable_8259A_irq(0); - - pin1 = find_isa_irq_pin(0, mp_INT); - pin2 = find_isa_irq_pin(0, mp_ExtINT); - - apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X pin1=%d pin2=%d\n", vector, pin1, pin2); - - if (pin1 != -1) { - /* - * Ok, does IRQ0 through the IOAPIC work? - */ - unmask_IO_APIC_irq(0); - if (!no_timer_check && timer_irq_works()) { - nmi_watchdog_default(); - if (nmi_watchdog == NMI_IO_APIC) { - disable_8259A_irq(0); - setup_nmi(); - enable_8259A_irq(0); - } - return; - } - clear_IO_APIC_pin(0, pin1); - apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: 8254 timer not connected to IO-APIC\n"); - } - - apic_printk(APIC_VERBOSE,KERN_INFO "...trying to set up timer (IRQ0) through the 8259A ... "); - if (pin2 != -1) { - apic_printk(APIC_VERBOSE,"\n..... (found pin %d) ...", pin2); - /* - * legacy devices should be connected to IO APIC #0 - */ - setup_ExtINT_IRQ0_pin(pin2, vector); - if (timer_irq_works()) { - printk("works.\n"); - nmi_watchdog_default(); - if (nmi_watchdog == NMI_IO_APIC) { - setup_nmi(); - } - return; - } - /* - * Cleanup, just in case ... - */ - clear_IO_APIC_pin(0, pin2); - } - printk(" failed.\n"); - - if (nmi_watchdog) { - printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n"); - nmi_watchdog = 0; - } - - apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ..."); - - disable_8259A_irq(0); - irq_desc[0].handler = &lapic_irq_type; - apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */ - enable_8259A_irq(0); - - if (timer_irq_works()) { - apic_printk(APIC_QUIET, " works.\n"); - return; - } - apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector); - apic_printk(APIC_VERBOSE," failed.\n"); - - apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as ExtINT IRQ..."); - - init_8259A(0); - make_8259A_irq(0); - apic_write_around(APIC_LVT0, APIC_DM_EXTINT); - - unlock_ExtINT_logic(); - - if (timer_irq_works()) { - apic_printk(APIC_VERBOSE," works.\n"); - return; - } - apic_printk(APIC_VERBOSE," failed :(.\n"); - panic("IO-APIC + timer doesn't work! Try using the 'noapic' kernel parameter\n"); -} - -static int __init notimercheck(char *s) -{ - no_timer_check = 1; - return 1; -} -__setup("no_timer_check", notimercheck); - -/* - * - * IRQ's that are handled by the PIC in the MPS IOAPIC case. - * - IRQ2 is the cascade IRQ, and cannot be a io-apic IRQ. - * Linux doesn't really care, as it's not actually used - * for any interrupt handling anyway. - */ -#define PIC_IRQS (1<<2) - -void __init setup_IO_APIC(void) -{ - enable_IO_APIC(); - - if (acpi_ioapic) - io_apic_irqs = ~0; /* all IRQs go through IOAPIC */ - else - io_apic_irqs = ~PIC_IRQS; - - apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); - - /* - * Set up the IO-APIC IRQ routing table. - */ - if (!acpi_ioapic) - setup_ioapic_ids_from_mpc(); - sync_Arb_IDs(); - setup_IO_APIC_irqs(); - init_IO_APIC_traps(); - check_timer(); - if (!acpi_ioapic) - print_IO_APIC(); -} - -struct sysfs_ioapic_data { - struct sys_device dev; - struct IO_APIC_route_entry entry[0]; -}; -static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS]; - -static int ioapic_suspend(struct sys_device *dev, pm_message_t state) -{ - struct IO_APIC_route_entry *entry; - struct sysfs_ioapic_data *data; - unsigned long flags; - int i; - - data = container_of(dev, struct sysfs_ioapic_data, dev); - entry = data->entry; - spin_lock_irqsave(&ioapic_lock, flags); - for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) { - *(((int *)entry) + 1) = io_apic_read(dev->id, 0x11 + 2 * i); - *(((int *)entry) + 0) = io_apic_read(dev->id, 0x10 + 2 * i); - } - spin_unlock_irqrestore(&ioapic_lock, flags); - - return 0; -} - -static int ioapic_resume(struct sys_device *dev) -{ - struct IO_APIC_route_entry *entry; - struct sysfs_ioapic_data *data; - unsigned long flags; - union IO_APIC_reg_00 reg_00; - int i; - - data = container_of(dev, struct sysfs_ioapic_data, dev); - entry = data->entry; - - spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(dev->id, 0); - if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) { - reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid; - io_apic_write(dev->id, 0, reg_00.raw); - } - for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) { - io_apic_write(dev->id, 0x11+2*i, *(((int *)entry)+1)); - io_apic_write(dev->id, 0x10+2*i, *(((int *)entry)+0)); - } - spin_unlock_irqrestore(&ioapic_lock, flags); - - return 0; -} - -static struct sysdev_class ioapic_sysdev_class = { - set_kset_name("ioapic"), - .suspend = ioapic_suspend, - .resume = ioapic_resume, -}; - -static int __init ioapic_init_sysfs(void) -{ - struct sys_device * dev; - int i, size, error = 0; - - error = sysdev_class_register(&ioapic_sysdev_class); - if (error) - return error; - - for (i = 0; i < nr_ioapics; i++ ) { - size = sizeof(struct sys_device) + nr_ioapic_registers[i] - * sizeof(struct IO_APIC_route_entry); - mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL); - if (!mp_ioapic_data[i]) { - printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); - continue; - } - memset(mp_ioapic_data[i], 0, size); - dev = &mp_ioapic_data[i]->dev; - dev->id = i; - dev->cls = &ioapic_sysdev_class; - error = sysdev_register(dev); - if (error) { - kfree(mp_ioapic_data[i]); - mp_ioapic_data[i] = NULL; - printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); - continue; - } - } - - return 0; -} - -device_initcall(ioapic_init_sysfs); - -/* -------------------------------------------------------------------------- - ACPI-based IOAPIC Configuration - -------------------------------------------------------------------------- */ - -#ifdef CONFIG_ACPI_BOOT - -#define IO_APIC_MAX_ID 0xFE - -int __init io_apic_get_version (int ioapic) -{ - union IO_APIC_reg_01 reg_01; - unsigned long flags; - - spin_lock_irqsave(&ioapic_lock, flags); - reg_01.raw = io_apic_read(ioapic, 1); - spin_unlock_irqrestore(&ioapic_lock, flags); - - return reg_01.bits.version; -} - - -int __init io_apic_get_redir_entries (int ioapic) -{ - union IO_APIC_reg_01 reg_01; - unsigned long flags; - - spin_lock_irqsave(&ioapic_lock, flags); - reg_01.raw = io_apic_read(ioapic, 1); - spin_unlock_irqrestore(&ioapic_lock, flags); - - return reg_01.bits.entries; -} - - -int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low) -{ - struct IO_APIC_route_entry entry; - unsigned long flags; - - if (!IO_APIC_IRQ(irq)) { - apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", - ioapic); - return -EINVAL; - } - - /* - * Generate a PCI IRQ routing entry and program the IOAPIC accordingly. - * Note that we mask (disable) IRQs now -- these get enabled when the - * corresponding device driver registers for this IRQ. - */ - - memset(&entry,0,sizeof(entry)); - - entry.delivery_mode = INT_DELIVERY_MODE; - entry.dest_mode = INT_DEST_MODE; - entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); - entry.trigger = edge_level; - entry.polarity = active_high_low; - entry.mask = 1; /* Disabled (masked) */ - - /* - * IRQs < 16 are already in the irq_2_pin[] map - */ - if (irq >= 16) - add_pin_to_irq(irq, ioapic, pin); - - entry.vector = assign_irq_vector(irq); - - apic_printk(APIC_VERBOSE,KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry (%d-%d -> 0x%x -> " - "IRQ %d Mode:%i Active:%i)\n", ioapic, - mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq, - edge_level, active_high_low); - - ioapic_register_intr(irq, entry.vector, edge_level); - - if (!ioapic && (irq < 16)) - disable_8259A_irq(irq); - - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1)); - io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0)); - spin_unlock_irqrestore(&ioapic_lock, flags); - - return 0; -} - -#endif /*CONFIG_ACPI_BOOT*/ - - -/* - * This function currently is only a helper for the i386 smp boot process where - * we need to reprogram the ioredtbls to cater for the cpus which have come online - * so mask in all cases should simply be TARGET_CPUS - */ -void __init setup_ioapic_dest(void) -{ - int pin, ioapic, irq, irq_entry; - - if (skip_ioapic_setup == 1) - return; - - for (ioapic = 0; ioapic < nr_ioapics; ioapic++) { - for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { - irq_entry = find_irq_entry(ioapic, pin, mp_INT); - if (irq_entry == -1) - continue; - irq = pin_2_irq(irq_entry, ioapic, pin); - set_ioapic_affinity_irq(irq, TARGET_CPUS); - } - - } -} diff --git a/arch/x86_64/kernel/ioport.c b/arch/x86_64/kernel/ioport.c deleted file mode 100644 index cde0e868efe..00000000000 --- a/arch/x86_64/kernel/ioport.c +++ /dev/null @@ -1,117 +0,0 @@ -/* - * linux/arch/x86_64/kernel/ioport.c - * - * This contains the io-permission bitmap code - written by obz, with changes - * by Linus. - */ - -#include <linux/sched.h> -#include <linux/kernel.h> -#include <linux/errno.h> -#include <linux/types.h> -#include <linux/ioport.h> -#include <linux/smp.h> -#include <linux/smp_lock.h> -#include <linux/stddef.h> -#include <linux/slab.h> -#include <linux/thread_info.h> - -/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */ -static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value) -{ - int i; - if (new_value) - for (i = base; i < base + extent; i++) - __set_bit(i, bitmap); - else - for (i = base; i < base + extent; i++) - clear_bit(i, bitmap); -} - -/* - * this changes the io permissions bitmap in the current task. - */ -asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) -{ - unsigned int i, max_long, bytes, bytes_updated; - struct thread_struct * t = ¤t->thread; - struct tss_struct * tss; - unsigned long *bitmap; - - if ((from + num <= from) || (from + num > IO_BITMAP_BITS)) - return -EINVAL; - if (turn_on && !capable(CAP_SYS_RAWIO)) - return -EPERM; - - /* - * If it's the first ioperm() call in this thread's lifetime, set the - * IO bitmap up. ioperm() is much less timing critical than clone(), - * this is why we delay this operation until now: - */ - if (!t->io_bitmap_ptr) { - bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); - if (!bitmap) - return -ENOMEM; - - memset(bitmap, 0xff, IO_BITMAP_BYTES); - t->io_bitmap_ptr = bitmap; - } - - /* - * do it in the per-thread copy and in the TSS ... - * - * Disable preemption via get_cpu() - we must not switch away - * because the ->io_bitmap_max value must match the bitmap - * contents: - */ - tss = &per_cpu(init_tss, get_cpu()); - - set_bitmap(t->io_bitmap_ptr, from, num, !turn_on); - - /* - * Search for a (possibly new) maximum. This is simple and stupid, - * to keep it obviously correct: - */ - max_long = 0; - for (i = 0; i < IO_BITMAP_LONGS; i++) - if (t->io_bitmap_ptr[i] != ~0UL) - max_long = i; - - bytes = (max_long + 1) * sizeof(long); - bytes_updated = max(bytes, t->io_bitmap_max); - - t->io_bitmap_max = bytes; - - /* Update the TSS: */ - memcpy(tss->io_bitmap, t->io_bitmap_ptr, bytes_updated); - - put_cpu(); - - return 0; -} - -/* - * sys_iopl has to be used when you want to access the IO ports - * beyond the 0x3ff range: to get the full 65536 ports bitmapped - * you'd need 8kB of bitmaps/process, which is a bit excessive. - * - * Here we just change the eflags value on the stack: we allow - * only the super-user to do it. This depends on the stack-layout - * on system-call entry - see also fork() and the signal handling - * code. - */ - -asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs) -{ - unsigned int old = (regs->eflags >> 12) & 3; - - if (level > 3) - return -EINVAL; - /* Trying to gain more privileges? */ - if (level > old) { - if (!capable(CAP_SYS_RAWIO)) - return -EPERM; - } - regs->eflags = (regs->eflags &~ 0x3000UL) | (level << 12); - return 0; -} diff --git a/arch/x86_64/kernel/irq.c b/arch/x86_64/kernel/irq.c deleted file mode 100644 index 849a20aec7c..00000000000 --- a/arch/x86_64/kernel/irq.c +++ /dev/null @@ -1,156 +0,0 @@ -/* - * linux/arch/x86_64/kernel/irq.c - * - * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar - * - * This file contains the lowest level x86_64-specific interrupt - * entry and irq statistics code. All the remaining irq logic is - * done by the generic kernel/irq/ code and in the - * x86_64-specific irq controller code. (e.g. i8259.c and - * io_apic.c.) - */ - -#include <linux/kernel_stat.h> -#include <linux/interrupt.h> -#include <linux/seq_file.h> -#include <linux/module.h> -#include <linux/delay.h> -#include <asm/uaccess.h> -#include <asm/io_apic.h> - -atomic_t irq_err_count; -#ifdef CONFIG_X86_IO_APIC -#ifdef APIC_MISMATCH_DEBUG -atomic_t irq_mis_count; -#endif -#endif - -/* - * Generic, controller-independent functions: - */ - -int show_interrupts(struct seq_file *p, void *v) -{ - int i = *(loff_t *) v, j; - struct irqaction * action; - unsigned long flags; - - if (i == 0) { - seq_printf(p, " "); - for (j=0; j<NR_CPUS; j++) - if (cpu_online(j)) - seq_printf(p, "CPU%d ",j); - seq_putc(p, '\n'); - } - - if (i < NR_IRQS) { - spin_lock_irqsave(&irq_desc[i].lock, flags); - action = irq_desc[i].action; - if (!action) - goto skip; - seq_printf(p, "%3d: ",i); -#ifndef CONFIG_SMP - seq_printf(p, "%10u ", kstat_irqs(i)); -#else - for (j=0; j<NR_CPUS; j++) - if (cpu_online(j)) - seq_printf(p, "%10u ", - kstat_cpu(j).irqs[i]); -#endif - seq_printf(p, " %14s", irq_desc[i].handler->typename); - - seq_printf(p, " %s", action->name); - for (action=action->next; action; action = action->next) - seq_printf(p, ", %s", action->name); - seq_putc(p, '\n'); -skip: - spin_unlock_irqrestore(&irq_desc[i].lock, flags); - } else if (i == NR_IRQS) { - seq_printf(p, "NMI: "); - for (j = 0; j < NR_CPUS; j++) - if (cpu_online(j)) - seq_printf(p, "%10u ", cpu_pda[j].__nmi_count); - seq_putc(p, '\n'); -#ifdef CONFIG_X86_LOCAL_APIC - seq_printf(p, "LOC: "); - for (j = 0; j < NR_CPUS; j++) - if (cpu_online(j)) - seq_printf(p, "%10u ", cpu_pda[j].apic_timer_irqs); - seq_putc(p, '\n'); -#endif - seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); -#ifdef CONFIG_X86_IO_APIC -#ifdef APIC_MISMATCH_DEBUG - seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count)); -#endif -#endif - } - return 0; -} - -/* - * do_IRQ handles all normal device IRQ's (the special - * SMP cross-CPU interrupts have their own specific - * handlers). - */ -asmlinkage unsigned int do_IRQ(struct pt_regs *regs) -{ - /* high bits used in ret_from_ code */ - unsigned irq = regs->orig_rax & 0xff; - - irq_enter(); - BUG_ON(irq > 256); - - __do_IRQ(irq, regs); - irq_exit(); - - return 1; -} - -#ifdef CONFIG_HOTPLUG_CPU -void fixup_irqs(cpumask_t map) -{ - unsigned int irq; - static int warned; - - for (irq = 0; irq < NR_IRQS; irq++) { - cpumask_t mask; - if (irq == 2) - continue; - - cpus_and(mask, irq_affinity[irq], map); - if (any_online_cpu(mask) == NR_CPUS) { - printk("Breaking affinity for irq %i\n", irq); - mask = map; - } - if (irq_desc[irq].handler->set_affinity) - irq_desc[irq].handler->set_affinity(irq, mask); - else if (irq_desc[irq].action && !(warned++)) - printk("Cannot set affinity for irq %i\n", irq); - } - - /* That doesn't seem sufficient. Give it 1ms. */ - local_irq_enable(); - mdelay(1); - local_irq_disable(); -} -#endif - -extern void call_softirq(void); - -asmlinkage void do_softirq(void) -{ - __u32 pending; - unsigned long flags; - - if (in_interrupt()) - return; - - local_irq_save(flags); - pending = local_softirq_pending(); - /* Switch to interrupt stack */ - if (pending) - call_softirq(); - local_irq_restore(flags); -} -EXPORT_SYMBOL(do_softirq); diff --git a/arch/x86_64/kernel/kprobes.c b/arch/x86_64/kernel/kprobes.c deleted file mode 100644 index 5c6dc705148..00000000000 --- a/arch/x86_64/kernel/kprobes.c +++ /dev/null @@ -1,688 +0,0 @@ -/* - * Kernel Probes (KProbes) - * arch/x86_64/kernel/kprobes.c - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * Copyright (C) IBM Corporation, 2002, 2004 - * - * 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel - * Probes initial implementation ( includes contributions from - * Rusty Russell). - * 2004-July Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes - * interface to access function arguments. - * 2004-Oct Jim Keniston <kenistoj@us.ibm.com> and Prasanna S Panchamukhi - * <prasanna@in.ibm.com> adapted for x86_64 - * 2005-Mar Roland McGrath <roland@redhat.com> - * Fixed to handle %rip-relative addressing mode correctly. - * 2005-May Rusty Lynch <rusty.lynch@intel.com> - * Added function return probes functionality - */ - -#include <linux/config.h> -#include <linux/kprobes.h> -#include <linux/ptrace.h> -#include <linux/spinlock.h> -#include <linux/string.h> -#include <linux/slab.h> -#include <linux/preempt.h> - -#include <asm/cacheflush.h> -#include <asm/pgtable.h> -#include <asm/kdebug.h> - -static DECLARE_MUTEX(kprobe_mutex); - -static struct kprobe *current_kprobe; -static unsigned long kprobe_status, kprobe_old_rflags, kprobe_saved_rflags; -static struct kprobe *kprobe_prev; -static unsigned long kprobe_status_prev, kprobe_old_rflags_prev, kprobe_saved_rflags_prev; -static struct pt_regs jprobe_saved_regs; -static long *jprobe_saved_rsp; -void jprobe_return_end(void); - -/* copy of the kernel stack at the probe fire time */ -static kprobe_opcode_t jprobes_stack[MAX_STACK_SIZE]; - -/* - * returns non-zero if opcode modifies the interrupt flag. - */ -static inline int is_IF_modifier(kprobe_opcode_t *insn) -{ - switch (*insn) { - case 0xfa: /* cli */ - case 0xfb: /* sti */ - case 0xcf: /* iret/iretd */ - case 0x9d: /* popf/popfd */ - return 1; - } - - if (*insn >= 0x40 && *insn <= 0x4f && *++insn == 0xcf) - return 1; - return 0; -} - -int arch_prepare_kprobe(struct kprobe *p) -{ - /* insn: must be on special executable page on x86_64. */ - up(&kprobe_mutex); - p->ainsn.insn = get_insn_slot(); - down(&kprobe_mutex); - if (!p->ainsn.insn) { - return -ENOMEM; - } - return 0; -} - -/* - * Determine if the instruction uses the %rip-relative addressing mode. - * If it does, return the address of the 32-bit displacement word. - * If not, return null. - */ -static inline s32 *is_riprel(u8 *insn) -{ -#define W(row,b0,b1,b2,b3,b4,b5,b6,b7,b8,b9,ba,bb,bc,bd,be,bf) \ - (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \ - (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \ - (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \ - (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \ - << (row % 64)) - static const u64 onebyte_has_modrm[256 / 64] = { - /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ - /* ------------------------------- */ - W(0x00, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 00 */ - W(0x10, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 10 */ - W(0x20, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 20 */ - W(0x30, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0), /* 30 */ - W(0x40, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 40 */ - W(0x50, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 50 */ - W(0x60, 0,0,1,1,0,0,0,0,0,1,0,1,0,0,0,0)| /* 60 */ - W(0x70, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 70 */ - W(0x80, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 80 */ - W(0x90, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 90 */ - W(0xa0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* a0 */ - W(0xb0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* b0 */ - W(0xc0, 1,1,0,0,1,1,1,1,0,0,0,0,0,0,0,0)| /* c0 */ - W(0xd0, 1,1,1,1,0,0,0,0,1,1,1,1,1,1,1,1)| /* d0 */ - W(0xe0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* e0 */ - W(0xf0, 0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,1) /* f0 */ - /* ------------------------------- */ - /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ - }; - static const u64 twobyte_has_modrm[256 / 64] = { - /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ - /* ------------------------------- */ - W(0x00, 1,1,1,1,0,0,0,0,0,0,0,0,0,1,0,1)| /* 0f */ - W(0x10, 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0)| /* 1f */ - W(0x20, 1,1,1,1,1,0,1,0,1,1,1,1,1,1,1,1)| /* 2f */ - W(0x30, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 3f */ - W(0x40, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 4f */ - W(0x50, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 5f */ - W(0x60, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 6f */ - W(0x70, 1,1,1,1,1,1,1,0,0,0,0,0,1,1,1,1), /* 7f */ - W(0x80, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 8f */ - W(0x90, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 9f */ - W(0xa0, 0,0,0,1,1,1,1,1,0,0,0,1,1,1,1,1)| /* af */ - W(0xb0, 1,1,1,1,1,1,1,1,0,0,1,1,1,1,1,1), /* bf */ - W(0xc0, 1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0)| /* cf */ - W(0xd0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* df */ - W(0xe0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* ef */ - W(0xf0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0) /* ff */ - /* ------------------------------- */ - /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ - }; -#undef W - int need_modrm; - - /* Skip legacy instruction prefixes. */ - while (1) { - switch (*insn) { - case 0x66: - case 0x67: - case 0x2e: - case 0x3e: - case 0x26: - case 0x64: - case 0x65: - case 0x36: - case 0xf0: - case 0xf3: - case 0xf2: - ++insn; - continue; - } - break; - } - - /* Skip REX instruction prefix. */ - if ((*insn & 0xf0) == 0x40) - ++insn; - - if (*insn == 0x0f) { /* Two-byte opcode. */ - ++insn; - need_modrm = test_bit(*insn, twobyte_has_modrm); - } else { /* One-byte opcode. */ - need_modrm = test_bit(*insn, onebyte_has_modrm); - } - - if (need_modrm) { - u8 modrm = *++insn; - if ((modrm & 0xc7) == 0x05) { /* %rip+disp32 addressing mode */ - /* Displacement follows ModRM byte. */ - return (s32 *) ++insn; - } - } - - /* No %rip-relative addressing mode here. */ - return NULL; -} - -void arch_copy_kprobe(struct kprobe *p) -{ - s32 *ripdisp; - memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE); - ripdisp = is_riprel(p->ainsn.insn); - if (ripdisp) { - /* - * The copied instruction uses the %rip-relative - * addressing mode. Adjust the displacement for the - * difference between the original location of this - * instruction and the location of the copy that will - * actually be run. The tricky bit here is making sure - * that the sign extension happens correctly in this - * calculation, since we need a signed 32-bit result to - * be sign-extended to 64 bits when it's added to the - * %rip value and yield the same 64-bit result that the - * sign-extension of the original signed 32-bit - * displacement would have given. - */ - s64 disp = (u8 *) p->addr + *ripdisp - (u8 *) p->ainsn.insn; - BUG_ON((s64) (s32) disp != disp); /* Sanity check. */ - *ripdisp = disp; - } - p->opcode = *p->addr; -} - -void arch_arm_kprobe(struct kprobe *p) -{ - *p->addr = BREAKPOINT_INSTRUCTION; - flush_icache_range((unsigned long) p->addr, - (unsigned long) p->addr + sizeof(kprobe_opcode_t)); -} - -void arch_disarm_kprobe(struct kprobe *p) -{ - *p->addr = p->opcode; - flush_icache_range((unsigned long) p->addr, - (unsigned long) p->addr + sizeof(kprobe_opcode_t)); -} - -void arch_remove_kprobe(struct kprobe *p) -{ - up(&kprobe_mutex); - free_insn_slot(p->ainsn.insn); - down(&kprobe_mutex); -} - -static inline void save_previous_kprobe(void) -{ - kprobe_prev = current_kprobe; - kprobe_status_prev = kprobe_status; - kprobe_old_rflags_prev = kprobe_old_rflags; - kprobe_saved_rflags_prev = kprobe_saved_rflags; -} - -static inline void restore_previous_kprobe(void) -{ - current_kprobe = kprobe_prev; - kprobe_status = kprobe_status_prev; - kprobe_old_rflags = kprobe_old_rflags_prev; - kprobe_saved_rflags = kprobe_saved_rflags_prev; -} - -static inline void set_current_kprobe(struct kprobe *p, struct pt_regs *regs) -{ - current_kprobe = p; - kprobe_saved_rflags = kprobe_old_rflags - = (regs->eflags & (TF_MASK | IF_MASK)); - if (is_IF_modifier(p->ainsn.insn)) - kprobe_saved_rflags &= ~IF_MASK; -} - -static void prepare_singlestep(struct kprobe *p, struct pt_regs *regs) -{ - regs->eflags |= TF_MASK; - regs->eflags &= ~IF_MASK; - /*single step inline if the instruction is an int3*/ - if (p->opcode == BREAKPOINT_INSTRUCTION) - regs->rip = (unsigned long)p->addr; - else - regs->rip = (unsigned long)p->ainsn.insn; -} - -void arch_prepare_kretprobe(struct kretprobe *rp, struct pt_regs *regs) -{ - unsigned long *sara = (unsigned long *)regs->rsp; - struct kretprobe_instance *ri; - - if ((ri = get_free_rp_inst(rp)) != NULL) { - ri->rp = rp; - ri->task = current; - ri->ret_addr = (kprobe_opcode_t *) *sara; - - /* Replace the return addr with trampoline addr */ - *sara = (unsigned long) &kretprobe_trampoline; - - add_rp_inst(ri); - } else { - rp->nmissed++; - } -} - -/* - * Interrupts are disabled on entry as trap3 is an interrupt gate and they - * remain disabled thorough out this function. - */ -int kprobe_handler(struct pt_regs *regs) -{ - struct kprobe *p; - int ret = 0; - kprobe_opcode_t *addr = (kprobe_opcode_t *)(regs->rip - sizeof(kprobe_opcode_t)); - - /* We're in an interrupt, but this is clear and BUG()-safe. */ - preempt_disable(); - - /* Check we're not actually recursing */ - if (kprobe_running()) { - /* We *are* holding lock here, so this is safe. - Disarm the probe we just hit, and ignore it. */ - p = get_kprobe(addr); - if (p) { - if (kprobe_status == KPROBE_HIT_SS) { - regs->eflags &= ~TF_MASK; - regs->eflags |= kprobe_saved_rflags; - unlock_kprobes(); - goto no_kprobe; - } else if (kprobe_status == KPROBE_HIT_SSDONE) { - /* TODO: Provide re-entrancy from - * post_kprobes_handler() and avoid exception - * stack corruption while single-stepping on - * the instruction of the new probe. - */ - arch_disarm_kprobe(p); - regs->rip = (unsigned long)p->addr; - ret = 1; - } else { - /* We have reentered the kprobe_handler(), since - * another probe was hit while within the - * handler. We here save the original kprobe - * variables and just single step on instruction - * of the new probe without calling any user - * handlers. - */ - save_previous_kprobe(); - set_current_kprobe(p, regs); - p->nmissed++; - prepare_singlestep(p, regs); - kprobe_status = KPROBE_REENTER; - return 1; - } - } else { - p = current_kprobe; - if (p->break_handler && p->break_handler(p, regs)) { - goto ss_probe; - } - } - /* If it's not ours, can't be delete race, (we hold lock). */ - goto no_kprobe; - } - - lock_kprobes(); - p = get_kprobe(addr); - if (!p) { - unlock_kprobes(); - if (*addr != BREAKPOINT_INSTRUCTION) { - /* - * The breakpoint instruction was removed right - * after we hit it. Another cpu has removed - * either a probepoint or a debugger breakpoint - * at this address. In either case, no further - * handling of this interrupt is appropriate. - */ - ret = 1; - } - /* Not one of ours: let kernel handle it */ - goto no_kprobe; - } - - kprobe_status = KPROBE_HIT_ACTIVE; - set_current_kprobe(p, regs); - - if (p->pre_handler && p->pre_handler(p, regs)) - /* handler has already set things up, so skip ss setup */ - return 1; - -ss_probe: - prepare_singlestep(p, regs); - kprobe_status = KPROBE_HIT_SS; - return 1; - -no_kprobe: - preempt_enable_no_resched(); - return ret; -} - -/* - * For function-return probes, init_kprobes() establishes a probepoint - * here. When a retprobed function returns, this probe is hit and - * trampoline_probe_handler() runs, calling the kretprobe's handler. - */ - void kretprobe_trampoline_holder(void) - { - asm volatile ( ".global kretprobe_trampoline\n" - "kretprobe_trampoline: \n" - "nop\n"); - } - -/* - * Called when we hit the probe point at kretprobe_trampoline - */ -int trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) -{ - struct kretprobe_instance *ri = NULL; - struct hlist_head *head; - struct hlist_node *node, *tmp; - unsigned long orig_ret_address = 0; - unsigned long trampoline_address =(unsigned long)&kretprobe_trampoline; - - head = kretprobe_inst_table_head(current); - - /* - * It is possible to have multiple instances associated with a given - * task either because an multiple functions in the call path - * have a return probe installed on them, and/or more then one return - * return probe was registered for a target function. - * - * We can handle this because: - * - instances are always inserted at the head of the list - * - when multiple return probes are registered for the same - * function, the first instance's ret_addr will point to the - * real return address, and all the rest will point to - * kretprobe_trampoline - */ - hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { - if (ri->task != current) - /* another task is sharing our hash bucket */ - continue; - - if (ri->rp && ri->rp->handler) - ri->rp->handler(ri, regs); - - orig_ret_address = (unsigned long)ri->ret_addr; - recycle_rp_inst(ri); - - if (orig_ret_address != trampoline_address) - /* - * This is the real return address. Any other - * instances associated with this task are for - * other calls deeper on the call stack - */ - break; - } - - BUG_ON(!orig_ret_address || (orig_ret_address == trampoline_address)); - regs->rip = orig_ret_address; - - unlock_kprobes(); - preempt_enable_no_resched(); - - /* - * By returning a non-zero value, we are telling - * kprobe_handler() that we have handled unlocking - * and re-enabling preemption. - */ - return 1; -} - -/* - * Called after single-stepping. p->addr is the address of the - * instruction whose first byte has been replaced by the "int 3" - * instruction. To avoid the SMP problems that can occur when we - * temporarily put back the original opcode to single-step, we - * single-stepped a copy of the instruction. The address of this - * copy is p->ainsn.insn. - * - * This function prepares to return from the post-single-step - * interrupt. We have to fix up the stack as follows: - * - * 0) Except in the case of absolute or indirect jump or call instructions, - * the new rip is relative to the copied instruction. We need to make - * it relative to the original instruction. - * - * 1) If the single-stepped instruction was pushfl, then the TF and IF - * flags are set in the just-pushed eflags, and may need to be cleared. - * - * 2) If the single-stepped instruction was a call, the return address - * that is atop the stack is the address following the copied instruction. - * We need to make it the address following the original instruction. - */ -static void resume_execution(struct kprobe *p, struct pt_regs *regs) -{ - unsigned long *tos = (unsigned long *)regs->rsp; - unsigned long next_rip = 0; - unsigned long copy_rip = (unsigned long)p->ainsn.insn; - unsigned long orig_rip = (unsigned long)p->addr; - kprobe_opcode_t *insn = p->ainsn.insn; - - /*skip the REX prefix*/ - if (*insn >= 0x40 && *insn <= 0x4f) - insn++; - - switch (*insn) { - case 0x9c: /* pushfl */ - *tos &= ~(TF_MASK | IF_MASK); - *tos |= kprobe_old_rflags; - break; - case 0xc3: /* ret/lret */ - case 0xcb: - case 0xc2: - case 0xca: - regs->eflags &= ~TF_MASK; - /* rip is already adjusted, no more changes required*/ - return; - case 0xe8: /* call relative - Fix return addr */ - *tos = orig_rip + (*tos - copy_rip); - break; - case 0xff: - if ((*insn & 0x30) == 0x10) { - /* call absolute, indirect */ - /* Fix return addr; rip is correct. */ - next_rip = regs->rip; - *tos = orig_rip + (*tos - copy_rip); - } else if (((*insn & 0x31) == 0x20) || /* jmp near, absolute indirect */ - ((*insn & 0x31) == 0x21)) { /* jmp far, absolute indirect */ - /* rip is correct. */ - next_rip = regs->rip; - } - break; - case 0xea: /* jmp absolute -- rip is correct */ - next_rip = regs->rip; - break; - default: - break; - } - - regs->eflags &= ~TF_MASK; - if (next_rip) { - regs->rip = next_rip; - } else { - regs->rip = orig_rip + (regs->rip - copy_rip); - } -} - -/* - * Interrupts are disabled on entry as trap1 is an interrupt gate and they - * remain disabled thoroughout this function. And we hold kprobe lock. - */ -int post_kprobe_handler(struct pt_regs *regs) -{ - if (!kprobe_running()) - return 0; - - if ((kprobe_status != KPROBE_REENTER) && current_kprobe->post_handler) { - kprobe_status = KPROBE_HIT_SSDONE; - current_kprobe->post_handler(current_kprobe, regs, 0); - } - - resume_execution(current_kprobe, regs); - regs->eflags |= kprobe_saved_rflags; - - /* Restore the original saved kprobes variables and continue. */ - if (kprobe_status == KPROBE_REENTER) { - restore_previous_kprobe(); - goto out; - } else { - unlock_kprobes(); - } -out: - preempt_enable_no_resched(); - - /* - * if somebody else is singlestepping across a probe point, eflags - * will have TF set, in which case, continue the remaining processing - * of do_debug, as if this is not a probe hit. - */ - if (regs->eflags & TF_MASK) - return 0; - - return 1; -} - -/* Interrupts disabled, kprobe_lock held. */ -int kprobe_fault_handler(struct pt_regs *regs, int trapnr) -{ - if (current_kprobe->fault_handler - && current_kprobe->fault_handler(current_kprobe, regs, trapnr)) - return 1; - - if (kprobe_status & KPROBE_HIT_SS) { - resume_execution(current_kprobe, regs); - regs->eflags |= kprobe_old_rflags; - - unlock_kprobes(); - preempt_enable_no_resched(); - } - return 0; -} - -/* - * Wrapper routine for handling exceptions. - */ -int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, - void *data) -{ - struct die_args *args = (struct die_args *)data; - switch (val) { - case DIE_INT3: - if (kprobe_handler(args->regs)) - return NOTIFY_STOP; - break; - case DIE_DEBUG: - if (post_kprobe_handler(args->regs)) - return NOTIFY_STOP; - break; - case DIE_GPF: - if (kprobe_running() && - kprobe_fault_handler(args->regs, args->trapnr)) - return NOTIFY_STOP; - break; - case DIE_PAGE_FAULT: - if (kprobe_running() && - kprobe_fault_handler(args->regs, args->trapnr)) - return NOTIFY_STOP; - break; - default: - break; - } - return NOTIFY_DONE; -} - -int setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) -{ - struct jprobe *jp = container_of(p, struct jprobe, kp); - unsigned long addr; - - jprobe_saved_regs = *regs; - jprobe_saved_rsp = (long *) regs->rsp; - addr = (unsigned long)jprobe_saved_rsp; - /* - * As Linus pointed out, gcc assumes that the callee - * owns the argument space and could overwrite it, e.g. - * tailcall optimization. So, to be absolutely safe - * we also save and restore enough stack bytes to cover - * the argument area. - */ - memcpy(jprobes_stack, (kprobe_opcode_t *) addr, MIN_STACK_SIZE(addr)); - regs->eflags &= ~IF_MASK; - regs->rip = (unsigned long)(jp->entry); - return 1; -} - -void jprobe_return(void) -{ - preempt_enable_no_resched(); - asm volatile (" xchg %%rbx,%%rsp \n" - " int3 \n" - " .globl jprobe_return_end \n" - " jprobe_return_end: \n" - " nop \n"::"b" - (jprobe_saved_rsp):"memory"); -} - -int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) -{ - u8 *addr = (u8 *) (regs->rip - 1); - unsigned long stack_addr = (unsigned long)jprobe_saved_rsp; - struct jprobe *jp = container_of(p, struct jprobe, kp); - - if ((addr > (u8 *) jprobe_return) && (addr < (u8 *) jprobe_return_end)) { - if ((long *)regs->rsp != jprobe_saved_rsp) { - struct pt_regs *saved_regs = - container_of(jprobe_saved_rsp, struct pt_regs, rsp); - printk("current rsp %p does not match saved rsp %p\n", - (long *)regs->rsp, jprobe_saved_rsp); - printk("Saved registers for jprobe %p\n", jp); - show_registers(saved_regs); - printk("Current registers\n"); - show_registers(regs); - BUG(); - } - *regs = jprobe_saved_regs; - memcpy((kprobe_opcode_t *) stack_addr, jprobes_stack, - MIN_STACK_SIZE(stack_addr)); - return 1; - } - return 0; -} - -static struct kprobe trampoline_p = { - .addr = (kprobe_opcode_t *) &kretprobe_trampoline, - .pre_handler = trampoline_probe_handler -}; - -int __init arch_init_kprobes(void) -{ - return register_kprobe(&trampoline_p); -} diff --git a/arch/x86_64/kernel/ldt.c b/arch/x86_64/kernel/ldt.c deleted file mode 100644 index d7e5d0cf428..00000000000 --- a/arch/x86_64/kernel/ldt.c +++ /dev/null @@ -1,253 +0,0 @@ -/* - * linux/arch/x86_64/kernel/ldt.c - * - * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds - * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com> - * Copyright (C) 2002 Andi Kleen - * - * This handles calls from both 32bit and 64bit mode. - */ - -#include <linux/errno.h> -#include <linux/sched.h> -#include <linux/string.h> -#include <linux/mm.h> -#include <linux/smp.h> -#include <linux/smp_lock.h> -#include <linux/vmalloc.h> -#include <linux/slab.h> - -#include <asm/uaccess.h> -#include <asm/system.h> -#include <asm/ldt.h> -#include <asm/desc.h> -#include <asm/proto.h> - -#ifdef CONFIG_SMP /* avoids "defined but not used" warnig */ -static void flush_ldt(void *null) -{ - if (current->active_mm) - load_LDT(¤t->active_mm->context); -} -#endif - -static int alloc_ldt(mm_context_t *pc, unsigned mincount, int reload) -{ - void *oldldt; - void *newldt; - unsigned oldsize; - - if (mincount <= (unsigned)pc->size) - return 0; - oldsize = pc->size; - mincount = (mincount+511)&(~511); - if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE) - newldt = vmalloc(mincount*LDT_ENTRY_SIZE); - else - newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL); - - if (!newldt) - return -ENOMEM; - - if (oldsize) - memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE); - oldldt = pc->ldt; - memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE); - wmb(); - pc->ldt = newldt; - wmb(); - pc->size = mincount; - wmb(); - if (reload) { -#ifdef CONFIG_SMP - cpumask_t mask; - - preempt_disable(); - mask = cpumask_of_cpu(smp_processor_id()); - load_LDT(pc); - if (!cpus_equal(current->mm->cpu_vm_mask, mask)) - smp_call_function(flush_ldt, NULL, 1, 1); - preempt_enable(); -#else - load_LDT(pc); -#endif - } - if (oldsize) { - if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE) - vfree(oldldt); - else - kfree(oldldt); - } - return 0; -} - -static inline int copy_ldt(mm_context_t *new, mm_context_t *old) -{ - int err = alloc_ldt(new, old->size, 0); - if (err < 0) - return err; - memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE); - return 0; -} - -/* - * we do not have to muck with descriptors here, that is - * done in switch_mm() as needed. - */ -int init_new_context(struct task_struct *tsk, struct mm_struct *mm) -{ - struct mm_struct * old_mm; - int retval = 0; - - init_MUTEX(&mm->context.sem); - mm->context.size = 0; - old_mm = current->mm; - if (old_mm && old_mm->context.size > 0) { - down(&old_mm->context.sem); - retval = copy_ldt(&mm->context, &old_mm->context); - up(&old_mm->context.sem); - } - return retval; -} - -/* - * - * Don't touch the LDT register - we're already in the next thread. - */ -void destroy_context(struct mm_struct *mm) -{ - if (mm->context.size) { - if ((unsigned)mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE) - vfree(mm->context.ldt); - else - kfree(mm->context.ldt); - mm->context.size = 0; - } -} - -static int read_ldt(void __user * ptr, unsigned long bytecount) -{ - int err; - unsigned long size; - struct mm_struct * mm = current->mm; - - if (!mm->context.size) - return 0; - if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES) - bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES; - - down(&mm->context.sem); - size = mm->context.size*LDT_ENTRY_SIZE; - if (size > bytecount) - size = bytecount; - - err = 0; - if (copy_to_user(ptr, mm->context.ldt, size)) - err = -EFAULT; - up(&mm->context.sem); - if (err < 0) - goto error_return; - if (size != bytecount) { - /* zero-fill the rest */ - if (clear_user(ptr+size, bytecount-size) != 0) { - err = -EFAULT; - goto error_return; - } - } - return bytecount; -error_return: - return err; -} - -static int read_default_ldt(void __user * ptr, unsigned long bytecount) -{ - /* Arbitrary number */ - /* x86-64 default LDT is all zeros */ - if (bytecount > 128) - bytecount = 128; - if (clear_user(ptr, bytecount)) - return -EFAULT; - return bytecount; -} - -static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode) -{ - struct task_struct *me = current; - struct mm_struct * mm = me->mm; - __u32 entry_1, entry_2, *lp; - int error; - struct user_desc ldt_info; - - error = -EINVAL; - - if (bytecount != sizeof(ldt_info)) - goto out; - error = -EFAULT; - if (copy_from_user(&ldt_info, ptr, bytecount)) - goto out; - - error = -EINVAL; - if (ldt_info.entry_number >= LDT_ENTRIES) - goto out; - if (ldt_info.contents == 3) { - if (oldmode) - goto out; - if (ldt_info.seg_not_present == 0) - goto out; - } - - down(&mm->context.sem); - if (ldt_info.entry_number >= (unsigned)mm->context.size) { - error = alloc_ldt(¤t->mm->context, ldt_info.entry_number+1, 1); - if (error < 0) - goto out_unlock; - } - - lp = (__u32 *) ((ldt_info.entry_number << 3) + (char *) mm->context.ldt); - - /* Allow LDTs to be cleared by the user. */ - if (ldt_info.base_addr == 0 && ldt_info.limit == 0) { - if (oldmode || LDT_empty(&ldt_info)) { - entry_1 = 0; - entry_2 = 0; - goto install; - } - } - - entry_1 = LDT_entry_a(&ldt_info); - entry_2 = LDT_entry_b(&ldt_info); - if (oldmode) - entry_2 &= ~(1 << 20); - - /* Install the new entry ... */ -install: - *lp = entry_1; - *(lp+1) = entry_2; - error = 0; - -out_unlock: - up(&mm->context.sem); -out: - return error; -} - -asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount) -{ - int ret = -ENOSYS; - - switch (func) { - case 0: - ret = read_ldt(ptr, bytecount); - break; - case 1: - ret = write_ldt(ptr, bytecount, 1); - break; - case 2: - ret = read_default_ldt(ptr, bytecount); - break; - case 0x11: - ret = write_ldt(ptr, bytecount, 0); - break; - } - return ret; -} diff --git a/arch/x86_64/kernel/machine_kexec.c b/arch/x86_64/kernel/machine_kexec.c deleted file mode 100644 index 89fab51e20f..00000000000 --- a/arch/x86_64/kernel/machine_kexec.c +++ /dev/null @@ -1,231 +0,0 @@ -/* - * machine_kexec.c - handle transition of Linux booting another kernel - * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com> - * - * This source code is licensed under the GNU General Public License, - * Version 2. See the file COPYING for more details. - */ - -#include <linux/mm.h> -#include <linux/kexec.h> -#include <linux/string.h> -#include <linux/reboot.h> -#include <asm/pgtable.h> -#include <asm/tlbflush.h> -#include <asm/mmu_context.h> -#include <asm/io.h> - -static void init_level2_page(pmd_t *level2p, unsigned long addr) -{ - unsigned long end_addr; - - addr &= PAGE_MASK; - end_addr = addr + PUD_SIZE; - while (addr < end_addr) { - set_pmd(level2p++, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC)); - addr += PMD_SIZE; - } -} - -static int init_level3_page(struct kimage *image, pud_t *level3p, - unsigned long addr, unsigned long last_addr) -{ - unsigned long end_addr; - int result; - - result = 0; - addr &= PAGE_MASK; - end_addr = addr + PGDIR_SIZE; - while ((addr < last_addr) && (addr < end_addr)) { - struct page *page; - pmd_t *level2p; - - page = kimage_alloc_control_pages(image, 0); - if (!page) { - result = -ENOMEM; - goto out; - } - level2p = (pmd_t *)page_address(page); - init_level2_page(level2p, addr); - set_pud(level3p++, __pud(__pa(level2p) | _KERNPG_TABLE)); - addr += PUD_SIZE; - } - /* clear the unused entries */ - while (addr < end_addr) { - pud_clear(level3p++); - addr += PUD_SIZE; - } -out: - return result; -} - - -static int init_level4_page(struct kimage *image, pgd_t *level4p, - unsigned long addr, unsigned long last_addr) -{ - unsigned long end_addr; - int result; - - result = 0; - addr &= PAGE_MASK; - end_addr = addr + (PTRS_PER_PGD * PGDIR_SIZE); - while ((addr < last_addr) && (addr < end_addr)) { - struct page *page; - pud_t *level3p; - - page = kimage_alloc_control_pages(image, 0); - if (!page) { - result = -ENOMEM; - goto out; - } - level3p = (pud_t *)page_address(page); - result = init_level3_page(image, level3p, addr, last_addr); - if (result) { - goto out; - } - set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE)); - addr += PGDIR_SIZE; - } - /* clear the unused entries */ - while (addr < end_addr) { - pgd_clear(level4p++); - addr += PGDIR_SIZE; - } -out: - return result; -} - - -static int init_pgtable(struct kimage *image, unsigned long start_pgtable) -{ - pgd_t *level4p; - level4p = (pgd_t *)__va(start_pgtable); - return init_level4_page(image, level4p, 0, end_pfn << PAGE_SHIFT); -} - -static void set_idt(void *newidt, u16 limit) -{ - struct desc_ptr curidt; - - /* x86-64 supports unaliged loads & stores */ - curidt.size = limit; - curidt.address = (unsigned long)newidt; - - __asm__ __volatile__ ( - "lidtq %0\n" - : : "m" (curidt) - ); -}; - - -static void set_gdt(void *newgdt, u16 limit) -{ - struct desc_ptr curgdt; - - /* x86-64 supports unaligned loads & stores */ - curgdt.size = limit; - curgdt.address = (unsigned long)newgdt; - - __asm__ __volatile__ ( - "lgdtq %0\n" - : : "m" (curgdt) - ); -}; - -static void load_segments(void) -{ - __asm__ __volatile__ ( - "\tmovl %0,%%ds\n" - "\tmovl %0,%%es\n" - "\tmovl %0,%%ss\n" - "\tmovl %0,%%fs\n" - "\tmovl %0,%%gs\n" - : : "a" (__KERNEL_DS) - ); -} - -typedef NORET_TYPE void (*relocate_new_kernel_t)(unsigned long indirection_page, - unsigned long control_code_buffer, - unsigned long start_address, - unsigned long pgtable) ATTRIB_NORET; - -const extern unsigned char relocate_new_kernel[]; -const extern unsigned long relocate_new_kernel_size; - -int machine_kexec_prepare(struct kimage *image) -{ - unsigned long start_pgtable, control_code_buffer; - int result; - - /* Calculate the offsets */ - start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT; - control_code_buffer = start_pgtable + PAGE_SIZE; - - /* Setup the identity mapped 64bit page table */ - result = init_pgtable(image, start_pgtable); - if (result) - return result; - - /* Place the code in the reboot code buffer */ - memcpy(__va(control_code_buffer), relocate_new_kernel, - relocate_new_kernel_size); - - return 0; -} - -void machine_kexec_cleanup(struct kimage *image) -{ - return; -} - -/* - * Do not allocate memory (or fail in any way) in machine_kexec(). - * We are past the point of no return, committed to rebooting now. - */ -NORET_TYPE void machine_kexec(struct kimage *image) -{ - unsigned long page_list; - unsigned long control_code_buffer; - unsigned long start_pgtable; - relocate_new_kernel_t rnk; - - /* Interrupts aren't acceptable while we reboot */ - local_irq_disable(); - - /* Calculate the offsets */ - page_list = image->head; - start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT; - control_code_buffer = start_pgtable + PAGE_SIZE; - - /* Set the low half of the page table to my identity mapped - * page table for kexec. Leave the high half pointing at the - * kernel pages. Don't bother to flush the global pages - * as that will happen when I fully switch to my identity mapped - * page table anyway. - */ - memcpy(__va(read_cr3()), __va(start_pgtable), PAGE_SIZE/2); - __flush_tlb(); - - - /* The segment registers are funny things, they are - * automatically loaded from a table, in memory wherever you - * set them to a specific selector, but this table is never - * accessed again unless you set the segment to a different selector. - * - * The more common model are caches where the behide - * the scenes work is done, but is also dropped at arbitrary - * times. - * - * I take advantage of this here by force loading the - * segments, before I zap the gdt with an invalid value. - */ - load_segments(); - /* The gdt & idt are now invalid. - * If you want to load them you must set up your own idt & gdt. - */ - set_gdt(phys_to_virt(0),0); - set_idt(phys_to_virt(0),0); - /* now call it */ - rnk = (relocate_new_kernel_t) control_code_buffer; - (*rnk)(page_list, control_code_buffer, image->start, start_pgtable); -} diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c deleted file mode 100644 index 3b267c91bb0..00000000000 --- a/arch/x86_64/kernel/mce.c +++ /dev/null @@ -1,623 +0,0 @@ -/* - * Machine check handler. - * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. - * Rest from unknown author(s). - * 2004 Andi Kleen. Rewrote most of it. - */ - -#include <linux/init.h> -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/string.h> -#include <linux/rcupdate.h> -#include <linux/kallsyms.h> -#include <linux/sysdev.h> -#include <linux/miscdevice.h> -#include <linux/fs.h> -#include <linux/cpu.h> -#include <linux/percpu.h> -#include <asm/processor.h> -#include <asm/msr.h> -#include <asm/mce.h> -#include <asm/kdebug.h> -#include <asm/uaccess.h> - -#define MISC_MCELOG_MINOR 227 -#define NR_BANKS 5 - -static int mce_dont_init; - -/* 0: always panic, 1: panic if deadlock possible, 2: try to avoid panic, - 3: never panic or exit (for testing only) */ -static int tolerant = 1; -static int banks; -static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL }; -static unsigned long console_logged; -static int notify_user; -static int rip_msr; - -/* - * Lockless MCE logging infrastructure. - * This avoids deadlocks on printk locks without having to break locks. Also - * separate MCEs from kernel messages to avoid bogus bug reports. - */ - -struct mce_log mcelog = { - MCE_LOG_SIGNATURE, - MCE_LOG_LEN, -}; - -void mce_log(struct mce *mce) -{ - unsigned next, entry; - mce->finished = 0; - smp_wmb(); - for (;;) { - entry = rcu_dereference(mcelog.next); - /* When the buffer fills up discard new entries. Assume - that the earlier errors are the more interesting. */ - if (entry >= MCE_LOG_LEN) { - set_bit(MCE_OVERFLOW, &mcelog.flags); - return; - } - /* Old left over entry. Skip. */ - if (mcelog.entry[entry].finished) - continue; - smp_rmb(); - next = entry + 1; - if (cmpxchg(&mcelog.next, entry, next) == entry) - break; - } - memcpy(mcelog.entry + entry, mce, sizeof(struct mce)); - smp_wmb(); - mcelog.entry[entry].finished = 1; - smp_wmb(); - - if (!test_and_set_bit(0, &console_logged)) - notify_user = 1; -} - -static void print_mce(struct mce *m) -{ - printk(KERN_EMERG "\n" - KERN_EMERG - "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", - m->cpu, m->mcgstatus, m->bank, m->status); - if (m->rip) { - printk(KERN_EMERG - "RIP%s %02x:<%016Lx> ", - !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", - m->cs, m->rip); - if (m->cs == __KERNEL_CS) - print_symbol("{%s}", m->rip); - printk("\n"); - } - printk(KERN_EMERG "TSC %Lx ", m->tsc); - if (m->addr) - printk("ADDR %Lx ", m->addr); - if (m->misc) - printk("MISC %Lx ", m->misc); - printk("\n"); -} - -static void mce_panic(char *msg, struct mce *backup, unsigned long start) -{ - int i; - oops_begin(); - for (i = 0; i < MCE_LOG_LEN; i++) { - unsigned long tsc = mcelog.entry[i].tsc; - if (time_before(tsc, start)) - continue; - print_mce(&mcelog.entry[i]); - if (backup && mcelog.entry[i].tsc == backup->tsc) - backup = NULL; - } - if (backup) - print_mce(backup); - if (tolerant >= 3) - printk("Fake panic: %s\n", msg); - else - panic(msg); -} - -static int mce_available(struct cpuinfo_x86 *c) -{ - return test_bit(X86_FEATURE_MCE, &c->x86_capability) && - test_bit(X86_FEATURE_MCA, &c->x86_capability); -} - -static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) -{ - if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) { - m->rip = regs->rip; - m->cs = regs->cs; - } else { - m->rip = 0; - m->cs = 0; - } - if (rip_msr) { - /* Assume the RIP in the MSR is exact. Is this true? */ - m->mcgstatus |= MCG_STATUS_EIPV; - rdmsrl(rip_msr, m->rip); - m->cs = 0; - } -} - -/* - * The actual machine check handler - */ - -void do_machine_check(struct pt_regs * regs, long error_code) -{ - struct mce m, panicm; - int nowayout = (tolerant < 1); - int kill_it = 0; - u64 mcestart = 0; - int i; - int panicm_found = 0; - - if (regs) - notify_die(DIE_NMI, "machine check", regs, error_code, 255, SIGKILL); - if (!banks) - return; - - memset(&m, 0, sizeof(struct mce)); - m.cpu = hard_smp_processor_id(); - rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); - if (!(m.mcgstatus & MCG_STATUS_RIPV)) - kill_it = 1; - - rdtscll(mcestart); - barrier(); - - for (i = 0; i < banks; i++) { - if (!bank[i]) - continue; - - m.misc = 0; - m.addr = 0; - m.bank = i; - m.tsc = 0; - - rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status); - if ((m.status & MCI_STATUS_VAL) == 0) - continue; - - if (m.status & MCI_STATUS_EN) { - /* In theory _OVER could be a nowayout too, but - assume any overflowed errors were no fatal. */ - nowayout |= !!(m.status & MCI_STATUS_PCC); - kill_it |= !!(m.status & MCI_STATUS_UC); - } - - if (m.status & MCI_STATUS_MISCV) - rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc); - if (m.status & MCI_STATUS_ADDRV) - rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); - - mce_get_rip(&m, regs); - if (error_code != -1) - rdtscll(m.tsc); - wrmsrl(MSR_IA32_MC0_STATUS + i*4, 0); - mce_log(&m); - - /* Did this bank cause the exception? */ - /* Assume that the bank with uncorrectable errors did it, - and that there is only a single one. */ - if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) { - panicm = m; - panicm_found = 1; - } - - tainted |= TAINT_MACHINE_CHECK; - } - - /* Never do anything final in the polling timer */ - if (!regs) - goto out; - - /* If we didn't find an uncorrectable error, pick - the last one (shouldn't happen, just being safe). */ - if (!panicm_found) - panicm = m; - if (nowayout) - mce_panic("Machine check", &panicm, mcestart); - if (kill_it) { - int user_space = 0; - - if (m.mcgstatus & MCG_STATUS_RIPV) - user_space = panicm.rip && (panicm.cs & 3); - - /* When the machine was in user space and the CPU didn't get - confused it's normally not necessary to panic, unless you - are paranoid (tolerant == 0) - - RED-PEN could be more tolerant for MCEs in idle, - but most likely they occur at boot anyways, where - it is best to just halt the machine. */ - if ((!user_space && (panic_on_oops || tolerant < 2)) || - (unsigned)current->pid <= 1) - mce_panic("Uncorrected machine check", &panicm, mcestart); - - /* do_exit takes an awful lot of locks and has as - slight risk of deadlocking. If you don't want that - don't set tolerant >= 2 */ - if (tolerant < 3) - do_exit(SIGBUS); - } - - out: - /* Last thing done in the machine check exception to clear state. */ - wrmsrl(MSR_IA32_MCG_STATUS, 0); -} - -/* - * Periodic polling timer for "silent" machine check errors. - */ - -static int check_interval = 5 * 60; /* 5 minutes */ -static void mcheck_timer(void *data); -static DECLARE_WORK(mcheck_work, mcheck_timer, NULL); - -static void mcheck_check_cpu(void *info) -{ - if (mce_available(¤t_cpu_data)) - do_machine_check(NULL, 0); -} - -static void mcheck_timer(void *data) -{ - on_each_cpu(mcheck_check_cpu, NULL, 1, 1); - schedule_delayed_work(&mcheck_work, check_interval * HZ); - - /* - * It's ok to read stale data here for notify_user and - * console_logged as we'll simply get the updated versions - * on the next mcheck_timer execution and atomic operations - * on console_logged act as synchronization for notify_user - * writes. - */ - if (notify_user && console_logged) { - notify_user = 0; - clear_bit(0, &console_logged); - printk(KERN_INFO "Machine check events logged\n"); - } -} - - -static __init int periodic_mcheck_init(void) -{ - if (check_interval) - schedule_delayed_work(&mcheck_work, check_interval*HZ); - return 0; -} -__initcall(periodic_mcheck_init); - - -/* - * Initialize Machine Checks for a CPU. - */ -static void mce_init(void *dummy) -{ - u64 cap; - int i; - - rdmsrl(MSR_IA32_MCG_CAP, cap); - banks = cap & 0xff; - if (banks > NR_BANKS) { - printk(KERN_INFO "MCE: warning: using only %d banks\n", banks); - banks = NR_BANKS; - } - /* Use accurate RIP reporting if available. */ - if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9) - rip_msr = MSR_IA32_MCG_EIP; - - /* Log the machine checks left over from the previous reset. - This also clears all registers */ - do_machine_check(NULL, -1); - - set_in_cr4(X86_CR4_MCE); - - if (cap & MCG_CTL_P) - wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); - - for (i = 0; i < banks; i++) { - wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); - wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); - } -} - -/* Add per CPU specific workarounds here */ -static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c) -{ - /* This should be disabled by the BIOS, but isn't always */ - if (c->x86_vendor == X86_VENDOR_AMD && c->x86 == 15) { - /* disable GART TBL walk error reporting, which trips off - incorrectly with the IOMMU & 3ware & Cerberus. */ - clear_bit(10, &bank[4]); - } -} - -static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c) -{ - switch (c->x86_vendor) { - case X86_VENDOR_INTEL: - mce_intel_feature_init(c); - break; - default: - break; - } -} - -/* - * Called for each booted CPU to set up machine checks. - * Must be called with preempt off. - */ -void __cpuinit mcheck_init(struct cpuinfo_x86 *c) -{ - static cpumask_t mce_cpus __initdata = CPU_MASK_NONE; - - mce_cpu_quirks(c); - - if (mce_dont_init || - cpu_test_and_set(smp_processor_id(), mce_cpus) || - !mce_available(c)) - return; - - mce_init(NULL); - mce_cpu_features(c); -} - -/* - * Character device to read and clear the MCE log. - */ - -static void collect_tscs(void *data) -{ - unsigned long *cpu_tsc = (unsigned long *)data; - rdtscll(cpu_tsc[smp_processor_id()]); -} - -static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff_t *off) -{ - unsigned long *cpu_tsc; - static DECLARE_MUTEX(mce_read_sem); - unsigned next; - char __user *buf = ubuf; - int i, err; - - cpu_tsc = kmalloc(NR_CPUS * sizeof(long), GFP_KERNEL); - if (!cpu_tsc) - return -ENOMEM; - - down(&mce_read_sem); - next = rcu_dereference(mcelog.next); - - /* Only supports full reads right now */ - if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) { - up(&mce_read_sem); - kfree(cpu_tsc); - return -EINVAL; - } - - err = 0; - for (i = 0; i < next; i++) { - if (!mcelog.entry[i].finished) - continue; - smp_rmb(); - err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce)); - buf += sizeof(struct mce); - } - - memset(mcelog.entry, 0, next * sizeof(struct mce)); - mcelog.next = 0; - - synchronize_sched(); - - /* Collect entries that were still getting written before the synchronize. */ - - on_each_cpu(collect_tscs, cpu_tsc, 1, 1); - for (i = next; i < MCE_LOG_LEN; i++) { - if (mcelog.entry[i].finished && - mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) { - err |= copy_to_user(buf, mcelog.entry+i, sizeof(struct mce)); - smp_rmb(); - buf += sizeof(struct mce); - memset(&mcelog.entry[i], 0, sizeof(struct mce)); - } - } - up(&mce_read_sem); - kfree(cpu_tsc); - return err ? -EFAULT : buf - ubuf; -} - -static int mce_ioctl(struct inode *i, struct file *f,unsigned int cmd, unsigned long arg) -{ - int __user *p = (int __user *)arg; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - switch (cmd) { - case MCE_GET_RECORD_LEN: - return put_user(sizeof(struct mce), p); - case MCE_GET_LOG_LEN: - return put_user(MCE_LOG_LEN, p); - case MCE_GETCLEAR_FLAGS: { - unsigned flags; - do { - flags = mcelog.flags; - } while (cmpxchg(&mcelog.flags, flags, 0) != flags); - return put_user(flags, p); - } - default: - return -ENOTTY; - } -} - -static struct file_operations mce_chrdev_ops = { - .read = mce_read, - .ioctl = mce_ioctl, -}; - -static struct miscdevice mce_log_device = { - MISC_MCELOG_MINOR, - "mcelog", - &mce_chrdev_ops, -}; - -/* - * Old style boot options parsing. Only for compatibility. - */ - -static int __init mcheck_disable(char *str) -{ - mce_dont_init = 1; - return 0; -} - -/* mce=off disables machine check. Note you can reenable it later - using sysfs */ -static int __init mcheck_enable(char *str) -{ - if (!strcmp(str, "off")) - mce_dont_init = 1; - else - printk("mce= argument %s ignored. Please use /sys", str); - return 0; -} - -__setup("nomce", mcheck_disable); -__setup("mce", mcheck_enable); - -/* - * Sysfs support - */ - -/* On resume clear all MCE state. Don't want to see leftovers from the BIOS. */ -static int mce_resume(struct sys_device *dev) -{ - on_each_cpu(mce_init, NULL, 1, 1); - return 0; -} - -/* Reinit MCEs after user configuration changes */ -static void mce_restart(void) -{ - if (check_interval) - cancel_delayed_work(&mcheck_work); - /* Timer race is harmless here */ - on_each_cpu(mce_init, NULL, 1, 1); - if (check_interval) - schedule_delayed_work(&mcheck_work, check_interval*HZ); -} - -static struct sysdev_class mce_sysclass = { - .resume = mce_resume, - set_kset_name("machinecheck"), -}; - -static DEFINE_PER_CPU(struct sys_device, device_mce); - -/* Why are there no generic functions for this? */ -#define ACCESSOR(name, var, start) \ - static ssize_t show_ ## name(struct sys_device *s, char *buf) { \ - return sprintf(buf, "%lx\n", (unsigned long)var); \ - } \ - static ssize_t set_ ## name(struct sys_device *s,const char *buf,size_t siz) { \ - char *end; \ - unsigned long new = simple_strtoul(buf, &end, 0); \ - if (end == buf) return -EINVAL; \ - var = new; \ - start; \ - return end-buf; \ - } \ - static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name); - -ACCESSOR(bank0ctl,bank[0],mce_restart()) -ACCESSOR(bank1ctl,bank[1],mce_restart()) -ACCESSOR(bank2ctl,bank[2],mce_restart()) -ACCESSOR(bank3ctl,bank[3],mce_restart()) -ACCESSOR(bank4ctl,bank[4],mce_restart()) -ACCESSOR(tolerant,tolerant,) -ACCESSOR(check_interval,check_interval,mce_restart()) - -/* Per cpu sysdev init. All of the cpus still share the same ctl bank */ -static __cpuinit int mce_create_device(unsigned int cpu) -{ - int err; - if (!mce_available(&cpu_data[cpu])) - return -EIO; - - per_cpu(device_mce,cpu).id = cpu; - per_cpu(device_mce,cpu).cls = &mce_sysclass; - - err = sysdev_register(&per_cpu(device_mce,cpu)); - - if (!err) { - sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank0ctl); - sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank1ctl); - sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank2ctl); - sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank3ctl); - sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank4ctl); - sysdev_create_file(&per_cpu(device_mce,cpu), &attr_tolerant); - sysdev_create_file(&per_cpu(device_mce,cpu), &attr_check_interval); - } - return err; -} - -#ifdef CONFIG_HOTPLUG_CPU -static __cpuinit void mce_remove_device(unsigned int cpu) -{ - sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank0ctl); - sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank1ctl); - sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank2ctl); - sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank3ctl); - sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank4ctl); - sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_tolerant); - sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_check_interval); - sysdev_unregister(&per_cpu(device_mce,cpu)); -} -#endif - -/* Get notified when a cpu comes on/off. Be hotplug friendly. */ -static __cpuinit int -mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) -{ - unsigned int cpu = (unsigned long)hcpu; - - switch (action) { - case CPU_ONLINE: - mce_create_device(cpu); - break; -#ifdef CONFIG_HOTPLUG_CPU - case CPU_DEAD: - mce_remove_device(cpu); - break; -#endif - } - return NOTIFY_OK; -} - -static struct notifier_block mce_cpu_notifier = { - .notifier_call = mce_cpu_callback, -}; - -static __init int mce_init_device(void) -{ - int err; - int i = 0; - - if (!mce_available(&boot_cpu_data)) - return -EIO; - err = sysdev_class_register(&mce_sysclass); - - for_each_online_cpu(i) { - mce_create_device(i); - } - - register_cpu_notifier(&mce_cpu_notifier); - misc_register(&mce_log_device); - return err; -} - -device_initcall(mce_init_device); diff --git a/arch/x86_64/kernel/mce_intel.c b/arch/x86_64/kernel/mce_intel.c deleted file mode 100644 index 0be0a795981..00000000000 --- a/arch/x86_64/kernel/mce_intel.c +++ /dev/null @@ -1,99 +0,0 @@ -/* - * Intel specific MCE features. - * Copyright 2004 Zwane Mwaikambo <zwane@linuxpower.ca> - */ - -#include <linux/init.h> -#include <linux/interrupt.h> -#include <linux/percpu.h> -#include <asm/processor.h> -#include <asm/msr.h> -#include <asm/mce.h> -#include <asm/hw_irq.h> - -static DEFINE_PER_CPU(unsigned long, next_check); - -asmlinkage void smp_thermal_interrupt(void) -{ - struct mce m; - - ack_APIC_irq(); - - irq_enter(); - if (time_before(jiffies, __get_cpu_var(next_check))) - goto done; - - __get_cpu_var(next_check) = jiffies + HZ*300; - memset(&m, 0, sizeof(m)); - m.cpu = smp_processor_id(); - m.bank = MCE_THERMAL_BANK; - rdtscll(m.tsc); - rdmsrl(MSR_IA32_THERM_STATUS, m.status); - if (m.status & 0x1) { - printk(KERN_EMERG - "CPU%d: Temperature above threshold, cpu clock throttled\n", m.cpu); - add_taint(TAINT_MACHINE_CHECK); - } else { - printk(KERN_EMERG "CPU%d: Temperature/speed normal\n", m.cpu); - } - - mce_log(&m); -done: - irq_exit(); -} - -static void __cpuinit intel_init_thermal(struct cpuinfo_x86 *c) -{ - u32 l, h; - int tm2 = 0; - unsigned int cpu = smp_processor_id(); - - if (!cpu_has(c, X86_FEATURE_ACPI)) - return; - - if (!cpu_has(c, X86_FEATURE_ACC)) - return; - - /* first check if TM1 is already enabled by the BIOS, in which - * case there might be some SMM goo which handles it, so we can't even - * put a handler since it might be delivered via SMI already. - */ - rdmsr(MSR_IA32_MISC_ENABLE, l, h); - h = apic_read(APIC_LVTTHMR); - if ((l & (1 << 3)) && (h & APIC_DM_SMI)) { - printk(KERN_DEBUG - "CPU%d: Thermal monitoring handled by SMI\n", cpu); - return; - } - - if (cpu_has(c, X86_FEATURE_TM2) && (l & (1 << 13))) - tm2 = 1; - - if (h & APIC_VECTOR_MASK) { - printk(KERN_DEBUG - "CPU%d: Thermal LVT vector (%#x) already " - "installed\n", cpu, (h & APIC_VECTOR_MASK)); - return; - } - - h = THERMAL_APIC_VECTOR; - h |= (APIC_DM_FIXED | APIC_LVT_MASKED); - apic_write_around(APIC_LVTTHMR, h); - - rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); - wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03, h); - - rdmsr(MSR_IA32_MISC_ENABLE, l, h); - wrmsr(MSR_IA32_MISC_ENABLE, l | (1 << 3), h); - - l = apic_read(APIC_LVTTHMR); - apic_write_around(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); - printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n", - cpu, tm2 ? "TM2" : "TM1"); - return; -} - -void __cpuinit mce_intel_feature_init(struct cpuinfo_x86 *c) -{ - intel_init_thermal(c); -} diff --git a/arch/x86_64/kernel/module.c b/arch/x86_64/kernel/module.c deleted file mode 100644 index bac195c74bc..00000000000 --- a/arch/x86_64/kernel/module.c +++ /dev/null @@ -1,170 +0,0 @@ -/* Kernel module help for x86-64 - Copyright (C) 2001 Rusty Russell. - Copyright (C) 2002,2003 Andi Kleen, SuSE Labs. - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -*/ -#include <linux/moduleloader.h> -#include <linux/elf.h> -#include <linux/vmalloc.h> -#include <linux/fs.h> -#include <linux/string.h> -#include <linux/kernel.h> -#include <linux/slab.h> - -#include <asm/system.h> -#include <asm/page.h> -#include <asm/pgtable.h> - -#define DEBUGP(fmt...) - -#ifndef CONFIG_UML -void module_free(struct module *mod, void *module_region) -{ - vfree(module_region); - /* FIXME: If module_region == mod->init_region, trim exception - table entries. */ -} - -void *module_alloc(unsigned long size) -{ - struct vm_struct *area; - - if (!size) - return NULL; - size = PAGE_ALIGN(size); - if (size > MODULES_LEN) - return NULL; - - area = __get_vm_area(size, VM_ALLOC, MODULES_VADDR, MODULES_END); - if (!area) - return NULL; - - return __vmalloc_area(area, GFP_KERNEL, PAGE_KERNEL_EXEC); -} -#endif - -/* We don't need anything special. */ -int module_frob_arch_sections(Elf_Ehdr *hdr, - Elf_Shdr *sechdrs, - char *secstrings, - struct module *mod) -{ - return 0; -} - -int apply_relocate_add(Elf64_Shdr *sechdrs, - const char *strtab, - unsigned int symindex, - unsigned int relsec, - struct module *me) -{ - unsigned int i; - Elf64_Rela *rel = (void *)sechdrs[relsec].sh_addr; - Elf64_Sym *sym; - void *loc; - u64 val; - - DEBUGP("Applying relocate section %u to %u\n", relsec, - sechdrs[relsec].sh_info); - for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { - /* This is where to make the change */ - loc = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr - + rel[i].r_offset; - - /* This is the symbol it is referring to. Note that all - undefined symbols have been resolved. */ - sym = (Elf64_Sym *)sechdrs[symindex].sh_addr - + ELF64_R_SYM(rel[i].r_info); - - DEBUGP("type %d st_value %Lx r_addend %Lx loc %Lx\n", - (int)ELF64_R_TYPE(rel[i].r_info), - sym->st_value, rel[i].r_addend, (u64)loc); - - val = sym->st_value + rel[i].r_addend; - - switch (ELF64_R_TYPE(rel[i].r_info)) { - case R_X86_64_NONE: - break; - case R_X86_64_64: - *(u64 *)loc = val; - break; - case R_X86_64_32: - *(u32 *)loc = val; - if (val != *(u32 *)loc) - goto overflow; - break; - case R_X86_64_32S: - *(s32 *)loc = val; - if ((s64)val != *(s32 *)loc) - goto overflow; - break; - case R_X86_64_PC32: - val -= (u64)loc; - *(u32 *)loc = val; -#if 0 - if ((s64)val != *(s32 *)loc) - goto overflow; -#endif - break; - default: - printk(KERN_ERR "module %s: Unknown rela relocation: %Lu\n", - me->name, ELF64_R_TYPE(rel[i].r_info)); - return -ENOEXEC; - } - } - return 0; - -overflow: - printk(KERN_ERR "overflow in relocation type %d val %Lx\n", - (int)ELF64_R_TYPE(rel[i].r_info), val); - printk(KERN_ERR "`%s' likely not compiled with -mcmodel=kernel\n", - me->name); - return -ENOEXEC; -} - -int apply_relocate(Elf_Shdr *sechdrs, - const char *strtab, - unsigned int symindex, - unsigned int relsec, - struct module *me) -{ - printk("non add relocation not supported\n"); - return -ENOSYS; -} - -extern void apply_alternatives(void *start, void *end); - -int module_finalize(const Elf_Ehdr *hdr, - const Elf_Shdr *sechdrs, - struct module *me) -{ - const Elf_Shdr *s; - char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; - - /* look for .altinstructions to patch */ - for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { - void *seg; - if (strcmp(".altinstructions", secstrings + s->sh_name)) - continue; - seg = (void *)s->sh_addr; - apply_alternatives(seg, seg + s->sh_size); - } - return 0; -} - -void module_arch_cleanup(struct module *mod) -{ -} diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c deleted file mode 100644 index 08abf9f5b15..00000000000 --- a/arch/x86_64/kernel/mpparse.c +++ /dev/null @@ -1,984 +0,0 @@ -/* - * Intel Multiprocessor Specification 1.1 and 1.4 - * compliant MP-table parsing routines. - * - * (c) 1995 Alan Cox, Building #3 <alan@redhat.com> - * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com> - * - * Fixes - * Erich Boleyn : MP v1.4 and additional changes. - * Alan Cox : Added EBDA scanning - * Ingo Molnar : various cleanups and rewrites - * Maciej W. Rozycki: Bits for default MP configurations - * Paul Diefenbaugh: Added full ACPI support - */ - -#include <linux/mm.h> -#include <linux/irq.h> -#include <linux/init.h> -#include <linux/delay.h> -#include <linux/config.h> -#include <linux/bootmem.h> -#include <linux/smp_lock.h> -#include <linux/kernel_stat.h> -#include <linux/mc146818rtc.h> -#include <linux/acpi.h> -#include <linux/module.h> - -#include <asm/smp.h> -#include <asm/mtrr.h> -#include <asm/mpspec.h> -#include <asm/pgalloc.h> -#include <asm/io_apic.h> -#include <asm/proto.h> -#include <asm/acpi.h> - -/* Have we found an MP table */ -int smp_found_config; -unsigned int __initdata maxcpus = NR_CPUS; - -int acpi_found_madt; - -/* - * Various Linux-internal data structures created from the - * MP-table. - */ -int apic_version [MAX_APICS]; -unsigned char mp_bus_id_to_type [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 }; -int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 }; -unsigned char pci_bus_to_node [256]; -EXPORT_SYMBOL(pci_bus_to_node); - -static int mp_current_pci_id = 0; -/* I/O APIC entries */ -struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS]; - -/* # of MP IRQ source entries */ -struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; - -/* MP IRQ source entries */ -int mp_irq_entries; - -int nr_ioapics; -int pic_mode; -unsigned long mp_lapic_addr = 0; - - - -/* Processor that is doing the boot up */ -unsigned int boot_cpu_id = -1U; -/* Internal processor count */ -static unsigned int num_processors = 0; - -/* Bitmask of physically existing CPUs */ -physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE; - -/* ACPI MADT entry parsing functions */ -#ifdef CONFIG_ACPI_BOOT -extern struct acpi_boot_flags acpi_boot; -#ifdef CONFIG_X86_LOCAL_APIC -extern int acpi_parse_lapic (acpi_table_entry_header *header); -extern int acpi_parse_lapic_addr_ovr (acpi_table_entry_header *header); -extern int acpi_parse_lapic_nmi (acpi_table_entry_header *header); -#endif /*CONFIG_X86_LOCAL_APIC*/ -#ifdef CONFIG_X86_IO_APIC -extern int acpi_parse_ioapic (acpi_table_entry_header *header); -#endif /*CONFIG_X86_IO_APIC*/ -#endif /*CONFIG_ACPI_BOOT*/ - -u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; - - -/* - * Intel MP BIOS table parsing routines: - */ - -/* - * Checksum an MP configuration block. - */ - -static int __init mpf_checksum(unsigned char *mp, int len) -{ - int sum = 0; - - while (len--) - sum += *mp++; - - return sum & 0xFF; -} - -static void __init MP_processor_info (struct mpc_config_processor *m) -{ - int ver, cpu; - static int found_bsp=0; - - if (!(m->mpc_cpuflag & CPU_ENABLED)) - return; - - printk(KERN_INFO "Processor #%d %d:%d APIC version %d\n", - m->mpc_apicid, - (m->mpc_cpufeature & CPU_FAMILY_MASK)>>8, - (m->mpc_cpufeature & CPU_MODEL_MASK)>>4, - m->mpc_apicver); - - if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { - Dprintk(" Bootup CPU\n"); - boot_cpu_id = m->mpc_apicid; - } - if (num_processors >= NR_CPUS) { - printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached." - " Processor ignored.\n", NR_CPUS); - return; - } - - cpu = num_processors++; - - if (m->mpc_apicid > MAX_APICS) { - printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n", - m->mpc_apicid, MAX_APICS); - return; - } - ver = m->mpc_apicver; - - physid_set(m->mpc_apicid, phys_cpu_present_map); - /* - * Validate version - */ - if (ver == 0x0) { - printk(KERN_ERR "BIOS bug, APIC version is 0 for CPU#%d! fixing up to 0x10. (tell your hw vendor)\n", m->mpc_apicid); - ver = 0x10; - } - apic_version[m->mpc_apicid] = ver; - if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { - /* - * bios_cpu_apicid is required to have processors listed - * in same order as logical cpu numbers. Hence the first - * entry is BSP, and so on. - */ - cpu = 0; - - bios_cpu_apicid[0] = m->mpc_apicid; - x86_cpu_to_apicid[0] = m->mpc_apicid; - found_bsp = 1; - } else - cpu = num_processors - found_bsp; - bios_cpu_apicid[cpu] = m->mpc_apicid; - x86_cpu_to_apicid[cpu] = m->mpc_apicid; - - cpu_set(cpu, cpu_possible_map); - cpu_set(cpu, cpu_present_map); -} - -static void __init MP_bus_info (struct mpc_config_bus *m) -{ - char str[7]; - - memcpy(str, m->mpc_bustype, 6); - str[6] = 0; - Dprintk("Bus #%d is %s\n", m->mpc_busid, str); - - if (strncmp(str, "ISA", 3) == 0) { - mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA; - } else if (strncmp(str, "EISA", 4) == 0) { - mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA; - } else if (strncmp(str, "PCI", 3) == 0) { - mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI; - mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id; - mp_current_pci_id++; - } else if (strncmp(str, "MCA", 3) == 0) { - mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA; - } else { - printk(KERN_ERR "Unknown bustype %s\n", str); - } -} - -static void __init MP_ioapic_info (struct mpc_config_ioapic *m) -{ - if (!(m->mpc_flags & MPC_APIC_USABLE)) - return; - - printk("I/O APIC #%d Version %d at 0x%X.\n", - m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr); - if (nr_ioapics >= MAX_IO_APICS) { - printk(KERN_ERR "Max # of I/O APICs (%d) exceeded (found %d).\n", - MAX_IO_APICS, nr_ioapics); - panic("Recompile kernel with bigger MAX_IO_APICS!.\n"); - } - if (!m->mpc_apicaddr) { - printk(KERN_ERR "WARNING: bogus zero I/O APIC address" - " found in MP table, skipping!\n"); - return; - } - mp_ioapics[nr_ioapics] = *m; - nr_ioapics++; -} - -static void __init MP_intsrc_info (struct mpc_config_intsrc *m) -{ - mp_irqs [mp_irq_entries] = *m; - Dprintk("Int: type %d, pol %d, trig %d, bus %d," - " IRQ %02x, APIC ID %x, APIC INT %02x\n", - m->mpc_irqtype, m->mpc_irqflag & 3, - (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus, - m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq); - if (++mp_irq_entries == MAX_IRQ_SOURCES) - panic("Max # of irq sources exceeded!!\n"); -} - -static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m) -{ - Dprintk("Lint: type %d, pol %d, trig %d, bus %d," - " IRQ %02x, APIC ID %x, APIC LINT %02x\n", - m->mpc_irqtype, m->mpc_irqflag & 3, - (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid, - m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint); - /* - * Well it seems all SMP boards in existence - * use ExtINT/LVT1 == LINT0 and - * NMI/LVT2 == LINT1 - the following check - * will show us if this assumptions is false. - * Until then we do not have to add baggage. - */ - if ((m->mpc_irqtype == mp_ExtINT) && - (m->mpc_destapiclint != 0)) - BUG(); - if ((m->mpc_irqtype == mp_NMI) && - (m->mpc_destapiclint != 1)) - BUG(); -} - -/* - * Read/parse the MPC - */ - -static int __init smp_read_mpc(struct mp_config_table *mpc) -{ - char str[16]; - int count=sizeof(*mpc); - unsigned char *mpt=((unsigned char *)mpc)+count; - - if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) { - printk("SMP mptable: bad signature [%c%c%c%c]!\n", - mpc->mpc_signature[0], - mpc->mpc_signature[1], - mpc->mpc_signature[2], - mpc->mpc_signature[3]); - return 0; - } - if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) { - printk("SMP mptable: checksum error!\n"); - return 0; - } - if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) { - printk(KERN_ERR "SMP mptable: bad table version (%d)!!\n", - mpc->mpc_spec); - return 0; - } - if (!mpc->mpc_lapic) { - printk(KERN_ERR "SMP mptable: null local APIC address!\n"); - return 0; - } - memcpy(str,mpc->mpc_oem,8); - str[8]=0; - printk(KERN_INFO "OEM ID: %s ",str); - - memcpy(str,mpc->mpc_productid,12); - str[12]=0; - printk(KERN_INFO "Product ID: %s ",str); - - printk(KERN_INFO "APIC at: 0x%X\n",mpc->mpc_lapic); - - /* save the local APIC address, it might be non-default */ - if (!acpi_lapic) - mp_lapic_addr = mpc->mpc_lapic; - - /* - * Now process the configuration blocks. - */ - while (count < mpc->mpc_length) { - switch(*mpt) { - case MP_PROCESSOR: - { - struct mpc_config_processor *m= - (struct mpc_config_processor *)mpt; - if (!acpi_lapic) - MP_processor_info(m); - mpt += sizeof(*m); - count += sizeof(*m); - break; - } - case MP_BUS: - { - struct mpc_config_bus *m= - (struct mpc_config_bus *)mpt; - MP_bus_info(m); - mpt += sizeof(*m); - count += sizeof(*m); - break; - } - case MP_IOAPIC: - { - struct mpc_config_ioapic *m= - (struct mpc_config_ioapic *)mpt; - MP_ioapic_info(m); - mpt+=sizeof(*m); - count+=sizeof(*m); - break; - } - case MP_INTSRC: - { - struct mpc_config_intsrc *m= - (struct mpc_config_intsrc *)mpt; - - MP_intsrc_info(m); - mpt+=sizeof(*m); - count+=sizeof(*m); - break; - } - case MP_LINTSRC: - { - struct mpc_config_lintsrc *m= - (struct mpc_config_lintsrc *)mpt; - MP_lintsrc_info(m); - mpt+=sizeof(*m); - count+=sizeof(*m); - break; - } - } - } - clustered_apic_check(); - if (!num_processors) - printk(KERN_ERR "SMP mptable: no processors registered!\n"); - return num_processors; -} - -static int __init ELCR_trigger(unsigned int irq) -{ - unsigned int port; - - port = 0x4d0 + (irq >> 3); - return (inb(port) >> (irq & 7)) & 1; -} - -static void __init construct_default_ioirq_mptable(int mpc_default_type) -{ - struct mpc_config_intsrc intsrc; - int i; - int ELCR_fallback = 0; - - intsrc.mpc_type = MP_INTSRC; - intsrc.mpc_irqflag = 0; /* conforming */ - intsrc.mpc_srcbus = 0; - intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid; - - intsrc.mpc_irqtype = mp_INT; - - /* - * If true, we have an ISA/PCI system with no IRQ entries - * in the MP table. To prevent the PCI interrupts from being set up - * incorrectly, we try to use the ELCR. The sanity check to see if - * there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can - * never be level sensitive, so we simply see if the ELCR agrees. - * If it does, we assume it's valid. - */ - if (mpc_default_type == 5) { - printk(KERN_INFO "ISA/PCI bus type with no IRQ information... falling back to ELCR\n"); - - if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13)) - printk(KERN_ERR "ELCR contains invalid data... not using ELCR\n"); - else { - printk(KERN_INFO "Using ELCR to identify PCI interrupts\n"); - ELCR_fallback = 1; - } - } - - for (i = 0; i < 16; i++) { - switch (mpc_default_type) { - case 2: - if (i == 0 || i == 13) - continue; /* IRQ0 & IRQ13 not connected */ - /* fall through */ - default: - if (i == 2) - continue; /* IRQ2 is never connected */ - } - - if (ELCR_fallback) { - /* - * If the ELCR indicates a level-sensitive interrupt, we - * copy that information over to the MP table in the - * irqflag field (level sensitive, active high polarity). - */ - if (ELCR_trigger(i)) - intsrc.mpc_irqflag = 13; - else - intsrc.mpc_irqflag = 0; - } - - intsrc.mpc_srcbusirq = i; - intsrc.mpc_dstirq = i ? i : 2; /* IRQ0 to INTIN2 */ - MP_intsrc_info(&intsrc); - } - - intsrc.mpc_irqtype = mp_ExtINT; - intsrc.mpc_srcbusirq = 0; - intsrc.mpc_dstirq = 0; /* 8259A to INTIN0 */ - MP_intsrc_info(&intsrc); -} - -static inline void __init construct_default_ISA_mptable(int mpc_default_type) -{ - struct mpc_config_processor processor; - struct mpc_config_bus bus; - struct mpc_config_ioapic ioapic; - struct mpc_config_lintsrc lintsrc; - int linttypes[2] = { mp_ExtINT, mp_NMI }; - int i; - - /* - * local APIC has default address - */ - mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; - - /* - * 2 CPUs, numbered 0 & 1. - */ - processor.mpc_type = MP_PROCESSOR; - /* Either an integrated APIC or a discrete 82489DX. */ - processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01; - processor.mpc_cpuflag = CPU_ENABLED; - processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) | - (boot_cpu_data.x86_model << 4) | - boot_cpu_data.x86_mask; - processor.mpc_featureflag = boot_cpu_data.x86_capability[0]; - processor.mpc_reserved[0] = 0; - processor.mpc_reserved[1] = 0; - for (i = 0; i < 2; i++) { - processor.mpc_apicid = i; - MP_processor_info(&processor); - } - - bus.mpc_type = MP_BUS; - bus.mpc_busid = 0; - switch (mpc_default_type) { - default: - printk(KERN_ERR "???\nUnknown standard configuration %d\n", - mpc_default_type); - /* fall through */ - case 1: - case 5: - memcpy(bus.mpc_bustype, "ISA ", 6); - break; - case 2: - case 6: - case 3: - memcpy(bus.mpc_bustype, "EISA ", 6); - break; - case 4: - case 7: - memcpy(bus.mpc_bustype, "MCA ", 6); - } - MP_bus_info(&bus); - if (mpc_default_type > 4) { - bus.mpc_busid = 1; - memcpy(bus.mpc_bustype, "PCI ", 6); - MP_bus_info(&bus); - } - - ioapic.mpc_type = MP_IOAPIC; - ioapic.mpc_apicid = 2; - ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01; - ioapic.mpc_flags = MPC_APIC_USABLE; - ioapic.mpc_apicaddr = 0xFEC00000; - MP_ioapic_info(&ioapic); - - /* - * We set up most of the low 16 IO-APIC pins according to MPS rules. - */ - construct_default_ioirq_mptable(mpc_default_type); - - lintsrc.mpc_type = MP_LINTSRC; - lintsrc.mpc_irqflag = 0; /* conforming */ - lintsrc.mpc_srcbusid = 0; - lintsrc.mpc_srcbusirq = 0; - lintsrc.mpc_destapic = MP_APIC_ALL; - for (i = 0; i < 2; i++) { - lintsrc.mpc_irqtype = linttypes[i]; - lintsrc.mpc_destapiclint = i; - MP_lintsrc_info(&lintsrc); - } -} - -static struct intel_mp_floating *mpf_found; - -/* - * Scan the memory blocks for an SMP configuration block. - */ -void __init get_smp_config (void) -{ - struct intel_mp_floating *mpf = mpf_found; - - /* - * ACPI may be used to obtain the entire SMP configuration or just to - * enumerate/configure processors (CONFIG_ACPI_BOOT). Note that - * ACPI supports both logical (e.g. Hyper-Threading) and physical - * processors, where MPS only supports physical. - */ - if (acpi_lapic && acpi_ioapic) { - printk(KERN_INFO "Using ACPI (MADT) for SMP configuration information\n"); - return; - } - else if (acpi_lapic) - printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n"); - - printk("Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification); - if (mpf->mpf_feature2 & (1<<7)) { - printk(KERN_INFO " IMCR and PIC compatibility mode.\n"); - pic_mode = 1; - } else { - printk(KERN_INFO " Virtual Wire compatibility mode.\n"); - pic_mode = 0; - } - - /* - * Now see if we need to read further. - */ - if (mpf->mpf_feature1 != 0) { - - printk(KERN_INFO "Default MP configuration #%d\n", mpf->mpf_feature1); - construct_default_ISA_mptable(mpf->mpf_feature1); - - } else if (mpf->mpf_physptr) { - - /* - * Read the physical hardware table. Anything here will - * override the defaults. - */ - if (!smp_read_mpc((void *)(unsigned long)mpf->mpf_physptr)) { - smp_found_config = 0; - printk(KERN_ERR "BIOS bug, MP table errors detected!...\n"); - printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n"); - return; - } - /* - * If there are no explicit MP IRQ entries, then we are - * broken. We set up most of the low 16 IO-APIC pins to - * ISA defaults and hope it will work. - */ - if (!mp_irq_entries) { - struct mpc_config_bus bus; - - printk(KERN_ERR "BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n"); - - bus.mpc_type = MP_BUS; - bus.mpc_busid = 0; - memcpy(bus.mpc_bustype, "ISA ", 6); - MP_bus_info(&bus); - - construct_default_ioirq_mptable(0); - } - - } else - BUG(); - - printk(KERN_INFO "Processors: %d\n", num_processors); - /* - * Only use the first configuration found. - */ -} - -static int __init smp_scan_config (unsigned long base, unsigned long length) -{ - extern void __bad_mpf_size(void); - unsigned int *bp = phys_to_virt(base); - struct intel_mp_floating *mpf; - - Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length); - if (sizeof(*mpf) != 16) - __bad_mpf_size(); - - while (length > 0) { - mpf = (struct intel_mp_floating *)bp; - if ((*bp == SMP_MAGIC_IDENT) && - (mpf->mpf_length == 1) && - !mpf_checksum((unsigned char *)bp, 16) && - ((mpf->mpf_specification == 1) - || (mpf->mpf_specification == 4)) ) { - - smp_found_config = 1; - reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE); - if (mpf->mpf_physptr) - reserve_bootmem_generic(mpf->mpf_physptr, PAGE_SIZE); - mpf_found = mpf; - return 1; - } - bp += 4; - length -= 16; - } - return 0; -} - -void __init find_intel_smp (void) -{ - unsigned int address; - - /* - * FIXME: Linux assumes you have 640K of base ram.. - * this continues the error... - * - * 1) Scan the bottom 1K for a signature - * 2) Scan the top 1K of base RAM - * 3) Scan the 64K of bios - */ - if (smp_scan_config(0x0,0x400) || - smp_scan_config(639*0x400,0x400) || - smp_scan_config(0xF0000,0x10000)) - return; - /* - * If it is an SMP machine we should know now, unless the - * configuration is in an EISA/MCA bus machine with an - * extended bios data area. - * - * there is a real-mode segmented pointer pointing to the - * 4K EBDA area at 0x40E, calculate and scan it here. - * - * NOTE! There are Linux loaders that will corrupt the EBDA - * area, and as such this kind of SMP config may be less - * trustworthy, simply because the SMP table may have been - * stomped on during early boot. These loaders are buggy and - * should be fixed. - */ - - address = *(unsigned short *)phys_to_virt(0x40E); - address <<= 4; - if (smp_scan_config(address, 0x1000)) - return; - - /* If we have come this far, we did not find an MP table */ - printk(KERN_INFO "No mptable found.\n"); -} - -/* - * - Intel MP Configuration Table - */ -void __init find_smp_config (void) -{ -#ifdef CONFIG_X86_LOCAL_APIC - find_intel_smp(); -#endif -} - - -/* -------------------------------------------------------------------------- - ACPI-based MP Configuration - -------------------------------------------------------------------------- */ - -#ifdef CONFIG_ACPI_BOOT - -void __init mp_register_lapic_address ( - u64 address) -{ - mp_lapic_addr = (unsigned long) address; - - set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr); - - if (boot_cpu_id == -1U) - boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID)); - - Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid); -} - - -void __init mp_register_lapic ( - u8 id, - u8 enabled) -{ - struct mpc_config_processor processor; - int boot_cpu = 0; - - if (id >= MAX_APICS) { - printk(KERN_WARNING "Processor #%d invalid (max %d)\n", - id, MAX_APICS); - return; - } - - if (id == boot_cpu_physical_apicid) - boot_cpu = 1; - - processor.mpc_type = MP_PROCESSOR; - processor.mpc_apicid = id; - processor.mpc_apicver = 0x10; /* TBD: lapic version */ - processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0); - processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0); - processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) | - (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask; - processor.mpc_featureflag = boot_cpu_data.x86_capability[0]; - processor.mpc_reserved[0] = 0; - processor.mpc_reserved[1] = 0; - - MP_processor_info(&processor); -} - -#ifdef CONFIG_X86_IO_APIC - -#define MP_ISA_BUS 0 -#define MP_MAX_IOAPIC_PIN 127 - -static struct mp_ioapic_routing { - int apic_id; - int gsi_start; - int gsi_end; - u32 pin_programmed[4]; -} mp_ioapic_routing[MAX_IO_APICS]; - - -static int mp_find_ioapic ( - int gsi) -{ - int i = 0; - - /* Find the IOAPIC that manages this GSI. */ - for (i = 0; i < nr_ioapics; i++) { - if ((gsi >= mp_ioapic_routing[i].gsi_start) - && (gsi <= mp_ioapic_routing[i].gsi_end)) - return i; - } - - printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi); - - return -1; -} - - -void __init mp_register_ioapic ( - u8 id, - u32 address, - u32 gsi_base) -{ - int idx = 0; - - if (nr_ioapics >= MAX_IO_APICS) { - printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded " - "(found %d)\n", MAX_IO_APICS, nr_ioapics); - panic("Recompile kernel with bigger MAX_IO_APICS!\n"); - } - if (!address) { - printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address" - " found in MADT table, skipping!\n"); - return; - } - - idx = nr_ioapics++; - - mp_ioapics[idx].mpc_type = MP_IOAPIC; - mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE; - mp_ioapics[idx].mpc_apicaddr = address; - - set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); - mp_ioapics[idx].mpc_apicid = id; - mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx); - - /* - * Build basic IRQ lookup table to facilitate gsi->io_apic lookups - * and to prevent reprogramming of IOAPIC pins (PCI IRQs). - */ - mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid; - mp_ioapic_routing[idx].gsi_start = gsi_base; - mp_ioapic_routing[idx].gsi_end = gsi_base + - io_apic_get_redir_entries(idx); - - printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, " - "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid, - mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr, - mp_ioapic_routing[idx].gsi_start, - mp_ioapic_routing[idx].gsi_end); - - return; -} - - -void __init mp_override_legacy_irq ( - u8 bus_irq, - u8 polarity, - u8 trigger, - u32 gsi) -{ - struct mpc_config_intsrc intsrc; - int ioapic = -1; - int pin = -1; - - /* - * Convert 'gsi' to 'ioapic.pin'. - */ - ioapic = mp_find_ioapic(gsi); - if (ioapic < 0) - return; - pin = gsi - mp_ioapic_routing[ioapic].gsi_start; - - /* - * TBD: This check is for faulty timer entries, where the override - * erroneously sets the trigger to level, resulting in a HUGE - * increase of timer interrupts! - */ - if ((bus_irq == 0) && (trigger == 3)) - trigger = 1; - - intsrc.mpc_type = MP_INTSRC; - intsrc.mpc_irqtype = mp_INT; - intsrc.mpc_irqflag = (trigger << 2) | polarity; - intsrc.mpc_srcbus = MP_ISA_BUS; - intsrc.mpc_srcbusirq = bus_irq; /* IRQ */ - intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */ - intsrc.mpc_dstirq = pin; /* INTIN# */ - - Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n", - intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3, - (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus, - intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq); - - mp_irqs[mp_irq_entries] = intsrc; - if (++mp_irq_entries == MAX_IRQ_SOURCES) - panic("Max # of irq sources exceeded!\n"); - - return; -} - - -void __init mp_config_acpi_legacy_irqs (void) -{ - struct mpc_config_intsrc intsrc; - int i = 0; - int ioapic = -1; - - /* - * Fabricate the legacy ISA bus (bus #31). - */ - mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA; - Dprintk("Bus #%d is ISA\n", MP_ISA_BUS); - - /* - * Locate the IOAPIC that manages the ISA IRQs (0-15). - */ - ioapic = mp_find_ioapic(0); - if (ioapic < 0) - return; - - intsrc.mpc_type = MP_INTSRC; - intsrc.mpc_irqflag = 0; /* Conforming */ - intsrc.mpc_srcbus = MP_ISA_BUS; - intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; - - /* - * Use the default configuration for the IRQs 0-15. Unless - * overridden by (MADT) interrupt source override entries. - */ - for (i = 0; i < 16; i++) { - int idx; - - for (idx = 0; idx < mp_irq_entries; idx++) { - struct mpc_config_intsrc *irq = mp_irqs + idx; - - /* Do we already have a mapping for this ISA IRQ? */ - if (irq->mpc_srcbus == MP_ISA_BUS && irq->mpc_srcbusirq == i) - break; - - /* Do we already have a mapping for this IOAPIC pin */ - if ((irq->mpc_dstapic == intsrc.mpc_dstapic) && - (irq->mpc_dstirq == i)) - break; - } - - if (idx != mp_irq_entries) { - printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i); - continue; /* IRQ already used */ - } - - intsrc.mpc_irqtype = mp_INT; - intsrc.mpc_srcbusirq = i; /* Identity mapped */ - intsrc.mpc_dstirq = i; - - Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, " - "%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3, - (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus, - intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, - intsrc.mpc_dstirq); - - mp_irqs[mp_irq_entries] = intsrc; - if (++mp_irq_entries == MAX_IRQ_SOURCES) - panic("Max # of irq sources exceeded!\n"); - } - - return; -} - -#define MAX_GSI_NUM 4096 - -int mp_register_gsi(u32 gsi, int edge_level, int active_high_low) -{ - int ioapic = -1; - int ioapic_pin = 0; - int idx, bit = 0; - static int pci_irq = 16; - /* - * Mapping between Global System Interrupts, which - * represent all possible interrupts, to the IRQs - * assigned to actual devices. - */ - static int gsi_to_irq[MAX_GSI_NUM]; - - if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) - return gsi; - -#ifdef CONFIG_ACPI_BUS - /* Don't set up the ACPI SCI because it's already set up */ - if (acpi_fadt.sci_int == gsi) - return gsi; -#endif - - ioapic = mp_find_ioapic(gsi); - if (ioapic < 0) { - printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi); - return gsi; - } - - ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_start; - - /* - * Avoid pin reprogramming. PRTs typically include entries - * with redundant pin->gsi mappings (but unique PCI devices); - * we only program the IOAPIC on the first. - */ - bit = ioapic_pin % 32; - idx = (ioapic_pin < 32) ? 0 : (ioapic_pin / 32); - if (idx > 3) { - printk(KERN_ERR "Invalid reference to IOAPIC pin " - "%d-%d\n", mp_ioapic_routing[ioapic].apic_id, - ioapic_pin); - return gsi; - } - if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) { - Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n", - mp_ioapic_routing[ioapic].apic_id, ioapic_pin); - return gsi_to_irq[gsi]; - } - - mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit); - - if (edge_level) { - /* - * For PCI devices assign IRQs in order, avoiding gaps - * due to unused I/O APIC pins. - */ - int irq = gsi; - gsi = pci_irq++; - gsi_to_irq[irq] = gsi; - } - - io_apic_set_pci_routing(ioapic, ioapic_pin, gsi, - edge_level == ACPI_EDGE_SENSITIVE ? 0 : 1, - active_high_low == ACPI_ACTIVE_HIGH ? 0 : 1); - return gsi; -} - -#endif /*CONFIG_X86_IO_APIC*/ -#endif /*CONFIG_ACPI_BOOT*/ diff --git a/arch/x86_64/kernel/msr.c b/arch/x86_64/kernel/msr.c deleted file mode 100644 index 598953ab015..00000000000 --- a/arch/x86_64/kernel/msr.c +++ /dev/null @@ -1,279 +0,0 @@ -/* ----------------------------------------------------------------------- * - * - * Copyright 2000 H. Peter Anvin - All Rights Reserved - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139, - * USA; either version 2 of the License, or (at your option) any later - * version; incorporated herein by reference. - * - * ----------------------------------------------------------------------- */ - -/* - * msr.c - * - * x86 MSR access device - * - * This device is accessed by lseek() to the appropriate register number - * and then read/write in chunks of 8 bytes. A larger size means multiple - * reads or writes of the same register. - * - * This driver uses /dev/cpu/%d/msr where %d is the minor number, and on - * an SMP box will direct the access to CPU %d. - */ - -#include <linux/module.h> -#include <linux/config.h> - -#include <linux/types.h> -#include <linux/errno.h> -#include <linux/fcntl.h> -#include <linux/init.h> -#include <linux/poll.h> -#include <linux/smp.h> -#include <linux/smp_lock.h> -#include <linux/major.h> -#include <linux/fs.h> - -#include <asm/processor.h> -#include <asm/msr.h> -#include <asm/uaccess.h> -#include <asm/system.h> - -/* Note: "err" is handled in a funny way below. Otherwise one version - of gcc or another breaks. */ - -static inline int wrmsr_eio(u32 reg, u32 eax, u32 edx) -{ - int err; - - asm volatile ("1: wrmsr\n" - "2:\n" - ".section .fixup,\"ax\"\n" - "3: movl %4,%0\n" - " jmp 2b\n" - ".previous\n" - ".section __ex_table,\"a\"\n" - " .align 8\n" " .quad 1b,3b\n" ".previous":"=&bDS" (err) - :"a"(eax), "d"(edx), "c"(reg), "i"(-EIO), "0"(0)); - - return err; -} - -static inline int rdmsr_eio(u32 reg, u32 *eax, u32 *edx) -{ - int err; - - asm volatile ("1: rdmsr\n" - "2:\n" - ".section .fixup,\"ax\"\n" - "3: movl %4,%0\n" - " jmp 2b\n" - ".previous\n" - ".section __ex_table,\"a\"\n" - " .align 8\n" - " .quad 1b,3b\n" - ".previous":"=&bDS" (err), "=a"(*eax), "=d"(*edx) - :"c"(reg), "i"(-EIO), "0"(0)); - - return err; -} - -#ifdef CONFIG_SMP - -struct msr_command { - int cpu; - int err; - u32 reg; - u32 data[2]; -}; - -static void msr_smp_wrmsr(void *cmd_block) -{ - struct msr_command *cmd = (struct msr_command *)cmd_block; - - if (cmd->cpu == smp_processor_id()) - cmd->err = wrmsr_eio(cmd->reg, cmd->data[0], cmd->data[1]); -} - -static void msr_smp_rdmsr(void *cmd_block) -{ - struct msr_command *cmd = (struct msr_command *)cmd_block; - - if (cmd->cpu == smp_processor_id()) - cmd->err = rdmsr_eio(cmd->reg, &cmd->data[0], &cmd->data[1]); -} - -static inline int do_wrmsr(int cpu, u32 reg, u32 eax, u32 edx) -{ - struct msr_command cmd; - int ret; - - preempt_disable(); - if (cpu == smp_processor_id()) { - ret = wrmsr_eio(reg, eax, edx); - } else { - cmd.cpu = cpu; - cmd.reg = reg; - cmd.data[0] = eax; - cmd.data[1] = edx; - - smp_call_function(msr_smp_wrmsr, &cmd, 1, 1); - ret = cmd.err; - } - preempt_enable(); - return ret; -} - -static inline int do_rdmsr(int cpu, u32 reg, u32 * eax, u32 * edx) -{ - struct msr_command cmd; - int ret; - - preempt_disable(); - if (cpu == smp_processor_id()) { - ret = rdmsr_eio(reg, eax, edx); - } else { - cmd.cpu = cpu; - cmd.reg = reg; - - smp_call_function(msr_smp_rdmsr, &cmd, 1, 1); - - *eax = cmd.data[0]; - *edx = cmd.data[1]; - - ret = cmd.err; - } - preempt_enable(); - return ret; -} - -#else /* ! CONFIG_SMP */ - -static inline int do_wrmsr(int cpu, u32 reg, u32 eax, u32 edx) -{ - return wrmsr_eio(reg, eax, edx); -} - -static inline int do_rdmsr(int cpu, u32 reg, u32 *eax, u32 *edx) -{ - return rdmsr_eio(reg, eax, edx); -} - -#endif /* ! CONFIG_SMP */ - -static loff_t msr_seek(struct file *file, loff_t offset, int orig) -{ - loff_t ret = -EINVAL; - - lock_kernel(); - switch (orig) { - case 0: - file->f_pos = offset; - ret = file->f_pos; - break; - case 1: - file->f_pos += offset; - ret = file->f_pos; - } - unlock_kernel(); - return ret; -} - -static ssize_t msr_read(struct file *file, char __user * buf, - size_t count, loff_t * ppos) -{ - u32 __user *tmp = (u32 __user *) buf; - u32 data[2]; - size_t rv; - u32 reg = *ppos; - int cpu = iminor(file->f_dentry->d_inode); - int err; - - if (count % 8) - return -EINVAL; /* Invalid chunk size */ - - for (rv = 0; count; count -= 8) { - err = do_rdmsr(cpu, reg, &data[0], &data[1]); - if (err) - return err; - if (copy_to_user(tmp, &data, 8)) - return -EFAULT; - tmp += 2; - } - - return ((char __user *)tmp) - buf; -} - -static ssize_t msr_write(struct file *file, const char __user *buf, - size_t count, loff_t *ppos) -{ - const u32 __user *tmp = (const u32 __user *)buf; - u32 data[2]; - size_t rv; - u32 reg = *ppos; - int cpu = iminor(file->f_dentry->d_inode); - int err; - - if (count % 8) - return -EINVAL; /* Invalid chunk size */ - - for (rv = 0; count; count -= 8) { - if (copy_from_user(&data, tmp, 8)) - return -EFAULT; - err = do_wrmsr(cpu, reg, data[0], data[1]); - if (err) - return err; - tmp += 2; - } - - return ((char __user *)tmp) - buf; -} - -static int msr_open(struct inode *inode, struct file *file) -{ - unsigned int cpu = iminor(file->f_dentry->d_inode); - struct cpuinfo_x86 *c = &(cpu_data)[cpu]; - - if (cpu >= NR_CPUS || !cpu_online(cpu)) - return -ENXIO; /* No such CPU */ - if (!cpu_has(c, X86_FEATURE_MSR)) - return -EIO; /* MSR not supported */ - - return 0; -} - -/* - * File operations we support - */ -static struct file_operations msr_fops = { - .owner = THIS_MODULE, - .llseek = msr_seek, - .read = msr_read, - .write = msr_write, - .open = msr_open, -}; - -static int __init msr_init(void) -{ - if (register_chrdev(MSR_MAJOR, "cpu/msr", &msr_fops)) { - printk(KERN_ERR "msr: unable to get major %d for msr\n", - MSR_MAJOR); - return -EBUSY; - } - - return 0; -} - -static void __exit msr_exit(void) -{ - unregister_chrdev(MSR_MAJOR, "cpu/msr"); -} - -module_init(msr_init); -module_exit(msr_exit) - -MODULE_AUTHOR("H. Peter Anvin <hpa@zytor.com>"); -MODULE_DESCRIPTION("x86 generic MSR driver"); -MODULE_LICENSE("GPL"); diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c deleted file mode 100644 index 4e44d6e6b7e..00000000000 --- a/arch/x86_64/kernel/nmi.c +++ /dev/null @@ -1,589 +0,0 @@ -/* - * linux/arch/x86_64/nmi.c - * - * NMI watchdog support on APIC systems - * - * Started by Ingo Molnar <mingo@redhat.com> - * - * Fixes: - * Mikael Pettersson : AMD K7 support for local APIC NMI watchdog. - * Mikael Pettersson : Power Management for local APIC NMI watchdog. - * Pavel Machek and - * Mikael Pettersson : PM converted to driver model. Disable/enable API. - */ - -#include <linux/config.h> -#include <linux/mm.h> -#include <linux/irq.h> -#include <linux/delay.h> -#include <linux/bootmem.h> -#include <linux/smp_lock.h> -#include <linux/interrupt.h> -#include <linux/mc146818rtc.h> -#include <linux/kernel_stat.h> -#include <linux/module.h> -#include <linux/sysdev.h> -#include <linux/nmi.h> -#include <linux/sysctl.h> - -#include <asm/smp.h> -#include <asm/mtrr.h> -#include <asm/mpspec.h> -#include <asm/nmi.h> -#include <asm/msr.h> -#include <asm/proto.h> -#include <asm/kdebug.h> -#include <asm/local.h> - -/* - * lapic_nmi_owner tracks the ownership of the lapic NMI hardware: - * - it may be reserved by some other driver, or not - * - when not reserved by some other driver, it may be used for - * the NMI watchdog, or not - * - * This is maintained separately from nmi_active because the NMI - * watchdog may also be driven from the I/O APIC timer. - */ -static DEFINE_SPINLOCK(lapic_nmi_owner_lock); -static unsigned int lapic_nmi_owner; -#define LAPIC_NMI_WATCHDOG (1<<0) -#define LAPIC_NMI_RESERVED (1<<1) - -/* nmi_active: - * +1: the lapic NMI watchdog is active, but can be disabled - * 0: the lapic NMI watchdog has not been set up, and cannot - * be enabled - * -1: the lapic NMI watchdog is disabled, but can be enabled - */ -int nmi_active; /* oprofile uses this */ -int panic_on_timeout; - -unsigned int nmi_watchdog = NMI_DEFAULT; -static unsigned int nmi_hz = HZ; -static unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */ -static unsigned int nmi_p4_cccr_val; - -/* Note that these events don't tick when the CPU idles. This means - the frequency varies with CPU load. */ - -#define K7_EVNTSEL_ENABLE (1 << 22) -#define K7_EVNTSEL_INT (1 << 20) -#define K7_EVNTSEL_OS (1 << 17) -#define K7_EVNTSEL_USR (1 << 16) -#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76 -#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING - -#define MSR_P4_MISC_ENABLE 0x1A0 -#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1<<7) -#define MSR_P4_MISC_ENABLE_PEBS_UNAVAIL (1<<12) -#define MSR_P4_PERFCTR0 0x300 -#define MSR_P4_CCCR0 0x360 -#define P4_ESCR_EVENT_SELECT(N) ((N)<<25) -#define P4_ESCR_OS (1<<3) -#define P4_ESCR_USR (1<<2) -#define P4_CCCR_OVF_PMI0 (1<<26) -#define P4_CCCR_OVF_PMI1 (1<<27) -#define P4_CCCR_THRESHOLD(N) ((N)<<20) -#define P4_CCCR_COMPLEMENT (1<<19) -#define P4_CCCR_COMPARE (1<<18) -#define P4_CCCR_REQUIRED (3<<16) -#define P4_CCCR_ESCR_SELECT(N) ((N)<<13) -#define P4_CCCR_ENABLE (1<<12) -/* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter - CRU_ESCR0 (with any non-null event selector) through a complemented - max threshold. [IA32-Vol3, Section 14.9.9] */ -#define MSR_P4_IQ_COUNTER0 0x30C -#define P4_NMI_CRU_ESCR0 (P4_ESCR_EVENT_SELECT(0x3F)|P4_ESCR_OS|P4_ESCR_USR) -#define P4_NMI_IQ_CCCR0 \ - (P4_CCCR_OVF_PMI0|P4_CCCR_THRESHOLD(15)|P4_CCCR_COMPLEMENT| \ - P4_CCCR_COMPARE|P4_CCCR_REQUIRED|P4_CCCR_ESCR_SELECT(4)|P4_CCCR_ENABLE) - -static __cpuinit inline int nmi_known_cpu(void) -{ - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_AMD: - return boot_cpu_data.x86 == 15; - case X86_VENDOR_INTEL: - return boot_cpu_data.x86 == 15; - } - return 0; -} - -/* Run after command line and cpu_init init, but before all other checks */ -void __cpuinit nmi_watchdog_default(void) -{ - if (nmi_watchdog != NMI_DEFAULT) - return; - if (nmi_known_cpu()) - nmi_watchdog = NMI_LOCAL_APIC; - else - nmi_watchdog = NMI_IO_APIC; -} - -#ifdef CONFIG_SMP -/* The performance counters used by NMI_LOCAL_APIC don't trigger when - * the CPU is idle. To make sure the NMI watchdog really ticks on all - * CPUs during the test make them busy. - */ -static __init void nmi_cpu_busy(void *data) -{ - volatile int *endflag = data; - local_irq_enable(); - /* Intentionally don't use cpu_relax here. This is - to make sure that the performance counter really ticks, - even if there is a simulator or similar that catches the - pause instruction. On a real HT machine this is fine because - all other CPUs are busy with "useless" delay loops and don't - care if they get somewhat less cycles. */ - while (*endflag == 0) - barrier(); -} -#endif - -int __init check_nmi_watchdog (void) -{ - volatile int endflag = 0; - int *counts; - int cpu; - - counts = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL); - if (!counts) - return -1; - - printk(KERN_INFO "testing NMI watchdog ... "); - - if (nmi_watchdog == NMI_LOCAL_APIC) - smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0); - - for (cpu = 0; cpu < NR_CPUS; cpu++) - counts[cpu] = cpu_pda[cpu].__nmi_count; - local_irq_enable(); - mdelay((10*1000)/nmi_hz); // wait 10 ticks - - for (cpu = 0; cpu < NR_CPUS; cpu++) { - if (!cpu_online(cpu)) - continue; - if (cpu_pda[cpu].__nmi_count - counts[cpu] <= 5) { - endflag = 1; - printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n", - cpu, - counts[cpu], - cpu_pda[cpu].__nmi_count); - nmi_active = 0; - lapic_nmi_owner &= ~LAPIC_NMI_WATCHDOG; - nmi_perfctr_msr = 0; - kfree(counts); - return -1; - } - } - endflag = 1; - printk("OK.\n"); - - /* now that we know it works we can reduce NMI frequency to - something more reasonable; makes a difference in some configs */ - if (nmi_watchdog == NMI_LOCAL_APIC) - nmi_hz = 1; - - kfree(counts); - return 0; -} - -int __init setup_nmi_watchdog(char *str) -{ - int nmi; - - if (!strncmp(str,"panic",5)) { - panic_on_timeout = 1; - str = strchr(str, ','); - if (!str) - return 1; - ++str; - } - - get_option(&str, &nmi); - - if (nmi >= NMI_INVALID) - return 0; - nmi_watchdog = nmi; - return 1; -} - -__setup("nmi_watchdog=", setup_nmi_watchdog); - -static void disable_lapic_nmi_watchdog(void) -{ - if (nmi_active <= 0) - return; - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_AMD: - wrmsr(MSR_K7_EVNTSEL0, 0, 0); - break; - case X86_VENDOR_INTEL: - if (boot_cpu_data.x86 == 15) { - wrmsr(MSR_P4_IQ_CCCR0, 0, 0); - wrmsr(MSR_P4_CRU_ESCR0, 0, 0); - } - break; - } - nmi_active = -1; - /* tell do_nmi() and others that we're not active any more */ - nmi_watchdog = 0; -} - -static void enable_lapic_nmi_watchdog(void) -{ - if (nmi_active < 0) { - nmi_watchdog = NMI_LOCAL_APIC; - setup_apic_nmi_watchdog(); - } -} - -int reserve_lapic_nmi(void) -{ - unsigned int old_owner; - - spin_lock(&lapic_nmi_owner_lock); - old_owner = lapic_nmi_owner; - lapic_nmi_owner |= LAPIC_NMI_RESERVED; - spin_unlock(&lapic_nmi_owner_lock); - if (old_owner & LAPIC_NMI_RESERVED) - return -EBUSY; - if (old_owner & LAPIC_NMI_WATCHDOG) - disable_lapic_nmi_watchdog(); - return 0; -} - -void release_lapic_nmi(void) -{ - unsigned int new_owner; - - spin_lock(&lapic_nmi_owner_lock); - new_owner = lapic_nmi_owner & ~LAPIC_NMI_RESERVED; - lapic_nmi_owner = new_owner; - spin_unlock(&lapic_nmi_owner_lock); - if (new_owner & LAPIC_NMI_WATCHDOG) - enable_lapic_nmi_watchdog(); -} - -void disable_timer_nmi_watchdog(void) -{ - if ((nmi_watchdog != NMI_IO_APIC) || (nmi_active <= 0)) - return; - - disable_irq(0); - unset_nmi_callback(); - nmi_active = -1; - nmi_watchdog = NMI_NONE; -} - -void enable_timer_nmi_watchdog(void) -{ - if (nmi_active < 0) { - nmi_watchdog = NMI_IO_APIC; - touch_nmi_watchdog(); - nmi_active = 1; - enable_irq(0); - } -} - -#ifdef CONFIG_PM - -static int nmi_pm_active; /* nmi_active before suspend */ - -static int lapic_nmi_suspend(struct sys_device *dev, u32 state) -{ - nmi_pm_active = nmi_active; - disable_lapic_nmi_watchdog(); - return 0; -} - -static int lapic_nmi_resume(struct sys_device *dev) -{ - if (nmi_pm_active > 0) - enable_lapic_nmi_watchdog(); - return 0; -} - -static struct sysdev_class nmi_sysclass = { - set_kset_name("lapic_nmi"), - .resume = lapic_nmi_resume, - .suspend = lapic_nmi_suspend, -}; - -static struct sys_device device_lapic_nmi = { - .id = 0, - .cls = &nmi_sysclass, -}; - -static int __init init_lapic_nmi_sysfs(void) -{ - int error; - - if (nmi_active == 0 || nmi_watchdog != NMI_LOCAL_APIC) - return 0; - - error = sysdev_class_register(&nmi_sysclass); - if (!error) - error = sysdev_register(&device_lapic_nmi); - return error; -} -/* must come after the local APIC's device_initcall() */ -late_initcall(init_lapic_nmi_sysfs); - -#endif /* CONFIG_PM */ - -/* - * Activate the NMI watchdog via the local APIC. - * Original code written by Keith Owens. - */ - -static void clear_msr_range(unsigned int base, unsigned int n) -{ - unsigned int i; - - for(i = 0; i < n; ++i) - wrmsr(base+i, 0, 0); -} - -static void setup_k7_watchdog(void) -{ - int i; - unsigned int evntsel; - - nmi_perfctr_msr = MSR_K7_PERFCTR0; - - for(i = 0; i < 4; ++i) { - /* Simulator may not support it */ - if (checking_wrmsrl(MSR_K7_EVNTSEL0+i, 0UL)) { - nmi_perfctr_msr = 0; - return; - } - wrmsrl(MSR_K7_PERFCTR0+i, 0UL); - } - - evntsel = K7_EVNTSEL_INT - | K7_EVNTSEL_OS - | K7_EVNTSEL_USR - | K7_NMI_EVENT; - - wrmsr(MSR_K7_EVNTSEL0, evntsel, 0); - wrmsr(MSR_K7_PERFCTR0, -(cpu_khz/nmi_hz*1000), -1); - apic_write(APIC_LVTPC, APIC_DM_NMI); - evntsel |= K7_EVNTSEL_ENABLE; - wrmsr(MSR_K7_EVNTSEL0, evntsel, 0); -} - - -static int setup_p4_watchdog(void) -{ - unsigned int misc_enable, dummy; - - rdmsr(MSR_P4_MISC_ENABLE, misc_enable, dummy); - if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL)) - return 0; - - nmi_perfctr_msr = MSR_P4_IQ_COUNTER0; - nmi_p4_cccr_val = P4_NMI_IQ_CCCR0; -#ifdef CONFIG_SMP - if (smp_num_siblings == 2) - nmi_p4_cccr_val |= P4_CCCR_OVF_PMI1; -#endif - - if (!(misc_enable & MSR_P4_MISC_ENABLE_PEBS_UNAVAIL)) - clear_msr_range(0x3F1, 2); - /* MSR 0x3F0 seems to have a default value of 0xFC00, but current - docs doesn't fully define it, so leave it alone for now. */ - if (boot_cpu_data.x86_model >= 0x3) { - /* MSR_P4_IQ_ESCR0/1 (0x3ba/0x3bb) removed */ - clear_msr_range(0x3A0, 26); - clear_msr_range(0x3BC, 3); - } else { - clear_msr_range(0x3A0, 31); - } - clear_msr_range(0x3C0, 6); - clear_msr_range(0x3C8, 6); - clear_msr_range(0x3E0, 2); - clear_msr_range(MSR_P4_CCCR0, 18); - clear_msr_range(MSR_P4_PERFCTR0, 18); - - wrmsr(MSR_P4_CRU_ESCR0, P4_NMI_CRU_ESCR0, 0); - wrmsr(MSR_P4_IQ_CCCR0, P4_NMI_IQ_CCCR0 & ~P4_CCCR_ENABLE, 0); - Dprintk("setting P4_IQ_COUNTER0 to 0x%08lx\n", -(cpu_khz/nmi_hz*1000)); - wrmsr(MSR_P4_IQ_COUNTER0, -(cpu_khz/nmi_hz*1000), -1); - apic_write(APIC_LVTPC, APIC_DM_NMI); - wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0); - return 1; -} - -void setup_apic_nmi_watchdog(void) -{ - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_AMD: - if (boot_cpu_data.x86 != 15) - return; - if (strstr(boot_cpu_data.x86_model_id, "Screwdriver")) - return; - setup_k7_watchdog(); - break; - case X86_VENDOR_INTEL: - if (boot_cpu_data.x86 != 15) - return; - if (!setup_p4_watchdog()) - return; - break; - - default: - return; - } - lapic_nmi_owner = LAPIC_NMI_WATCHDOG; - nmi_active = 1; -} - -/* - * the best way to detect whether a CPU has a 'hard lockup' problem - * is to check it's local APIC timer IRQ counts. If they are not - * changing then that CPU has some problem. - * - * as these watchdog NMI IRQs are generated on every CPU, we only - * have to check the current processor. - */ - -static DEFINE_PER_CPU(unsigned, last_irq_sum); -static DEFINE_PER_CPU(local_t, alert_counter); -static DEFINE_PER_CPU(int, nmi_touch); - -void touch_nmi_watchdog (void) -{ - int i; - - /* - * Tell other CPUs to reset their alert counters. We cannot - * do it ourselves because the alert count increase is not - * atomic. - */ - for (i = 0; i < NR_CPUS; i++) - per_cpu(nmi_touch, i) = 1; -} - -void nmi_watchdog_tick (struct pt_regs * regs, unsigned reason) -{ - int sum; - int touched = 0; - - sum = read_pda(apic_timer_irqs); - if (__get_cpu_var(nmi_touch)) { - __get_cpu_var(nmi_touch) = 0; - touched = 1; - } - if (!touched && __get_cpu_var(last_irq_sum) == sum) { - /* - * Ayiee, looks like this CPU is stuck ... - * wait a few IRQs (5 seconds) before doing the oops ... - */ - local_inc(&__get_cpu_var(alert_counter)); - if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz) { - if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) - == NOTIFY_STOP) { - local_set(&__get_cpu_var(alert_counter), 0); - return; - } - die_nmi("NMI Watchdog detected LOCKUP on CPU%d", regs); - } - } else { - __get_cpu_var(last_irq_sum) = sum; - local_set(&__get_cpu_var(alert_counter), 0); - } - if (nmi_perfctr_msr) { - if (nmi_perfctr_msr == MSR_P4_IQ_COUNTER0) { - /* - * P4 quirks: - * - An overflown perfctr will assert its interrupt - * until the OVF flag in its CCCR is cleared. - * - LVTPC is masked on interrupt and must be - * unmasked by the LVTPC handler. - */ - wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0); - apic_write(APIC_LVTPC, APIC_DM_NMI); - } - wrmsr(nmi_perfctr_msr, -(cpu_khz/nmi_hz*1000), -1); - } -} - -static int dummy_nmi_callback(struct pt_regs * regs, int cpu) -{ - return 0; -} - -static nmi_callback_t nmi_callback = dummy_nmi_callback; - -asmlinkage void do_nmi(struct pt_regs * regs, long error_code) -{ - int cpu = safe_smp_processor_id(); - - nmi_enter(); - add_pda(__nmi_count,1); - if (!nmi_callback(regs, cpu)) - default_do_nmi(regs); - nmi_exit(); -} - -void set_nmi_callback(nmi_callback_t callback) -{ - nmi_callback = callback; -} - -void unset_nmi_callback(void) -{ - nmi_callback = dummy_nmi_callback; -} - -#ifdef CONFIG_SYSCTL - -static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) -{ - unsigned char reason = get_nmi_reason(); - char buf[64]; - - if (!(reason & 0xc0)) { - sprintf(buf, "NMI received for unknown reason %02x\n", reason); - die_nmi(buf,regs); - } - return 0; -} - -/* - * proc handler for /proc/sys/kernel/unknown_nmi_panic - */ -int proc_unknown_nmi_panic(struct ctl_table *table, int write, struct file *file, - void __user *buffer, size_t *length, loff_t *ppos) -{ - int old_state; - - old_state = unknown_nmi_panic; - proc_dointvec(table, write, file, buffer, length, ppos); - if (!!old_state == !!unknown_nmi_panic) - return 0; - - if (unknown_nmi_panic) { - if (reserve_lapic_nmi() < 0) { - unknown_nmi_panic = 0; - return -EBUSY; - } else { - set_nmi_callback(unknown_nmi_panic_callback); - } - } else { - release_lapic_nmi(); - unset_nmi_callback(); - } - return 0; -} - -#endif - -EXPORT_SYMBOL(nmi_active); -EXPORT_SYMBOL(nmi_watchdog); -EXPORT_SYMBOL(reserve_lapic_nmi); -EXPORT_SYMBOL(release_lapic_nmi); -EXPORT_SYMBOL(disable_timer_nmi_watchdog); -EXPORT_SYMBOL(enable_timer_nmi_watchdog); -EXPORT_SYMBOL(touch_nmi_watchdog); diff --git a/arch/x86_64/kernel/pci-dma.c b/arch/x86_64/kernel/pci-dma.c deleted file mode 100644 index cab471cf3ed..00000000000 --- a/arch/x86_64/kernel/pci-dma.c +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Dynamic DMA mapping support. - */ - -#include <linux/types.h> -#include <linux/mm.h> -#include <linux/string.h> -#include <linux/pci.h> -#include <linux/module.h> -#include <asm/io.h> - -/* Map a set of buffers described by scatterlist in streaming - * mode for DMA. This is the scatter-gather version of the - * above pci_map_single interface. Here the scatter gather list - * elements are each tagged with the appropriate dma address - * and length. They are obtained via sg_dma_{address,length}(SG). - * - * NOTE: An implementation may be able to use a smaller number of - * DMA address/length pairs than there are SG table elements. - * (for example via virtual mapping capabilities) - * The routine returns the number of addr/length pairs actually - * used, at most nents. - * - * Device ownership issues as mentioned above for pci_map_single are - * the same here. - */ -int dma_map_sg(struct device *hwdev, struct scatterlist *sg, - int nents, int direction) -{ - int i; - - BUG_ON(direction == DMA_NONE); - for (i = 0; i < nents; i++ ) { - struct scatterlist *s = &sg[i]; - BUG_ON(!s->page); - s->dma_address = virt_to_bus(page_address(s->page) +s->offset); - s->dma_length = s->length; - } - return nents; -} - -EXPORT_SYMBOL(dma_map_sg); - -/* Unmap a set of streaming mode DMA translations. - * Again, cpu read rules concerning calls here are the same as for - * pci_unmap_single() above. - */ -void dma_unmap_sg(struct device *dev, struct scatterlist *sg, - int nents, int dir) -{ - int i; - for (i = 0; i < nents; i++) { - struct scatterlist *s = &sg[i]; - BUG_ON(s->page == NULL); - BUG_ON(s->dma_address == 0); - dma_unmap_single(dev, s->dma_address, s->dma_length, dir); - } -} - -EXPORT_SYMBOL(dma_unmap_sg); diff --git a/arch/x86_64/kernel/pci-gart.c b/arch/x86_64/kernel/pci-gart.c deleted file mode 100644 index 57f35c68aa3..00000000000 --- a/arch/x86_64/kernel/pci-gart.c +++ /dev/null @@ -1,980 +0,0 @@ -/* - * Dynamic DMA mapping support for AMD Hammer. - * - * Use the integrated AGP GART in the Hammer northbridge as an IOMMU for PCI. - * This allows to use PCI devices that only support 32bit addresses on systems - * with more than 4GB. - * - * See Documentation/DMA-mapping.txt for the interface specification. - * - * Copyright 2002 Andi Kleen, SuSE Labs. - */ - -#include <linux/config.h> -#include <linux/types.h> -#include <linux/ctype.h> -#include <linux/agp_backend.h> -#include <linux/init.h> -#include <linux/mm.h> -#include <linux/string.h> -#include <linux/spinlock.h> -#include <linux/pci.h> -#include <linux/module.h> -#include <linux/topology.h> -#include <linux/interrupt.h> -#include <linux/bitops.h> -#include <asm/atomic.h> -#include <asm/io.h> -#include <asm/mtrr.h> -#include <asm/pgtable.h> -#include <asm/proto.h> -#include <asm/cacheflush.h> -#include <asm/kdebug.h> - -dma_addr_t bad_dma_address; - -unsigned long iommu_bus_base; /* GART remapping area (physical) */ -static unsigned long iommu_size; /* size of remapping area bytes */ -static unsigned long iommu_pages; /* .. and in pages */ - -u32 *iommu_gatt_base; /* Remapping table */ - -int no_iommu; -static int no_agp; -#ifdef CONFIG_IOMMU_DEBUG -int panic_on_overflow = 1; -int force_iommu = 1; -#else -int panic_on_overflow = 0; -int force_iommu = 0; -#endif -int iommu_merge = 1; -int iommu_sac_force = 0; - -/* If this is disabled the IOMMU will use an optimized flushing strategy - of only flushing when an mapping is reused. With it true the GART is flushed - for every mapping. Problem is that doing the lazy flush seems to trigger - bugs with some popular PCI cards, in particular 3ware (but has been also - also seen with Qlogic at least). */ -int iommu_fullflush = 1; - -/* This tells the BIO block layer to assume merging. Default to off - because we cannot guarantee merging later. */ -int iommu_bio_merge = 0; - -#define MAX_NB 8 - -/* Allocation bitmap for the remapping area */ -static DEFINE_SPINLOCK(iommu_bitmap_lock); -static unsigned long *iommu_gart_bitmap; /* guarded by iommu_bitmap_lock */ - -static u32 gart_unmapped_entry; - -#define GPTE_VALID 1 -#define GPTE_COHERENT 2 -#define GPTE_ENCODE(x) \ - (((x) & 0xfffff000) | (((x) >> 32) << 4) | GPTE_VALID | GPTE_COHERENT) -#define GPTE_DECODE(x) (((x) & 0xfffff000) | (((u64)(x) & 0xff0) << 28)) - -#define to_pages(addr,size) \ - (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT) - -#define for_all_nb(dev) \ - dev = NULL; \ - while ((dev = pci_get_device(PCI_VENDOR_ID_AMD, 0x1103, dev))!=NULL)\ - if (dev->bus->number == 0 && \ - (PCI_SLOT(dev->devfn) >= 24) && (PCI_SLOT(dev->devfn) <= 31)) - -static struct pci_dev *northbridges[MAX_NB]; -static u32 northbridge_flush_word[MAX_NB]; - -#define EMERGENCY_PAGES 32 /* = 128KB */ - -#ifdef CONFIG_AGP -#define AGPEXTERN extern -#else -#define AGPEXTERN -#endif - -/* backdoor interface to AGP driver */ -AGPEXTERN int agp_memory_reserved; -AGPEXTERN __u32 *agp_gatt_table; - -static unsigned long next_bit; /* protected by iommu_bitmap_lock */ -static int need_flush; /* global flush state. set for each gart wrap */ -static dma_addr_t dma_map_area(struct device *dev, unsigned long phys_mem, - size_t size, int dir, int do_panic); - -/* Dummy device used for NULL arguments (normally ISA). Better would - be probably a smaller DMA mask, but this is bug-to-bug compatible to i386. */ -static struct device fallback_dev = { - .bus_id = "fallback device", - .coherent_dma_mask = 0xffffffff, - .dma_mask = &fallback_dev.coherent_dma_mask, -}; - -static unsigned long alloc_iommu(int size) -{ - unsigned long offset, flags; - - spin_lock_irqsave(&iommu_bitmap_lock, flags); - offset = find_next_zero_string(iommu_gart_bitmap,next_bit,iommu_pages,size); - if (offset == -1) { - need_flush = 1; - offset = find_next_zero_string(iommu_gart_bitmap,0,next_bit,size); - } - if (offset != -1) { - set_bit_string(iommu_gart_bitmap, offset, size); - next_bit = offset+size; - if (next_bit >= iommu_pages) { - next_bit = 0; - need_flush = 1; - } - } - if (iommu_fullflush) - need_flush = 1; - spin_unlock_irqrestore(&iommu_bitmap_lock, flags); - return offset; -} - -static void free_iommu(unsigned long offset, int size) -{ - unsigned long flags; - if (size == 1) { - clear_bit(offset, iommu_gart_bitmap); - return; - } - spin_lock_irqsave(&iommu_bitmap_lock, flags); - __clear_bit_string(iommu_gart_bitmap, offset, size); - spin_unlock_irqrestore(&iommu_bitmap_lock, flags); -} - -/* - * Use global flush state to avoid races with multiple flushers. - */ -static void flush_gart(struct device *dev) -{ - unsigned long flags; - int flushed = 0; - int i, max; - - spin_lock_irqsave(&iommu_bitmap_lock, flags); - if (need_flush) { - max = 0; - for (i = 0; i < MAX_NB; i++) { - if (!northbridges[i]) - continue; - pci_write_config_dword(northbridges[i], 0x9c, - northbridge_flush_word[i] | 1); - flushed++; - max = i; - } - for (i = 0; i <= max; i++) { - u32 w; - if (!northbridges[i]) - continue; - /* Make sure the hardware actually executed the flush. */ - do { - pci_read_config_dword(northbridges[i], 0x9c, &w); - } while (w & 1); - } - if (!flushed) - printk("nothing to flush?\n"); - need_flush = 0; - } - spin_unlock_irqrestore(&iommu_bitmap_lock, flags); -} - -/* Allocate DMA memory on node near device */ -noinline -static void *dma_alloc_pages(struct device *dev, unsigned gfp, unsigned order) -{ - struct page *page; - int node; - if (dev->bus == &pci_bus_type) { - cpumask_t mask; - mask = pcibus_to_cpumask(to_pci_dev(dev)->bus); - node = cpu_to_node(first_cpu(mask)); - } else - node = numa_node_id(); - page = alloc_pages_node(node, gfp, order); - return page ? page_address(page) : NULL; -} - -/* - * Allocate memory for a coherent mapping. - */ -void * -dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, - unsigned gfp) -{ - void *memory; - unsigned long dma_mask = 0; - u64 bus; - - if (!dev) - dev = &fallback_dev; - dma_mask = dev->coherent_dma_mask; - if (dma_mask == 0) - dma_mask = 0xffffffff; - - /* Kludge to make it bug-to-bug compatible with i386. i386 - uses the normal dma_mask for alloc_coherent. */ - dma_mask &= *dev->dma_mask; - - again: - memory = dma_alloc_pages(dev, gfp, get_order(size)); - if (memory == NULL) - return NULL; - - { - int high, mmu; - bus = virt_to_bus(memory); - high = (bus + size) >= dma_mask; - mmu = high; - if (force_iommu && !(gfp & GFP_DMA)) - mmu = 1; - if (no_iommu || dma_mask < 0xffffffffUL) { - if (high) { - free_pages((unsigned long)memory, - get_order(size)); - - if (swiotlb) { - return - swiotlb_alloc_coherent(dev, size, - dma_handle, - gfp); - } - - if (!(gfp & GFP_DMA)) { - gfp |= GFP_DMA; - goto again; - } - return NULL; - } - mmu = 0; - } - memset(memory, 0, size); - if (!mmu) { - *dma_handle = virt_to_bus(memory); - return memory; - } - } - - *dma_handle = dma_map_area(dev, bus, size, PCI_DMA_BIDIRECTIONAL, 0); - if (*dma_handle == bad_dma_address) - goto error; - flush_gart(dev); - return memory; - -error: - if (panic_on_overflow) - panic("dma_alloc_coherent: IOMMU overflow by %lu bytes\n", size); - free_pages((unsigned long)memory, get_order(size)); - return NULL; -} - -/* - * Unmap coherent memory. - * The caller must ensure that the device has finished accessing the mapping. - */ -void dma_free_coherent(struct device *dev, size_t size, - void *vaddr, dma_addr_t bus) -{ - if (swiotlb) { - swiotlb_free_coherent(dev, size, vaddr, bus); - return; - } - - dma_unmap_single(dev, bus, size, 0); - free_pages((unsigned long)vaddr, get_order(size)); -} - -#ifdef CONFIG_IOMMU_LEAK - -#define SET_LEAK(x) if (iommu_leak_tab) \ - iommu_leak_tab[x] = __builtin_return_address(0); -#define CLEAR_LEAK(x) if (iommu_leak_tab) \ - iommu_leak_tab[x] = NULL; - -/* Debugging aid for drivers that don't free their IOMMU tables */ -static void **iommu_leak_tab; -static int leak_trace; -int iommu_leak_pages = 20; -void dump_leak(void) -{ - int i; - static int dump; - if (dump || !iommu_leak_tab) return; - dump = 1; - show_stack(NULL,NULL); - /* Very crude. dump some from the end of the table too */ - printk("Dumping %d pages from end of IOMMU:\n", iommu_leak_pages); - for (i = 0; i < iommu_leak_pages; i+=2) { - printk("%lu: ", iommu_pages-i); - printk_address((unsigned long) iommu_leak_tab[iommu_pages-i]); - printk("%c", (i+1)%2 == 0 ? '\n' : ' '); - } - printk("\n"); -} -#else -#define SET_LEAK(x) -#define CLEAR_LEAK(x) -#endif - -static void iommu_full(struct device *dev, size_t size, int dir, int do_panic) -{ - /* - * Ran out of IOMMU space for this operation. This is very bad. - * Unfortunately the drivers cannot handle this operation properly. - * Return some non mapped prereserved space in the aperture and - * let the Northbridge deal with it. This will result in garbage - * in the IO operation. When the size exceeds the prereserved space - * memory corruption will occur or random memory will be DMAed - * out. Hopefully no network devices use single mappings that big. - */ - - printk(KERN_ERR - "PCI-DMA: Out of IOMMU space for %lu bytes at device %s\n", - size, dev->bus_id); - - if (size > PAGE_SIZE*EMERGENCY_PAGES && do_panic) { - if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL) - panic("PCI-DMA: Memory would be corrupted\n"); - if (dir == PCI_DMA_TODEVICE || dir == PCI_DMA_BIDIRECTIONAL) - panic("PCI-DMA: Random memory would be DMAed\n"); - } - -#ifdef CONFIG_IOMMU_LEAK - dump_leak(); -#endif -} - -static inline int need_iommu(struct device *dev, unsigned long addr, size_t size) -{ - u64 mask = *dev->dma_mask; - int high = addr + size >= mask; - int mmu = high; - if (force_iommu) - mmu = 1; - if (no_iommu) { - if (high) - panic("PCI-DMA: high address but no IOMMU.\n"); - mmu = 0; - } - return mmu; -} - -static inline int nonforced_iommu(struct device *dev, unsigned long addr, size_t size) -{ - u64 mask = *dev->dma_mask; - int high = addr + size >= mask; - int mmu = high; - if (no_iommu) { - if (high) - panic("PCI-DMA: high address but no IOMMU.\n"); - mmu = 0; - } - return mmu; -} - -/* Map a single continuous physical area into the IOMMU. - * Caller needs to check if the iommu is needed and flush. - */ -static dma_addr_t dma_map_area(struct device *dev, unsigned long phys_mem, - size_t size, int dir, int do_panic) -{ - unsigned long npages = to_pages(phys_mem, size); - unsigned long iommu_page = alloc_iommu(npages); - int i; - if (iommu_page == -1) { - if (!nonforced_iommu(dev, phys_mem, size)) - return phys_mem; - if (panic_on_overflow) - panic("dma_map_area overflow %lu bytes\n", size); - iommu_full(dev, size, dir, do_panic); - return bad_dma_address; - } - - for (i = 0; i < npages; i++) { - iommu_gatt_base[iommu_page + i] = GPTE_ENCODE(phys_mem); - SET_LEAK(iommu_page + i); - phys_mem += PAGE_SIZE; - } - return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK); -} - -/* Map a single area into the IOMMU */ -dma_addr_t dma_map_single(struct device *dev, void *addr, size_t size, int dir) -{ - unsigned long phys_mem, bus; - - BUG_ON(dir == DMA_NONE); - - if (swiotlb) - return swiotlb_map_single(dev,addr,size,dir); - if (!dev) - dev = &fallback_dev; - - phys_mem = virt_to_phys(addr); - if (!need_iommu(dev, phys_mem, size)) - return phys_mem; - - bus = dma_map_area(dev, phys_mem, size, dir, 1); - flush_gart(dev); - return bus; -} - -/* Fallback for dma_map_sg in case of overflow */ -static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg, - int nents, int dir) -{ - int i; - -#ifdef CONFIG_IOMMU_DEBUG - printk(KERN_DEBUG "dma_map_sg overflow\n"); -#endif - - for (i = 0; i < nents; i++ ) { - struct scatterlist *s = &sg[i]; - unsigned long addr = page_to_phys(s->page) + s->offset; - if (nonforced_iommu(dev, addr, s->length)) { - addr = dma_map_area(dev, addr, s->length, dir, 0); - if (addr == bad_dma_address) { - if (i > 0) - dma_unmap_sg(dev, sg, i, dir); - nents = 0; - sg[0].dma_length = 0; - break; - } - } - s->dma_address = addr; - s->dma_length = s->length; - } - flush_gart(dev); - return nents; -} - -/* Map multiple scatterlist entries continuous into the first. */ -static int __dma_map_cont(struct scatterlist *sg, int start, int stopat, - struct scatterlist *sout, unsigned long pages) -{ - unsigned long iommu_start = alloc_iommu(pages); - unsigned long iommu_page = iommu_start; - int i; - - if (iommu_start == -1) - return -1; - - for (i = start; i < stopat; i++) { - struct scatterlist *s = &sg[i]; - unsigned long pages, addr; - unsigned long phys_addr = s->dma_address; - - BUG_ON(i > start && s->offset); - if (i == start) { - *sout = *s; - sout->dma_address = iommu_bus_base; - sout->dma_address += iommu_page*PAGE_SIZE + s->offset; - sout->dma_length = s->length; - } else { - sout->dma_length += s->length; - } - - addr = phys_addr; - pages = to_pages(s->offset, s->length); - while (pages--) { - iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr); - SET_LEAK(iommu_page); - addr += PAGE_SIZE; - iommu_page++; - } - } - BUG_ON(iommu_page - iommu_start != pages); - return 0; -} - -static inline int dma_map_cont(struct scatterlist *sg, int start, int stopat, - struct scatterlist *sout, - unsigned long pages, int need) -{ - if (!need) { - BUG_ON(stopat - start != 1); - *sout = sg[start]; - sout->dma_length = sg[start].length; - return 0; - } - return __dma_map_cont(sg, start, stopat, sout, pages); -} - -/* - * DMA map all entries in a scatterlist. - * Merge chunks that have page aligned sizes into a continuous mapping. - */ -int dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) -{ - int i; - int out; - int start; - unsigned long pages = 0; - int need = 0, nextneed; - - BUG_ON(dir == DMA_NONE); - if (nents == 0) - return 0; - - if (swiotlb) - return swiotlb_map_sg(dev,sg,nents,dir); - if (!dev) - dev = &fallback_dev; - - out = 0; - start = 0; - for (i = 0; i < nents; i++) { - struct scatterlist *s = &sg[i]; - dma_addr_t addr = page_to_phys(s->page) + s->offset; - s->dma_address = addr; - BUG_ON(s->length == 0); - - nextneed = need_iommu(dev, addr, s->length); - - /* Handle the previous not yet processed entries */ - if (i > start) { - struct scatterlist *ps = &sg[i-1]; - /* Can only merge when the last chunk ends on a page - boundary and the new one doesn't have an offset. */ - if (!iommu_merge || !nextneed || !need || s->offset || - (ps->offset + ps->length) % PAGE_SIZE) { - if (dma_map_cont(sg, start, i, sg+out, pages, - need) < 0) - goto error; - out++; - pages = 0; - start = i; - } - } - - need = nextneed; - pages += to_pages(s->offset, s->length); - } - if (dma_map_cont(sg, start, i, sg+out, pages, need) < 0) - goto error; - out++; - flush_gart(dev); - if (out < nents) - sg[out].dma_length = 0; - return out; - -error: - flush_gart(NULL); - dma_unmap_sg(dev, sg, nents, dir); - /* When it was forced try again unforced */ - if (force_iommu) - return dma_map_sg_nonforce(dev, sg, nents, dir); - if (panic_on_overflow) - panic("dma_map_sg: overflow on %lu pages\n", pages); - iommu_full(dev, pages << PAGE_SHIFT, dir, 0); - for (i = 0; i < nents; i++) - sg[i].dma_address = bad_dma_address; - return 0; -} - -/* - * Free a DMA mapping. - */ -void dma_unmap_single(struct device *dev, dma_addr_t dma_addr, - size_t size, int direction) -{ - unsigned long iommu_page; - int npages; - int i; - - if (swiotlb) { - swiotlb_unmap_single(dev,dma_addr,size,direction); - return; - } - - if (dma_addr < iommu_bus_base + EMERGENCY_PAGES*PAGE_SIZE || - dma_addr >= iommu_bus_base + iommu_size) - return; - iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT; - npages = to_pages(dma_addr, size); - for (i = 0; i < npages; i++) { - iommu_gatt_base[iommu_page + i] = gart_unmapped_entry; - CLEAR_LEAK(iommu_page + i); - } - free_iommu(iommu_page, npages); -} - -/* - * Wrapper for pci_unmap_single working with scatterlists. - */ -void dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) -{ - int i; - if (swiotlb) { - swiotlb_unmap_sg(dev,sg,nents,dir); - return; - } - for (i = 0; i < nents; i++) { - struct scatterlist *s = &sg[i]; - if (!s->dma_length || !s->length) - break; - dma_unmap_single(dev, s->dma_address, s->dma_length, dir); - } -} - -int dma_supported(struct device *dev, u64 mask) -{ - /* Copied from i386. Doesn't make much sense, because it will - only work for pci_alloc_coherent. - The caller just has to use GFP_DMA in this case. */ - if (mask < 0x00ffffff) - return 0; - - /* Tell the device to use SAC when IOMMU force is on. - This allows the driver to use cheaper accesses in some cases. - - Problem with this is that if we overflow the IOMMU area - and return DAC as fallback address the device may not handle it correctly. - - As a special case some controllers have a 39bit address mode - that is as efficient as 32bit (aic79xx). Don't force SAC for these. - Assume all masks <= 40 bits are of this type. Normally this doesn't - make any difference, but gives more gentle handling of IOMMU overflow. */ - if (iommu_sac_force && (mask >= 0xffffffffffULL)) { - printk(KERN_INFO "%s: Force SAC with mask %Lx\n", dev->bus_id,mask); - return 0; - } - - return 1; -} - -int dma_get_cache_alignment(void) -{ - return boot_cpu_data.x86_clflush_size; -} - -EXPORT_SYMBOL(dma_unmap_sg); -EXPORT_SYMBOL(dma_map_sg); -EXPORT_SYMBOL(dma_map_single); -EXPORT_SYMBOL(dma_unmap_single); -EXPORT_SYMBOL(dma_supported); -EXPORT_SYMBOL(no_iommu); -EXPORT_SYMBOL(force_iommu); -EXPORT_SYMBOL(bad_dma_address); -EXPORT_SYMBOL(iommu_bio_merge); -EXPORT_SYMBOL(iommu_sac_force); -EXPORT_SYMBOL(dma_get_cache_alignment); -EXPORT_SYMBOL(dma_alloc_coherent); -EXPORT_SYMBOL(dma_free_coherent); - -static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size) -{ - unsigned long a; - if (!iommu_size) { - iommu_size = aper_size; - if (!no_agp) - iommu_size /= 2; - } - - a = aper + iommu_size; - iommu_size -= round_up(a, LARGE_PAGE_SIZE) - a; - - if (iommu_size < 64*1024*1024) - printk(KERN_WARNING - "PCI-DMA: Warning: Small IOMMU %luMB. Consider increasing the AGP aperture in BIOS\n",iommu_size>>20); - - return iommu_size; -} - -static __init unsigned read_aperture(struct pci_dev *dev, u32 *size) -{ - unsigned aper_size = 0, aper_base_32; - u64 aper_base; - unsigned aper_order; - - pci_read_config_dword(dev, 0x94, &aper_base_32); - pci_read_config_dword(dev, 0x90, &aper_order); - aper_order = (aper_order >> 1) & 7; - - aper_base = aper_base_32 & 0x7fff; - aper_base <<= 25; - - aper_size = (32 * 1024 * 1024) << aper_order; - if (aper_base + aper_size >= 0xffffffff || !aper_size) - aper_base = 0; - - *size = aper_size; - return aper_base; -} - -/* - * Private Northbridge GATT initialization in case we cannot use the - * AGP driver for some reason. - */ -static __init int init_k8_gatt(struct agp_kern_info *info) -{ - struct pci_dev *dev; - void *gatt; - unsigned aper_base, new_aper_base; - unsigned aper_size, gatt_size, new_aper_size; - - printk(KERN_INFO "PCI-DMA: Disabling AGP.\n"); - aper_size = aper_base = info->aper_size = 0; - for_all_nb(dev) { - new_aper_base = read_aperture(dev, &new_aper_size); - if (!new_aper_base) - goto nommu; - - if (!aper_base) { - aper_size = new_aper_size; - aper_base = new_aper_base; - } - if (aper_size != new_aper_size || aper_base != new_aper_base) - goto nommu; - } - if (!aper_base) - goto nommu; - info->aper_base = aper_base; - info->aper_size = aper_size>>20; - - gatt_size = (aper_size >> PAGE_SHIFT) * sizeof(u32); - gatt = (void *)__get_free_pages(GFP_KERNEL, get_order(gatt_size)); - if (!gatt) - panic("Cannot allocate GATT table"); - memset(gatt, 0, gatt_size); - agp_gatt_table = gatt; - - for_all_nb(dev) { - u32 ctl; - u32 gatt_reg; - - gatt_reg = __pa(gatt) >> 12; - gatt_reg <<= 4; - pci_write_config_dword(dev, 0x98, gatt_reg); - pci_read_config_dword(dev, 0x90, &ctl); - - ctl |= 1; - ctl &= ~((1<<4) | (1<<5)); - - pci_write_config_dword(dev, 0x90, ctl); - } - flush_gart(NULL); - - printk("PCI-DMA: aperture base @ %x size %u KB\n",aper_base, aper_size>>10); - return 0; - - nommu: - /* Should not happen anymore */ - printk(KERN_ERR "PCI-DMA: More than 4GB of RAM and no IOMMU\n" - KERN_ERR "PCI-DMA: 32bit PCI IO may malfunction."); - return -1; -} - -extern int agp_amd64_init(void); - -static int __init pci_iommu_init(void) -{ - struct agp_kern_info info; - unsigned long aper_size; - unsigned long iommu_start; - struct pci_dev *dev; - unsigned long scratch; - long i; - -#ifndef CONFIG_AGP_AMD64 - no_agp = 1; -#else - /* Makefile puts PCI initialization via subsys_initcall first. */ - /* Add other K8 AGP bridge drivers here */ - no_agp = no_agp || - (agp_amd64_init() < 0) || - (agp_copy_info(agp_bridge, &info) < 0); -#endif - - if (swiotlb) { - no_iommu = 1; - printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n"); - return -1; - } - - if (no_iommu || - (!force_iommu && end_pfn < 0xffffffff>>PAGE_SHIFT) || - !iommu_aperture || - (no_agp && init_k8_gatt(&info) < 0)) { - printk(KERN_INFO "PCI-DMA: Disabling IOMMU.\n"); - no_iommu = 1; - return -1; - } - - aper_size = info.aper_size * 1024 * 1024; - iommu_size = check_iommu_size(info.aper_base, aper_size); - iommu_pages = iommu_size >> PAGE_SHIFT; - - iommu_gart_bitmap = (void*)__get_free_pages(GFP_KERNEL, - get_order(iommu_pages/8)); - if (!iommu_gart_bitmap) - panic("Cannot allocate iommu bitmap\n"); - memset(iommu_gart_bitmap, 0, iommu_pages/8); - -#ifdef CONFIG_IOMMU_LEAK - if (leak_trace) { - iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL, - get_order(iommu_pages*sizeof(void *))); - if (iommu_leak_tab) - memset(iommu_leak_tab, 0, iommu_pages * 8); - else - printk("PCI-DMA: Cannot allocate leak trace area\n"); - } -#endif - - /* - * Out of IOMMU space handling. - * Reserve some invalid pages at the beginning of the GART. - */ - set_bit_string(iommu_gart_bitmap, 0, EMERGENCY_PAGES); - - agp_memory_reserved = iommu_size; - printk(KERN_INFO - "PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n", - iommu_size>>20); - - iommu_start = aper_size - iommu_size; - iommu_bus_base = info.aper_base + iommu_start; - bad_dma_address = iommu_bus_base; - iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT); - - /* - * Unmap the IOMMU part of the GART. The alias of the page is - * always mapped with cache enabled and there is no full cache - * coherency across the GART remapping. The unmapping avoids - * automatic prefetches from the CPU allocating cache lines in - * there. All CPU accesses are done via the direct mapping to - * the backing memory. The GART address is only used by PCI - * devices. - */ - clear_kernel_mapping((unsigned long)__va(iommu_bus_base), iommu_size); - - /* - * Try to workaround a bug (thanks to BenH) - * Set unmapped entries to a scratch page instead of 0. - * Any prefetches that hit unmapped entries won't get an bus abort - * then. - */ - scratch = get_zeroed_page(GFP_KERNEL); - if (!scratch) - panic("Cannot allocate iommu scratch page"); - gart_unmapped_entry = GPTE_ENCODE(__pa(scratch)); - for (i = EMERGENCY_PAGES; i < iommu_pages; i++) - iommu_gatt_base[i] = gart_unmapped_entry; - - for_all_nb(dev) { - u32 flag; - int cpu = PCI_SLOT(dev->devfn) - 24; - if (cpu >= MAX_NB) - continue; - northbridges[cpu] = dev; - pci_read_config_dword(dev, 0x9c, &flag); /* cache flush word */ - northbridge_flush_word[cpu] = flag; - } - - flush_gart(NULL); - - return 0; -} - -/* Must execute after PCI subsystem */ -fs_initcall(pci_iommu_init); - -/* iommu=[size][,noagp][,off][,force][,noforce][,leak][,memaper[=order]][,merge] - [,forcesac][,fullflush][,nomerge][,biomerge] - size set size of iommu (in bytes) - noagp don't initialize the AGP driver and use full aperture. - off don't use the IOMMU - leak turn on simple iommu leak tracing (only when CONFIG_IOMMU_LEAK is on) - memaper[=order] allocate an own aperture over RAM with size 32MB^order. - noforce don't force IOMMU usage. Default. - force Force IOMMU. - merge Do lazy merging. This may improve performance on some block devices. - Implies force (experimental) - biomerge Do merging at the BIO layer. This is more efficient than merge, - but should be only done with very big IOMMUs. Implies merge,force. - nomerge Don't do SG merging. - forcesac For SAC mode for masks <40bits (experimental) - fullflush Flush IOMMU on each allocation (default) - nofullflush Don't use IOMMU fullflush - allowed overwrite iommu off workarounds for specific chipsets. - soft Use software bounce buffering (default for Intel machines) - noaperture Don't touch the aperture for AGP. -*/ -__init int iommu_setup(char *p) -{ - int arg; - - while (*p) { - if (!strncmp(p,"noagp",5)) - no_agp = 1; - if (!strncmp(p,"off",3)) - no_iommu = 1; - if (!strncmp(p,"force",5)) { - force_iommu = 1; - iommu_aperture_allowed = 1; - } - if (!strncmp(p,"allowed",7)) - iommu_aperture_allowed = 1; - if (!strncmp(p,"noforce",7)) { - iommu_merge = 0; - force_iommu = 0; - } - if (!strncmp(p, "memaper", 7)) { - fallback_aper_force = 1; - p += 7; - if (*p == '=') { - ++p; - if (get_option(&p, &arg)) - fallback_aper_order = arg; - } - } - if (!strncmp(p, "biomerge",8)) { - iommu_bio_merge = 4096; - iommu_merge = 1; - force_iommu = 1; - } - if (!strncmp(p, "panic",5)) - panic_on_overflow = 1; - if (!strncmp(p, "nopanic",7)) - panic_on_overflow = 0; - if (!strncmp(p, "merge",5)) { - iommu_merge = 1; - force_iommu = 1; - } - if (!strncmp(p, "nomerge",7)) - iommu_merge = 0; - if (!strncmp(p, "forcesac",8)) - iommu_sac_force = 1; - if (!strncmp(p, "fullflush",8)) - iommu_fullflush = 1; - if (!strncmp(p, "nofullflush",11)) - iommu_fullflush = 0; - if (!strncmp(p, "soft",4)) - swiotlb = 1; - if (!strncmp(p, "noaperture",10)) - fix_aperture = 0; -#ifdef CONFIG_IOMMU_LEAK - if (!strncmp(p,"leak",4)) { - leak_trace = 1; - p += 4; - if (*p == '=') ++p; - if (isdigit(*p) && get_option(&p, &arg)) - iommu_leak_pages = arg; - } else -#endif - if (isdigit(*p) && get_option(&p, &arg)) - iommu_size = arg; - p += strcspn(p, ","); - if (*p == ',') - ++p; - } - return 1; -} diff --git a/arch/x86_64/kernel/pci-nommu.c b/arch/x86_64/kernel/pci-nommu.c deleted file mode 100644 index 67d90b89af0..00000000000 --- a/arch/x86_64/kernel/pci-nommu.c +++ /dev/null @@ -1,94 +0,0 @@ -/* Fallback functions when the main IOMMU code is not compiled in. This - code is roughly equivalent to i386. */ -#include <linux/mm.h> -#include <linux/init.h> -#include <linux/pci.h> -#include <linux/string.h> -#include <asm/proto.h> -#include <asm/processor.h> - -int iommu_merge = 0; -EXPORT_SYMBOL(iommu_merge); - -dma_addr_t bad_dma_address; -EXPORT_SYMBOL(bad_dma_address); - -int iommu_bio_merge = 0; -EXPORT_SYMBOL(iommu_bio_merge); - -int iommu_sac_force = 0; -EXPORT_SYMBOL(iommu_sac_force); - -/* - * Dummy IO MMU functions - */ - -void *dma_alloc_coherent(struct device *hwdev, size_t size, - dma_addr_t *dma_handle, unsigned gfp) -{ - void *ret; - u64 mask; - int order = get_order(size); - - if (hwdev) - mask = hwdev->coherent_dma_mask & *hwdev->dma_mask; - else - mask = 0xffffffff; - for (;;) { - ret = (void *)__get_free_pages(gfp, order); - if (ret == NULL) - return NULL; - *dma_handle = virt_to_bus(ret); - if ((*dma_handle & ~mask) == 0) - break; - free_pages((unsigned long)ret, order); - if (gfp & GFP_DMA) - return NULL; - gfp |= GFP_DMA; - } - - memset(ret, 0, size); - return ret; -} -EXPORT_SYMBOL(dma_alloc_coherent); - -void dma_free_coherent(struct device *hwdev, size_t size, - void *vaddr, dma_addr_t dma_handle) -{ - free_pages((unsigned long)vaddr, get_order(size)); -} -EXPORT_SYMBOL(dma_free_coherent); - -int dma_supported(struct device *hwdev, u64 mask) -{ - /* - * we fall back to GFP_DMA when the mask isn't all 1s, - * so we can't guarantee allocations that must be - * within a tighter range than GFP_DMA.. - * RED-PEN this won't work for pci_map_single. Caller has to - * use GFP_DMA in the first place. - */ - if (mask < 0x00ffffff) - return 0; - - return 1; -} -EXPORT_SYMBOL(dma_supported); - -int dma_get_cache_alignment(void) -{ - return boot_cpu_data.x86_clflush_size; -} -EXPORT_SYMBOL(dma_get_cache_alignment); - -static int __init check_ram(void) -{ - if (end_pfn >= 0xffffffff>>PAGE_SHIFT) { - printk( - KERN_ERR "WARNING more than 4GB of memory but IOMMU not compiled in.\n" - KERN_ERR "WARNING 32bit PCI may malfunction.\n"); - } - return 0; -} -__initcall(check_ram); - diff --git a/arch/x86_64/kernel/pmtimer.c b/arch/x86_64/kernel/pmtimer.c deleted file mode 100644 index feb5f108dd2..00000000000 --- a/arch/x86_64/kernel/pmtimer.c +++ /dev/null @@ -1,101 +0,0 @@ -/* Ported over from i386 by AK, original copyright was: - * - * (C) Dominik Brodowski <linux@brodo.de> 2003 - * - * Driver to use the Power Management Timer (PMTMR) available in some - * southbridges as primary timing source for the Linux kernel. - * - * Based on parts of linux/drivers/acpi/hardware/hwtimer.c, timer_pit.c, - * timer_hpet.c, and on Arjan van de Ven's implementation for 2.4. - * - * This file is licensed under the GPL v2. - * - * Dropped all the hardware bug workarounds for now. Hopefully they - * are not needed on 64bit chipsets. - */ - -#include <linux/jiffies.h> -#include <linux/kernel.h> -#include <linux/time.h> -#include <linux/init.h> -#include <linux/cpumask.h> -#include <asm/io.h> -#include <asm/proto.h> -#include <asm/msr.h> -#include <asm/vsyscall.h> - -/* The I/O port the PMTMR resides at. - * The location is detected during setup_arch(), - * in arch/i386/kernel/acpi/boot.c */ -u32 pmtmr_ioport; - -/* value of the Power timer at last timer interrupt */ -static u32 offset_delay; -static u32 last_pmtmr_tick; - -#define ACPI_PM_MASK 0xFFFFFF /* limit it to 24 bits */ - -static inline u32 cyc2us(u32 cycles) -{ - /* The Power Management Timer ticks at 3.579545 ticks per microsecond. - * 1 / PM_TIMER_FREQUENCY == 0.27936511 =~ 286/1024 [error: 0.024%] - * - * Even with HZ = 100, delta is at maximum 35796 ticks, so it can - * easily be multiplied with 286 (=0x11E) without having to fear - * u32 overflows. - */ - cycles *= 286; - return (cycles >> 10); -} - -int pmtimer_mark_offset(void) -{ - static int first_run = 1; - unsigned long tsc; - u32 lost; - - u32 tick = inl(pmtmr_ioport); - u32 delta; - - delta = cyc2us((tick - last_pmtmr_tick) & ACPI_PM_MASK); - - last_pmtmr_tick = tick; - monotonic_base += delta * NSEC_PER_USEC; - - delta += offset_delay; - - lost = delta / (USEC_PER_SEC / HZ); - offset_delay = delta % (USEC_PER_SEC / HZ); - - rdtscll(tsc); - vxtime.last_tsc = tsc - offset_delay * cpu_khz; - - /* don't calculate delay for first run, - or if we've got less then a tick */ - if (first_run || (lost < 1)) { - first_run = 0; - offset_delay = 0; - } - - return lost - 1; -} - -unsigned int do_gettimeoffset_pm(void) -{ - u32 now, offset, delta = 0; - - offset = last_pmtmr_tick; - now = inl(pmtmr_ioport); - delta = (now - offset) & ACPI_PM_MASK; - - return offset_delay + cyc2us(delta); -} - - -static int __init nopmtimer_setup(char *s) -{ - pmtmr_ioport = 0; - return 0; -} - -__setup("nopmtimer", nopmtimer_setup); diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c deleted file mode 100644 index 7577f9d7a75..00000000000 --- a/arch/x86_64/kernel/process.c +++ /dev/null @@ -1,842 +0,0 @@ -/* - * linux/arch/x86-64/kernel/process.c - * - * Copyright (C) 1995 Linus Torvalds - * - * Pentium III FXSR, SSE support - * Gareth Hughes <gareth@valinux.com>, May 2000 - * - * X86-64 port - * Andi Kleen. - * - * CPU hotplug support - ashok.raj@intel.com - * $Id: process.c,v 1.38 2002/01/15 10:08:03 ak Exp $ - */ - -/* - * This file handles the architecture-dependent parts of process handling.. - */ - -#include <stdarg.h> - -#include <linux/cpu.h> -#include <linux/errno.h> -#include <linux/sched.h> -#include <linux/kernel.h> -#include <linux/mm.h> -#include <linux/elfcore.h> -#include <linux/smp.h> -#include <linux/slab.h> -#include <linux/user.h> -#include <linux/module.h> -#include <linux/a.out.h> -#include <linux/interrupt.h> -#include <linux/delay.h> -#include <linux/irq.h> -#include <linux/ptrace.h> -#include <linux/utsname.h> -#include <linux/random.h> -#include <linux/kprobes.h> - -#include <asm/uaccess.h> -#include <asm/pgtable.h> -#include <asm/system.h> -#include <asm/io.h> -#include <asm/processor.h> -#include <asm/i387.h> -#include <asm/mmu_context.h> -#include <asm/pda.h> -#include <asm/prctl.h> -#include <asm/kdebug.h> -#include <asm/desc.h> -#include <asm/proto.h> -#include <asm/ia32.h> - -asmlinkage extern void ret_from_fork(void); - -unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED; - -static atomic_t hlt_counter = ATOMIC_INIT(0); - -unsigned long boot_option_idle_override = 0; -EXPORT_SYMBOL(boot_option_idle_override); - -/* - * Powermanagement idle function, if any.. - */ -void (*pm_idle)(void); -static DEFINE_PER_CPU(unsigned int, cpu_idle_state); - -void disable_hlt(void) -{ - atomic_inc(&hlt_counter); -} - -EXPORT_SYMBOL(disable_hlt); - -void enable_hlt(void) -{ - atomic_dec(&hlt_counter); -} - -EXPORT_SYMBOL(enable_hlt); - -/* - * We use this if we don't have any better - * idle routine.. - */ -void default_idle(void) -{ - if (!atomic_read(&hlt_counter)) { - local_irq_disable(); - if (!need_resched()) - safe_halt(); - else - local_irq_enable(); - } -} - -/* - * On SMP it's slightly faster (but much more power-consuming!) - * to poll the ->need_resched flag instead of waiting for the - * cross-CPU IPI to arrive. Use this option with caution. - */ -static void poll_idle (void) -{ - int oldval; - - local_irq_enable(); - - /* - * Deal with another CPU just having chosen a thread to - * run here: - */ - oldval = test_and_clear_thread_flag(TIF_NEED_RESCHED); - - if (!oldval) { - set_thread_flag(TIF_POLLING_NRFLAG); - asm volatile( - "2:" - "testl %0,%1;" - "rep; nop;" - "je 2b;" - : : - "i" (_TIF_NEED_RESCHED), - "m" (current_thread_info()->flags)); - } else { - set_need_resched(); - } -} - -void cpu_idle_wait(void) -{ - unsigned int cpu, this_cpu = get_cpu(); - cpumask_t map; - - set_cpus_allowed(current, cpumask_of_cpu(this_cpu)); - put_cpu(); - - cpus_clear(map); - for_each_online_cpu(cpu) { - per_cpu(cpu_idle_state, cpu) = 1; - cpu_set(cpu, map); - } - - __get_cpu_var(cpu_idle_state) = 0; - - wmb(); - do { - ssleep(1); - for_each_online_cpu(cpu) { - if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu)) - cpu_clear(cpu, map); - } - cpus_and(map, map, cpu_online_map); - } while (!cpus_empty(map)); -} -EXPORT_SYMBOL_GPL(cpu_idle_wait); - -#ifdef CONFIG_HOTPLUG_CPU -DECLARE_PER_CPU(int, cpu_state); - -#include <asm/nmi.h> -/* We don't actually take CPU down, just spin without interrupts. */ -static inline void play_dead(void) -{ - idle_task_exit(); - wbinvd(); - mb(); - /* Ack it */ - __get_cpu_var(cpu_state) = CPU_DEAD; - - while (1) - safe_halt(); -} -#else -static inline void play_dead(void) -{ - BUG(); -} -#endif /* CONFIG_HOTPLUG_CPU */ - -/* - * The idle thread. There's no useful work to be - * done, so just try to conserve power and have a - * low exit latency (ie sit in a loop waiting for - * somebody to say that they'd like to reschedule) - */ -void cpu_idle (void) -{ - /* endless idle loop with no priority at all */ - while (1) { - while (!need_resched()) { - void (*idle)(void); - - if (__get_cpu_var(cpu_idle_state)) - __get_cpu_var(cpu_idle_state) = 0; - - rmb(); - idle = pm_idle; - if (!idle) - idle = default_idle; - if (cpu_is_offline(smp_processor_id())) - play_dead(); - idle(); - } - - schedule(); - } -} - -/* - * This uses new MONITOR/MWAIT instructions on P4 processors with PNI, - * which can obviate IPI to trigger checking of need_resched. - * We execute MONITOR against need_resched and enter optimized wait state - * through MWAIT. Whenever someone changes need_resched, we would be woken - * up from MWAIT (without an IPI). - */ -static void mwait_idle(void) -{ - local_irq_enable(); - - if (!need_resched()) { - set_thread_flag(TIF_POLLING_NRFLAG); - do { - __monitor((void *)¤t_thread_info()->flags, 0, 0); - if (need_resched()) - break; - __mwait(0, 0); - } while (!need_resched()); - clear_thread_flag(TIF_POLLING_NRFLAG); - } -} - -void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) -{ - static int printed; - if (cpu_has(c, X86_FEATURE_MWAIT)) { - /* - * Skip, if setup has overridden idle. - * One CPU supports mwait => All CPUs supports mwait - */ - if (!pm_idle) { - if (!printed) { - printk("using mwait in idle threads.\n"); - printed = 1; - } - pm_idle = mwait_idle; - } - } -} - -static int __init idle_setup (char *str) -{ - if (!strncmp(str, "poll", 4)) { - printk("using polling idle threads.\n"); - pm_idle = poll_idle; - } - - boot_option_idle_override = 1; - return 1; -} - -__setup("idle=", idle_setup); - -/* Prints also some state that isn't saved in the pt_regs */ -void __show_regs(struct pt_regs * regs) -{ - unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs; - unsigned int fsindex,gsindex; - unsigned int ds,cs,es; - - printk("\n"); - print_modules(); - printk("Pid: %d, comm: %.20s %s %s\n", - current->pid, current->comm, print_tainted(), system_utsname.release); - printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip); - printk_address(regs->rip); - printk("\nRSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp, regs->eflags); - printk("RAX: %016lx RBX: %016lx RCX: %016lx\n", - regs->rax, regs->rbx, regs->rcx); - printk("RDX: %016lx RSI: %016lx RDI: %016lx\n", - regs->rdx, regs->rsi, regs->rdi); - printk("RBP: %016lx R08: %016lx R09: %016lx\n", - regs->rbp, regs->r8, regs->r9); - printk("R10: %016lx R11: %016lx R12: %016lx\n", - regs->r10, regs->r11, regs->r12); - printk("R13: %016lx R14: %016lx R15: %016lx\n", - regs->r13, regs->r14, regs->r15); - - asm("movl %%ds,%0" : "=r" (ds)); - asm("movl %%cs,%0" : "=r" (cs)); - asm("movl %%es,%0" : "=r" (es)); - asm("movl %%fs,%0" : "=r" (fsindex)); - asm("movl %%gs,%0" : "=r" (gsindex)); - - rdmsrl(MSR_FS_BASE, fs); - rdmsrl(MSR_GS_BASE, gs); - rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); - - asm("movq %%cr0, %0": "=r" (cr0)); - asm("movq %%cr2, %0": "=r" (cr2)); - asm("movq %%cr3, %0": "=r" (cr3)); - asm("movq %%cr4, %0": "=r" (cr4)); - - printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", - fs,fsindex,gs,gsindex,shadowgs); - printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0); - printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4); -} - -void show_regs(struct pt_regs *regs) -{ - __show_regs(regs); - show_trace(®s->rsp); -} - -/* - * Free current thread data structures etc.. - */ -void exit_thread(void) -{ - struct task_struct *me = current; - struct thread_struct *t = &me->thread; - - /* - * Remove function-return probe instances associated with this task - * and put them back on the free list. Do not insert an exit probe for - * this function, it will be disabled by kprobe_flush_task if you do. - */ - kprobe_flush_task(me); - - if (me->thread.io_bitmap_ptr) { - struct tss_struct *tss = &per_cpu(init_tss, get_cpu()); - - kfree(t->io_bitmap_ptr); - t->io_bitmap_ptr = NULL; - /* - * Careful, clear this in the TSS too: - */ - memset(tss->io_bitmap, 0xff, t->io_bitmap_max); - t->io_bitmap_max = 0; - put_cpu(); - } -} - -void flush_thread(void) -{ - struct task_struct *tsk = current; - struct thread_info *t = current_thread_info(); - - /* - * Remove function-return probe instances associated with this task - * and put them back on the free list. Do not insert an exit probe for - * this function, it will be disabled by kprobe_flush_task if you do. - */ - kprobe_flush_task(tsk); - - if (t->flags & _TIF_ABI_PENDING) - t->flags ^= (_TIF_ABI_PENDING | _TIF_IA32); - - tsk->thread.debugreg0 = 0; - tsk->thread.debugreg1 = 0; - tsk->thread.debugreg2 = 0; - tsk->thread.debugreg3 = 0; - tsk->thread.debugreg6 = 0; - tsk->thread.debugreg7 = 0; - memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); - /* - * Forget coprocessor state.. - */ - clear_fpu(tsk); - clear_used_math(); -} - -void release_thread(struct task_struct *dead_task) -{ - if (dead_task->mm) { - if (dead_task->mm->context.size) { - printk("WARNING: dead process %8s still has LDT? <%p/%d>\n", - dead_task->comm, - dead_task->mm->context.ldt, - dead_task->mm->context.size); - BUG(); - } - } -} - -static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr) -{ - struct user_desc ud = { - .base_addr = addr, - .limit = 0xfffff, - .seg_32bit = 1, - .limit_in_pages = 1, - .useable = 1, - }; - struct n_desc_struct *desc = (void *)t->thread.tls_array; - desc += tls; - desc->a = LDT_entry_a(&ud); - desc->b = LDT_entry_b(&ud); -} - -static inline u32 read_32bit_tls(struct task_struct *t, int tls) -{ - struct desc_struct *desc = (void *)t->thread.tls_array; - desc += tls; - return desc->base0 | - (((u32)desc->base1) << 16) | - (((u32)desc->base2) << 24); -} - -/* - * This gets called before we allocate a new thread and copy - * the current task into it. - */ -void prepare_to_copy(struct task_struct *tsk) -{ - unlazy_fpu(tsk); -} - -int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp, - unsigned long unused, - struct task_struct * p, struct pt_regs * regs) -{ - int err; - struct pt_regs * childregs; - struct task_struct *me = current; - - childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p->thread_info)) - 1; - - *childregs = *regs; - - childregs->rax = 0; - childregs->rsp = rsp; - if (rsp == ~0UL) { - childregs->rsp = (unsigned long)childregs; - } - - p->thread.rsp = (unsigned long) childregs; - p->thread.rsp0 = (unsigned long) (childregs+1); - p->thread.userrsp = me->thread.userrsp; - - set_ti_thread_flag(p->thread_info, TIF_FORK); - - p->thread.fs = me->thread.fs; - p->thread.gs = me->thread.gs; - - asm("mov %%gs,%0" : "=m" (p->thread.gsindex)); - asm("mov %%fs,%0" : "=m" (p->thread.fsindex)); - asm("mov %%es,%0" : "=m" (p->thread.es)); - asm("mov %%ds,%0" : "=m" (p->thread.ds)); - - if (unlikely(me->thread.io_bitmap_ptr != NULL)) { - p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); - if (!p->thread.io_bitmap_ptr) { - p->thread.io_bitmap_max = 0; - return -ENOMEM; - } - memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr, IO_BITMAP_BYTES); - } - - /* - * Set a new TLS for the child thread? - */ - if (clone_flags & CLONE_SETTLS) { -#ifdef CONFIG_IA32_EMULATION - if (test_thread_flag(TIF_IA32)) - err = ia32_child_tls(p, childregs); - else -#endif - err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8); - if (err) - goto out; - } - err = 0; -out: - if (err && p->thread.io_bitmap_ptr) { - kfree(p->thread.io_bitmap_ptr); - p->thread.io_bitmap_max = 0; - } - return err; -} - -/* - * This function selects if the context switch from prev to next - * has to tweak the TSC disable bit in the cr4. - */ -static inline void disable_tsc(struct task_struct *prev_p, - struct task_struct *next_p) -{ - struct thread_info *prev, *next; - - /* - * gcc should eliminate the ->thread_info dereference if - * has_secure_computing returns 0 at compile time (SECCOMP=n). - */ - prev = prev_p->thread_info; - next = next_p->thread_info; - - if (has_secure_computing(prev) || has_secure_computing(next)) { - /* slow path here */ - if (has_secure_computing(prev) && - !has_secure_computing(next)) { - write_cr4(read_cr4() & ~X86_CR4_TSD); - } else if (!has_secure_computing(prev) && - has_secure_computing(next)) - write_cr4(read_cr4() | X86_CR4_TSD); - } -} - -/* - * This special macro can be used to load a debugging register - */ -#define loaddebug(thread,r) set_debug(thread->debugreg ## r, r) - -/* - * switch_to(x,y) should switch tasks from x to y. - * - * This could still be optimized: - * - fold all the options into a flag word and test it with a single test. - * - could test fs/gs bitsliced - */ -struct task_struct *__switch_to(struct task_struct *prev_p, struct task_struct *next_p) -{ - struct thread_struct *prev = &prev_p->thread, - *next = &next_p->thread; - int cpu = smp_processor_id(); - struct tss_struct *tss = &per_cpu(init_tss, cpu); - - unlazy_fpu(prev_p); - - /* - * Reload esp0, LDT and the page table pointer: - */ - tss->rsp0 = next->rsp0; - - /* - * Switch DS and ES. - * This won't pick up thread selector changes, but I guess that is ok. - */ - asm volatile("mov %%es,%0" : "=m" (prev->es)); - if (unlikely(next->es | prev->es)) - loadsegment(es, next->es); - - asm volatile ("mov %%ds,%0" : "=m" (prev->ds)); - if (unlikely(next->ds | prev->ds)) - loadsegment(ds, next->ds); - - load_TLS(next, cpu); - - /* - * Switch FS and GS. - */ - { - unsigned fsindex; - asm volatile("movl %%fs,%0" : "=r" (fsindex)); - /* segment register != 0 always requires a reload. - also reload when it has changed. - when prev process used 64bit base always reload - to avoid an information leak. */ - if (unlikely(fsindex | next->fsindex | prev->fs)) { - loadsegment(fs, next->fsindex); - /* check if the user used a selector != 0 - * if yes clear 64bit base, since overloaded base - * is always mapped to the Null selector - */ - if (fsindex) - prev->fs = 0; - } - /* when next process has a 64bit base use it */ - if (next->fs) - wrmsrl(MSR_FS_BASE, next->fs); - prev->fsindex = fsindex; - } - { - unsigned gsindex; - asm volatile("movl %%gs,%0" : "=r" (gsindex)); - if (unlikely(gsindex | next->gsindex | prev->gs)) { - load_gs_index(next->gsindex); - if (gsindex) - prev->gs = 0; - } - if (next->gs) - wrmsrl(MSR_KERNEL_GS_BASE, next->gs); - prev->gsindex = gsindex; - } - - /* - * Switch the PDA context. - */ - prev->userrsp = read_pda(oldrsp); - write_pda(oldrsp, next->userrsp); - write_pda(pcurrent, next_p); - write_pda(kernelstack, (unsigned long)next_p->thread_info + THREAD_SIZE - PDA_STACKOFFSET); - - /* - * Now maybe reload the debug registers - */ - if (unlikely(next->debugreg7)) { - loaddebug(next, 0); - loaddebug(next, 1); - loaddebug(next, 2); - loaddebug(next, 3); - /* no 4 and 5 */ - loaddebug(next, 6); - loaddebug(next, 7); - } - - - /* - * Handle the IO bitmap - */ - if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) { - if (next->io_bitmap_ptr) - /* - * Copy the relevant range of the IO bitmap. - * Normally this is 128 bytes or less: - */ - memcpy(tss->io_bitmap, next->io_bitmap_ptr, - max(prev->io_bitmap_max, next->io_bitmap_max)); - else { - /* - * Clear any possible leftover bits: - */ - memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); - } - } - - disable_tsc(prev_p, next_p); - - return prev_p; -} - -/* - * sys_execve() executes a new program. - */ -asmlinkage -long sys_execve(char __user *name, char __user * __user *argv, - char __user * __user *envp, struct pt_regs regs) -{ - long error; - char * filename; - - filename = getname(name); - error = PTR_ERR(filename); - if (IS_ERR(filename)) - return error; - error = do_execve(filename, argv, envp, ®s); - if (error == 0) { - task_lock(current); - current->ptrace &= ~PT_DTRACE; - task_unlock(current); - } - putname(filename); - return error; -} - -void set_personality_64bit(void) -{ - /* inherit personality from parent */ - - /* Make sure to be in 64bit mode */ - clear_thread_flag(TIF_IA32); - - /* TBD: overwrites user setup. Should have two bits. - But 64bit processes have always behaved this way, - so it's not too bad. The main problem is just that - 32bit childs are affected again. */ - current->personality &= ~READ_IMPLIES_EXEC; -} - -asmlinkage long sys_fork(struct pt_regs *regs) -{ - return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL); -} - -asmlinkage long sys_clone(unsigned long clone_flags, unsigned long newsp, void __user *parent_tid, void __user *child_tid, struct pt_regs *regs) -{ - if (!newsp) - newsp = regs->rsp; - return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid); -} - -/* - * This is trivial, and on the face of it looks like it - * could equally well be done in user mode. - * - * Not so, for quite unobvious reasons - register pressure. - * In user mode vfork() cannot have a stack frame, and if - * done by calling the "clone()" system call directly, you - * do not have enough call-clobbered registers to hold all - * the information you need. - */ -asmlinkage long sys_vfork(struct pt_regs *regs) -{ - return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0, - NULL, NULL); -} - -unsigned long get_wchan(struct task_struct *p) -{ - unsigned long stack; - u64 fp,rip; - int count = 0; - - if (!p || p == current || p->state==TASK_RUNNING) - return 0; - stack = (unsigned long)p->thread_info; - if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE) - return 0; - fp = *(u64 *)(p->thread.rsp); - do { - if (fp < (unsigned long)stack || fp > (unsigned long)stack+THREAD_SIZE) - return 0; - rip = *(u64 *)(fp+8); - if (!in_sched_functions(rip)) - return rip; - fp = *(u64 *)fp; - } while (count++ < 16); - return 0; -} - -long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) -{ - int ret = 0; - int doit = task == current; - int cpu; - - switch (code) { - case ARCH_SET_GS: - if (addr >= TASK_SIZE_OF(task)) - return -EPERM; - cpu = get_cpu(); - /* handle small bases via the GDT because that's faster to - switch. */ - if (addr <= 0xffffffff) { - set_32bit_tls(task, GS_TLS, addr); - if (doit) { - load_TLS(&task->thread, cpu); - load_gs_index(GS_TLS_SEL); - } - task->thread.gsindex = GS_TLS_SEL; - task->thread.gs = 0; - } else { - task->thread.gsindex = 0; - task->thread.gs = addr; - if (doit) { - load_gs_index(0); - ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr); - } - } - put_cpu(); - break; - case ARCH_SET_FS: - /* Not strictly needed for fs, but do it for symmetry - with gs */ - if (addr >= TASK_SIZE_OF(task)) - return -EPERM; - cpu = get_cpu(); - /* handle small bases via the GDT because that's faster to - switch. */ - if (addr <= 0xffffffff) { - set_32bit_tls(task, FS_TLS, addr); - if (doit) { - load_TLS(&task->thread, cpu); - asm volatile("movl %0,%%fs" :: "r" (FS_TLS_SEL)); - } - task->thread.fsindex = FS_TLS_SEL; - task->thread.fs = 0; - } else { - task->thread.fsindex = 0; - task->thread.fs = addr; - if (doit) { - /* set the selector to 0 to not confuse - __switch_to */ - asm volatile("movl %0,%%fs" :: "r" (0)); - ret = checking_wrmsrl(MSR_FS_BASE, addr); - } - } - put_cpu(); - break; - case ARCH_GET_FS: { - unsigned long base; - if (task->thread.fsindex == FS_TLS_SEL) - base = read_32bit_tls(task, FS_TLS); - else if (doit) { - rdmsrl(MSR_FS_BASE, base); - } else - base = task->thread.fs; - ret = put_user(base, (unsigned long __user *)addr); - break; - } - case ARCH_GET_GS: { - unsigned long base; - if (task->thread.gsindex == GS_TLS_SEL) - base = read_32bit_tls(task, GS_TLS); - else if (doit) { - rdmsrl(MSR_KERNEL_GS_BASE, base); - } else - base = task->thread.gs; - ret = put_user(base, (unsigned long __user *)addr); - break; - } - - default: - ret = -EINVAL; - break; - } - - return ret; -} - -long sys_arch_prctl(int code, unsigned long addr) -{ - return do_arch_prctl(current, code, addr); -} - -/* - * Capture the user space registers if the task is not running (in user space) - */ -int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs) -{ - struct pt_regs *pp, ptregs; - - pp = (struct pt_regs *)(tsk->thread.rsp0); - --pp; - - ptregs = *pp; - ptregs.cs &= 0xffff; - ptregs.ss &= 0xffff; - - elf_core_copy_regs(regs, &ptregs); - - return 1; -} - -unsigned long arch_align_stack(unsigned long sp) -{ - if (randomize_va_space) - sp -= get_random_int() % 8192; - return sp & ~0xf; -} diff --git a/arch/x86_64/kernel/ptrace.c b/arch/x86_64/kernel/ptrace.c deleted file mode 100644 index bbf64b59a21..00000000000 --- a/arch/x86_64/kernel/ptrace.c +++ /dev/null @@ -1,674 +0,0 @@ -/* ptrace.c */ -/* By Ross Biro 1/23/92 */ -/* - * Pentium III FXSR, SSE support - * Gareth Hughes <gareth@valinux.com>, May 2000 - * - * x86-64 port 2000-2002 Andi Kleen - */ - -#include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/mm.h> -#include <linux/smp.h> -#include <linux/smp_lock.h> -#include <linux/errno.h> -#include <linux/ptrace.h> -#include <linux/user.h> -#include <linux/security.h> -#include <linux/audit.h> -#include <linux/seccomp.h> -#include <linux/signal.h> - -#include <asm/uaccess.h> -#include <asm/pgtable.h> -#include <asm/system.h> -#include <asm/processor.h> -#include <asm/i387.h> -#include <asm/debugreg.h> -#include <asm/ldt.h> -#include <asm/desc.h> -#include <asm/proto.h> -#include <asm/ia32.h> - -/* - * does not yet catch signals sent when the child dies. - * in exit.c or in signal.c. - */ - -/* determines which flags the user has access to. */ -/* 1 = access 0 = no access */ -#define FLAG_MASK 0x44dd5UL - -/* set's the trap flag. */ -#define TRAP_FLAG 0x100UL - -/* - * eflags and offset of eflags on child stack.. - */ -#define EFLAGS offsetof(struct pt_regs, eflags) -#define EFL_OFFSET ((int)(EFLAGS-sizeof(struct pt_regs))) - -/* - * this routine will get a word off of the processes privileged stack. - * the offset is how far from the base addr as stored in the TSS. - * this routine assumes that all the privileged stacks are in our - * data space. - */ -static inline unsigned long get_stack_long(struct task_struct *task, int offset) -{ - unsigned char *stack; - - stack = (unsigned char *)task->thread.rsp0; - stack += offset; - return (*((unsigned long *)stack)); -} - -static inline struct pt_regs *get_child_regs(struct task_struct *task) -{ - struct pt_regs *regs = (void *)task->thread.rsp0; - return regs - 1; -} - -/* - * this routine will put a word on the processes privileged stack. - * the offset is how far from the base addr as stored in the TSS. - * this routine assumes that all the privileged stacks are in our - * data space. - */ -static inline long put_stack_long(struct task_struct *task, int offset, - unsigned long data) -{ - unsigned char * stack; - - stack = (unsigned char *) task->thread.rsp0; - stack += offset; - *(unsigned long *) stack = data; - return 0; -} - -#define LDT_SEGMENT 4 - -unsigned long convert_rip_to_linear(struct task_struct *child, struct pt_regs *regs) -{ - unsigned long addr, seg; - - addr = regs->rip; - seg = regs->cs & 0xffff; - - /* - * We'll assume that the code segments in the GDT - * are all zero-based. That is largely true: the - * TLS segments are used for data, and the PNPBIOS - * and APM bios ones we just ignore here. - */ - if (seg & LDT_SEGMENT) { - u32 *desc; - unsigned long base; - - down(&child->mm->context.sem); - desc = child->mm->context.ldt + (seg & ~7); - base = (desc[0] >> 16) | ((desc[1] & 0xff) << 16) | (desc[1] & 0xff000000); - - /* 16-bit code segment? */ - if (!((desc[1] >> 22) & 1)) - addr &= 0xffff; - addr += base; - up(&child->mm->context.sem); - } - return addr; -} - -static int is_at_popf(struct task_struct *child, struct pt_regs *regs) -{ - int i, copied; - unsigned char opcode[16]; - unsigned long addr = convert_rip_to_linear(child, regs); - - copied = access_process_vm(child, addr, opcode, sizeof(opcode), 0); - for (i = 0; i < copied; i++) { - switch (opcode[i]) { - /* popf */ - case 0x9d: - return 1; - - /* CHECKME: 64 65 */ - - /* opcode and address size prefixes */ - case 0x66: case 0x67: - continue; - /* irrelevant prefixes (segment overrides and repeats) */ - case 0x26: case 0x2e: - case 0x36: case 0x3e: - case 0x64: case 0x65: - case 0xf0: case 0xf2: case 0xf3: - continue; - - /* REX prefixes */ - case 0x40 ... 0x4f: - continue; - - /* CHECKME: f0, f2, f3 */ - - /* - * pushf: NOTE! We should probably not let - * the user see the TF bit being set. But - * it's more pain than it's worth to avoid - * it, and a debugger could emulate this - * all in user space if it _really_ cares. - */ - case 0x9c: - default: - return 0; - } - } - return 0; -} - -static void set_singlestep(struct task_struct *child) -{ - struct pt_regs *regs = get_child_regs(child); - - /* - * Always set TIF_SINGLESTEP - this guarantees that - * we single-step system calls etc.. This will also - * cause us to set TF when returning to user mode. - */ - set_tsk_thread_flag(child, TIF_SINGLESTEP); - - /* - * If TF was already set, don't do anything else - */ - if (regs->eflags & TRAP_FLAG) - return; - - /* Set TF on the kernel stack.. */ - regs->eflags |= TRAP_FLAG; - - /* - * ..but if TF is changed by the instruction we will trace, - * don't mark it as being "us" that set it, so that we - * won't clear it by hand later. - * - * AK: this is not enough, LAHF and IRET can change TF in user space too. - */ - if (is_at_popf(child, regs)) - return; - - child->ptrace |= PT_DTRACE; -} - -static void clear_singlestep(struct task_struct *child) -{ - /* Always clear TIF_SINGLESTEP... */ - clear_tsk_thread_flag(child, TIF_SINGLESTEP); - - /* But touch TF only if it was set by us.. */ - if (child->ptrace & PT_DTRACE) { - struct pt_regs *regs = get_child_regs(child); - regs->eflags &= ~TRAP_FLAG; - child->ptrace &= ~PT_DTRACE; - } -} - -/* - * Called by kernel/ptrace.c when detaching.. - * - * Make sure the single step bit is not set. - */ -void ptrace_disable(struct task_struct *child) -{ - clear_singlestep(child); -} - -static int putreg(struct task_struct *child, - unsigned long regno, unsigned long value) -{ - unsigned long tmp; - - /* Some code in the 64bit emulation may not be 64bit clean. - Don't take any chances. */ - if (test_tsk_thread_flag(child, TIF_IA32)) - value &= 0xffffffff; - switch (regno) { - case offsetof(struct user_regs_struct,fs): - if (value && (value & 3) != 3) - return -EIO; - child->thread.fsindex = value & 0xffff; - return 0; - case offsetof(struct user_regs_struct,gs): - if (value && (value & 3) != 3) - return -EIO; - child->thread.gsindex = value & 0xffff; - return 0; - case offsetof(struct user_regs_struct,ds): - if (value && (value & 3) != 3) - return -EIO; - child->thread.ds = value & 0xffff; - return 0; - case offsetof(struct user_regs_struct,es): - if (value && (value & 3) != 3) - return -EIO; - child->thread.es = value & 0xffff; - return 0; - case offsetof(struct user_regs_struct,ss): - if ((value & 3) != 3) - return -EIO; - value &= 0xffff; - return 0; - case offsetof(struct user_regs_struct,fs_base): - if (value >= TASK_SIZE_OF(child)) - return -EIO; - child->thread.fs = value; - return 0; - case offsetof(struct user_regs_struct,gs_base): - if (value >= TASK_SIZE_OF(child)) - return -EIO; - child->thread.gs = value; - return 0; - case offsetof(struct user_regs_struct, eflags): - value &= FLAG_MASK; - tmp = get_stack_long(child, EFL_OFFSET); - tmp &= ~FLAG_MASK; - value |= tmp; - break; - case offsetof(struct user_regs_struct,cs): - if ((value & 3) != 3) - return -EIO; - value &= 0xffff; - break; - case offsetof(struct user_regs_struct, rip): - /* Check if the new RIP address is canonical */ - if (value >= TASK_SIZE_OF(child)) - return -EIO; - break; - } - put_stack_long(child, regno - sizeof(struct pt_regs), value); - return 0; -} - -static unsigned long getreg(struct task_struct *child, unsigned long regno) -{ - unsigned long val; - switch (regno) { - case offsetof(struct user_regs_struct, fs): - return child->thread.fsindex; - case offsetof(struct user_regs_struct, gs): - return child->thread.gsindex; - case offsetof(struct user_regs_struct, ds): - return child->thread.ds; - case offsetof(struct user_regs_struct, es): - return child->thread.es; - case offsetof(struct user_regs_struct, fs_base): - return child->thread.fs; - case offsetof(struct user_regs_struct, gs_base): - return child->thread.gs; - default: - regno = regno - sizeof(struct pt_regs); - val = get_stack_long(child, regno); - if (test_tsk_thread_flag(child, TIF_IA32)) - val &= 0xffffffff; - return val; - } - -} - -asmlinkage long sys_ptrace(long request, long pid, unsigned long addr, long data) -{ - struct task_struct *child; - long i, ret; - unsigned ui; - - /* This lock_kernel fixes a subtle race with suid exec */ - lock_kernel(); - ret = -EPERM; - if (request == PTRACE_TRACEME) { - /* are we already being traced? */ - if (current->ptrace & PT_PTRACED) - goto out; - ret = security_ptrace(current->parent, current); - if (ret) - goto out; - /* set the ptrace bit in the process flags. */ - current->ptrace |= PT_PTRACED; - ret = 0; - goto out; - } - ret = -ESRCH; - read_lock(&tasklist_lock); - child = find_task_by_pid(pid); - if (child) - get_task_struct(child); - read_unlock(&tasklist_lock); - if (!child) - goto out; - - ret = -EPERM; - if (pid == 1) /* you may not mess with init */ - goto out_tsk; - - if (request == PTRACE_ATTACH) { - ret = ptrace_attach(child); - goto out_tsk; - } - ret = ptrace_check_attach(child, request == PTRACE_KILL); - if (ret < 0) - goto out_tsk; - - switch (request) { - /* when I and D space are separate, these will need to be fixed. */ - case PTRACE_PEEKTEXT: /* read word at location addr. */ - case PTRACE_PEEKDATA: { - unsigned long tmp; - int copied; - - copied = access_process_vm(child, addr, &tmp, sizeof(tmp), 0); - ret = -EIO; - if (copied != sizeof(tmp)) - break; - ret = put_user(tmp,(unsigned long __user *) data); - break; - } - - /* read the word at location addr in the USER area. */ - case PTRACE_PEEKUSR: { - unsigned long tmp; - - ret = -EIO; - if ((addr & 7) || - addr > sizeof(struct user) - 7) - break; - - switch (addr) { - case 0 ... sizeof(struct user_regs_struct) - sizeof(long): - tmp = getreg(child, addr); - break; - case offsetof(struct user, u_debugreg[0]): - tmp = child->thread.debugreg0; - break; - case offsetof(struct user, u_debugreg[1]): - tmp = child->thread.debugreg1; - break; - case offsetof(struct user, u_debugreg[2]): - tmp = child->thread.debugreg2; - break; - case offsetof(struct user, u_debugreg[3]): - tmp = child->thread.debugreg3; - break; - case offsetof(struct user, u_debugreg[6]): - tmp = child->thread.debugreg6; - break; - case offsetof(struct user, u_debugreg[7]): - tmp = child->thread.debugreg7; - break; - default: - tmp = 0; - break; - } - ret = put_user(tmp,(unsigned long __user *) data); - break; - } - - /* when I and D space are separate, this will have to be fixed. */ - case PTRACE_POKETEXT: /* write the word at location addr. */ - case PTRACE_POKEDATA: - ret = 0; - if (access_process_vm(child, addr, &data, sizeof(data), 1) == sizeof(data)) - break; - ret = -EIO; - break; - - case PTRACE_POKEUSR: /* write the word at location addr in the USER area */ - { - int dsize = test_tsk_thread_flag(child, TIF_IA32) ? 3 : 7; - ret = -EIO; - if ((addr & 7) || - addr > sizeof(struct user) - 7) - break; - - switch (addr) { - case 0 ... sizeof(struct user_regs_struct) - sizeof(long): - ret = putreg(child, addr, data); - break; - /* Disallows to set a breakpoint into the vsyscall */ - case offsetof(struct user, u_debugreg[0]): - if (data >= TASK_SIZE_OF(child) - dsize) break; - child->thread.debugreg0 = data; - ret = 0; - break; - case offsetof(struct user, u_debugreg[1]): - if (data >= TASK_SIZE_OF(child) - dsize) break; - child->thread.debugreg1 = data; - ret = 0; - break; - case offsetof(struct user, u_debugreg[2]): - if (data >= TASK_SIZE_OF(child) - dsize) break; - child->thread.debugreg2 = data; - ret = 0; - break; - case offsetof(struct user, u_debugreg[3]): - if (data >= TASK_SIZE_OF(child) - dsize) break; - child->thread.debugreg3 = data; - ret = 0; - break; - case offsetof(struct user, u_debugreg[6]): - if (data >> 32) - break; - child->thread.debugreg6 = data; - ret = 0; - break; - case offsetof(struct user, u_debugreg[7]): - /* See arch/i386/kernel/ptrace.c for an explanation of - * this awkward check.*/ - data &= ~DR_CONTROL_RESERVED; - for(i=0; i<4; i++) - if ((0x5454 >> ((data >> (16 + 4*i)) & 0xf)) & 1) - break; - if (i == 4) { - child->thread.debugreg7 = data; - ret = 0; - } - break; - } - break; - } - case PTRACE_SYSCALL: /* continue and stop at next (return from) syscall */ - case PTRACE_CONT: /* restart after signal. */ - - ret = -EIO; - if (!valid_signal(data)) - break; - if (request == PTRACE_SYSCALL) - set_tsk_thread_flag(child,TIF_SYSCALL_TRACE); - else - clear_tsk_thread_flag(child,TIF_SYSCALL_TRACE); - clear_tsk_thread_flag(child, TIF_SINGLESTEP); - child->exit_code = data; - /* make sure the single step bit is not set. */ - clear_singlestep(child); - wake_up_process(child); - ret = 0; - break; - -#ifdef CONFIG_IA32_EMULATION - /* This makes only sense with 32bit programs. Allow a - 64bit debugger to fully examine them too. Better - don't use it against 64bit processes, use - PTRACE_ARCH_PRCTL instead. */ - case PTRACE_SET_THREAD_AREA: { - struct user_desc __user *p; - int old; - p = (struct user_desc __user *)data; - get_user(old, &p->entry_number); - put_user(addr, &p->entry_number); - ret = do_set_thread_area(&child->thread, p); - put_user(old, &p->entry_number); - break; - case PTRACE_GET_THREAD_AREA: - p = (struct user_desc __user *)data; - get_user(old, &p->entry_number); - put_user(addr, &p->entry_number); - ret = do_get_thread_area(&child->thread, p); - put_user(old, &p->entry_number); - break; - } -#endif - /* normal 64bit interface to access TLS data. - Works just like arch_prctl, except that the arguments - are reversed. */ - case PTRACE_ARCH_PRCTL: - ret = do_arch_prctl(child, data, addr); - break; - -/* - * make the child exit. Best I can do is send it a sigkill. - * perhaps it should be put in the status that it wants to - * exit. - */ - case PTRACE_KILL: - ret = 0; - if (child->exit_state == EXIT_ZOMBIE) /* already dead */ - break; - clear_tsk_thread_flag(child, TIF_SINGLESTEP); - child->exit_code = SIGKILL; - /* make sure the single step bit is not set. */ - clear_singlestep(child); - wake_up_process(child); - break; - - case PTRACE_SINGLESTEP: /* set the trap flag. */ - ret = -EIO; - if (!valid_signal(data)) - break; - clear_tsk_thread_flag(child,TIF_SYSCALL_TRACE); - set_singlestep(child); - child->exit_code = data; - /* give it a chance to run. */ - wake_up_process(child); - ret = 0; - break; - - case PTRACE_DETACH: - /* detach a process that was attached. */ - ret = ptrace_detach(child, data); - break; - - case PTRACE_GETREGS: { /* Get all gp regs from the child. */ - if (!access_ok(VERIFY_WRITE, (unsigned __user *)data, - sizeof(struct user_regs_struct))) { - ret = -EIO; - break; - } - ret = 0; - for (ui = 0; ui < sizeof(struct user_regs_struct); ui += sizeof(long)) { - ret |= __put_user(getreg(child, ui),(unsigned long __user *) data); - data += sizeof(long); - } - break; - } - - case PTRACE_SETREGS: { /* Set all gp regs in the child. */ - unsigned long tmp; - if (!access_ok(VERIFY_READ, (unsigned __user *)data, - sizeof(struct user_regs_struct))) { - ret = -EIO; - break; - } - ret = 0; - for (ui = 0; ui < sizeof(struct user_regs_struct); ui += sizeof(long)) { - ret |= __get_user(tmp, (unsigned long __user *) data); - putreg(child, ui, tmp); - data += sizeof(long); - } - break; - } - - case PTRACE_GETFPREGS: { /* Get the child extended FPU state. */ - if (!access_ok(VERIFY_WRITE, (unsigned __user *)data, - sizeof(struct user_i387_struct))) { - ret = -EIO; - break; - } - ret = get_fpregs((struct user_i387_struct __user *)data, child); - break; - } - - case PTRACE_SETFPREGS: { /* Set the child extended FPU state. */ - if (!access_ok(VERIFY_READ, (unsigned __user *)data, - sizeof(struct user_i387_struct))) { - ret = -EIO; - break; - } - set_stopped_child_used_math(child); - ret = set_fpregs(child, (struct user_i387_struct __user *)data); - break; - } - - default: - ret = ptrace_request(child, request, addr, data); - break; - } -out_tsk: - put_task_struct(child); -out: - unlock_kernel(); - return ret; -} - -static void syscall_trace(struct pt_regs *regs) -{ - -#if 0 - printk("trace %s rip %lx rsp %lx rax %d origrax %d caller %lx tiflags %x ptrace %x\n", - current->comm, - regs->rip, regs->rsp, regs->rax, regs->orig_rax, __builtin_return_address(0), - current_thread_info()->flags, current->ptrace); -#endif - - ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) - ? 0x80 : 0)); - /* - * this isn't the same as continuing with a signal, but it will do - * for normal use. strace only continues with a signal if the - * stopping signal is not SIGTRAP. -brl - */ - if (current->exit_code) { - send_sig(current->exit_code, current, 1); - current->exit_code = 0; - } -} - -asmlinkage void syscall_trace_enter(struct pt_regs *regs) -{ - /* do the secure computing check first */ - secure_computing(regs->orig_rax); - - if (test_thread_flag(TIF_SYSCALL_TRACE) - && (current->ptrace & PT_PTRACED)) - syscall_trace(regs); - - if (unlikely(current->audit_context)) { - if (test_thread_flag(TIF_IA32)) { - audit_syscall_entry(current, AUDIT_ARCH_I386, - regs->orig_rax, - regs->rbx, regs->rcx, - regs->rdx, regs->rsi); - } else { - audit_syscall_entry(current, AUDIT_ARCH_X86_64, - regs->orig_rax, - regs->rdi, regs->rsi, - regs->rdx, regs->r10); - } - } -} - -asmlinkage void syscall_trace_leave(struct pt_regs *regs) -{ - if (unlikely(current->audit_context)) - audit_syscall_exit(current, AUDITSC_RESULT(regs->rax), regs->rax); - - if ((test_thread_flag(TIF_SYSCALL_TRACE) - || test_thread_flag(TIF_SINGLESTEP)) - && (current->ptrace & PT_PTRACED)) - syscall_trace(regs); -} diff --git a/arch/x86_64/kernel/reboot.c b/arch/x86_64/kernel/reboot.c deleted file mode 100644 index 47f95687905..00000000000 --- a/arch/x86_64/kernel/reboot.c +++ /dev/null @@ -1,162 +0,0 @@ -/* Various gunk just to reboot the machine. */ -#include <linux/module.h> -#include <linux/reboot.h> -#include <linux/init.h> -#include <linux/smp.h> -#include <linux/kernel.h> -#include <linux/ctype.h> -#include <linux/string.h> -#include <asm/io.h> -#include <asm/kdebug.h> -#include <asm/delay.h> -#include <asm/hw_irq.h> -#include <asm/system.h> -#include <asm/pgtable.h> -#include <asm/tlbflush.h> -#include <asm/apic.h> - -/* - * Power off function, if any - */ -void (*pm_power_off)(void); - -static long no_idt[3]; -static enum { - BOOT_TRIPLE = 't', - BOOT_KBD = 'k' -} reboot_type = BOOT_KBD; -static int reboot_mode = 0; -int reboot_force; - -/* reboot=t[riple] | k[bd] [, [w]arm | [c]old] - warm Don't set the cold reboot flag - cold Set the cold reboot flag - triple Force a triple fault (init) - kbd Use the keyboard controller. cold reset (default) - force Avoid anything that could hang. - */ -static int __init reboot_setup(char *str) -{ - for (;;) { - switch (*str) { - case 'w': - reboot_mode = 0x1234; - break; - - case 'c': - reboot_mode = 0; - break; - - case 't': - case 'b': - case 'k': - reboot_type = *str; - break; - case 'f': - reboot_force = 1; - break; - } - if((str = strchr(str,',')) != NULL) - str++; - else - break; - } - return 1; -} - -__setup("reboot=", reboot_setup); - -static inline void kb_wait(void) -{ - int i; - - for (i=0; i<0x10000; i++) - if ((inb_p(0x64) & 0x02) == 0) - break; -} - -void machine_shutdown(void) -{ - /* Stop the cpus and apics */ -#ifdef CONFIG_SMP - int reboot_cpu_id; - - /* The boot cpu is always logical cpu 0 */ - reboot_cpu_id = 0; - - /* Make certain the cpu I'm about to reboot on is online */ - if (!cpu_isset(reboot_cpu_id, cpu_online_map)) { - reboot_cpu_id = smp_processor_id(); - } - - /* Make certain I only run on the appropriate processor */ - set_cpus_allowed(current, cpumask_of_cpu(reboot_cpu_id)); - - /* O.K Now that I'm on the appropriate processor, - * stop all of the others. - */ - smp_send_stop(); -#endif - - local_irq_disable(); - -#ifndef CONFIG_SMP - disable_local_APIC(); -#endif - - disable_IO_APIC(); - - local_irq_enable(); -} - -void machine_emergency_restart(void) -{ - int i; - - /* Tell the BIOS if we want cold or warm reboot */ - *((unsigned short *)__va(0x472)) = reboot_mode; - - for (;;) { - /* Could also try the reset bit in the Hammer NB */ - switch (reboot_type) { - case BOOT_KBD: - for (i=0; i<100; i++) { - kb_wait(); - udelay(50); - outb(0xfe,0x64); /* pulse reset low */ - udelay(50); - } - - case BOOT_TRIPLE: - __asm__ __volatile__("lidt (%0)": :"r" (&no_idt)); - __asm__ __volatile__("int3"); - - reboot_type = BOOT_KBD; - break; - } - } -} - -void machine_restart(char * __unused) -{ - printk("machine restart\n"); - - if (!reboot_force) { - machine_shutdown(); - } - machine_emergency_restart(); -} - -void machine_halt(void) -{ -} - -void machine_power_off(void) -{ - if (!reboot_force) { - machine_shutdown(); - } - if (pm_power_off) - pm_power_off(); -} - diff --git a/arch/x86_64/kernel/relocate_kernel.S b/arch/x86_64/kernel/relocate_kernel.S deleted file mode 100644 index d24fa9b72a2..00000000000 --- a/arch/x86_64/kernel/relocate_kernel.S +++ /dev/null @@ -1,143 +0,0 @@ -/* - * relocate_kernel.S - put the kernel image in place to boot - * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com> - * - * This source code is licensed under the GNU General Public License, - * Version 2. See the file COPYING for more details. - */ - -#include <linux/linkage.h> - - /* - * Must be relocatable PIC code callable as a C function, that once - * it starts can not use the previous processes stack. - */ - .globl relocate_new_kernel - .code64 -relocate_new_kernel: - /* %rdi page_list - * %rsi reboot_code_buffer - * %rdx start address - * %rcx page_table - * %r8 arg5 - * %r9 arg6 - */ - - /* zero out flags, and disable interrupts */ - pushq $0 - popfq - - /* set a new stack at the bottom of our page... */ - lea 4096(%rsi), %rsp - - /* store the parameters back on the stack */ - pushq %rdx /* store the start address */ - - /* Set cr0 to a known state: - * 31 1 == Paging enabled - * 18 0 == Alignment check disabled - * 16 0 == Write protect disabled - * 3 0 == No task switch - * 2 0 == Don't do FP software emulation. - * 0 1 == Proctected mode enabled - */ - movq %cr0, %rax - andq $~((1<<18)|(1<<16)|(1<<3)|(1<<2)), %rax - orl $((1<<31)|(1<<0)), %eax - movq %rax, %cr0 - - /* Set cr4 to a known state: - * 10 0 == xmm exceptions disabled - * 9 0 == xmm registers instructions disabled - * 8 0 == performance monitoring counter disabled - * 7 0 == page global disabled - * 6 0 == machine check exceptions disabled - * 5 1 == physical address extension enabled - * 4 0 == page size extensions disabled - * 3 0 == Debug extensions disabled - * 2 0 == Time stamp disable (disabled) - * 1 0 == Protected mode virtual interrupts disabled - * 0 0 == VME disabled - */ - - movq $((1<<5)), %rax - movq %rax, %cr4 - - jmp 1f -1: - - /* Switch to the identity mapped page tables, - * and flush the TLB. - */ - movq %rcx, %cr3 - - /* Do the copies */ - movq %rdi, %rcx /* Put the page_list in %rcx */ - xorq %rdi, %rdi - xorq %rsi, %rsi - jmp 1f - -0: /* top, read another word for the indirection page */ - - movq (%rbx), %rcx - addq $8, %rbx -1: - testq $0x1, %rcx /* is it a destination page? */ - jz 2f - movq %rcx, %rdi - andq $0xfffffffffffff000, %rdi - jmp 0b -2: - testq $0x2, %rcx /* is it an indirection page? */ - jz 2f - movq %rcx, %rbx - andq $0xfffffffffffff000, %rbx - jmp 0b -2: - testq $0x4, %rcx /* is it the done indicator? */ - jz 2f - jmp 3f -2: - testq $0x8, %rcx /* is it the source indicator? */ - jz 0b /* Ignore it otherwise */ - movq %rcx, %rsi /* For ever source page do a copy */ - andq $0xfffffffffffff000, %rsi - - movq $512, %rcx - rep ; movsq - jmp 0b -3: - - /* To be certain of avoiding problems with self-modifying code - * I need to execute a serializing instruction here. - * So I flush the TLB by reloading %cr3 here, it's handy, - * and not processor dependent. - */ - movq %cr3, %rax - movq %rax, %cr3 - - /* set all of the registers to known values */ - /* leave %rsp alone */ - - xorq %rax, %rax - xorq %rbx, %rbx - xorq %rcx, %rcx - xorq %rdx, %rdx - xorq %rsi, %rsi - xorq %rdi, %rdi - xorq %rbp, %rbp - xorq %r8, %r8 - xorq %r9, %r9 - xorq %r10, %r9 - xorq %r11, %r11 - xorq %r12, %r12 - xorq %r13, %r13 - xorq %r14, %r14 - xorq %r15, %r15 - - ret -relocate_new_kernel_end: - - .globl relocate_new_kernel_size -relocate_new_kernel_size: - .quad relocate_new_kernel_end - relocate_new_kernel diff --git a/arch/x86_64/kernel/semaphore.c b/arch/x86_64/kernel/semaphore.c deleted file mode 100644 index 48f7c18172b..00000000000 --- a/arch/x86_64/kernel/semaphore.c +++ /dev/null @@ -1,180 +0,0 @@ -/* - * x86_64 semaphore implementation. - * - * (C) Copyright 1999 Linus Torvalds - * - * Portions Copyright 1999 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * rw semaphores implemented November 1999 by Benjamin LaHaise <bcrl@kvack.org> - */ -#include <linux/config.h> -#include <linux/sched.h> -#include <linux/init.h> -#include <asm/errno.h> - -#include <asm/semaphore.h> - -/* - * Semaphores are implemented using a two-way counter: - * The "count" variable is decremented for each process - * that tries to acquire the semaphore, while the "sleeping" - * variable is a count of such acquires. - * - * Notably, the inline "up()" and "down()" functions can - * efficiently test if they need to do any extra work (up - * needs to do something only if count was negative before - * the increment operation. - * - * "sleeping" and the contention routine ordering is protected - * by the spinlock in the semaphore's waitqueue head. - * - * Note that these functions are only called when there is - * contention on the lock, and as such all this is the - * "non-critical" part of the whole semaphore business. The - * critical part is the inline stuff in <asm/semaphore.h> - * where we want to avoid any extra jumps and calls. - */ - -/* - * Logic: - * - only on a boundary condition do we need to care. When we go - * from a negative count to a non-negative, we wake people up. - * - when we go from a non-negative count to a negative do we - * (a) synchronize with the "sleeper" count and (b) make sure - * that we're on the wakeup list before we synchronize so that - * we cannot lose wakeup events. - */ - -void __up(struct semaphore *sem) -{ - wake_up(&sem->wait); -} - -void __sched __down(struct semaphore * sem) -{ - struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); - unsigned long flags; - - tsk->state = TASK_UNINTERRUPTIBLE; - spin_lock_irqsave(&sem->wait.lock, flags); - add_wait_queue_exclusive_locked(&sem->wait, &wait); - - sem->sleepers++; - for (;;) { - int sleepers = sem->sleepers; - - /* - * Add "everybody else" into it. They aren't - * playing, because we own the spinlock in - * the wait_queue_head. - */ - if (!atomic_add_negative(sleepers - 1, &sem->count)) { - sem->sleepers = 0; - break; - } - sem->sleepers = 1; /* us - see -1 above */ - spin_unlock_irqrestore(&sem->wait.lock, flags); - - schedule(); - - spin_lock_irqsave(&sem->wait.lock, flags); - tsk->state = TASK_UNINTERRUPTIBLE; - } - remove_wait_queue_locked(&sem->wait, &wait); - wake_up_locked(&sem->wait); - spin_unlock_irqrestore(&sem->wait.lock, flags); - tsk->state = TASK_RUNNING; -} - -int __sched __down_interruptible(struct semaphore * sem) -{ - int retval = 0; - struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); - unsigned long flags; - - tsk->state = TASK_INTERRUPTIBLE; - spin_lock_irqsave(&sem->wait.lock, flags); - add_wait_queue_exclusive_locked(&sem->wait, &wait); - - sem->sleepers++; - for (;;) { - int sleepers = sem->sleepers; - - /* - * With signals pending, this turns into - * the trylock failure case - we won't be - * sleeping, and we* can't get the lock as - * it has contention. Just correct the count - * and exit. - */ - if (signal_pending(current)) { - retval = -EINTR; - sem->sleepers = 0; - atomic_add(sleepers, &sem->count); - break; - } - - /* - * Add "everybody else" into it. They aren't - * playing, because we own the spinlock in - * wait_queue_head. The "-1" is because we're - * still hoping to get the semaphore. - */ - if (!atomic_add_negative(sleepers - 1, &sem->count)) { - sem->sleepers = 0; - break; - } - sem->sleepers = 1; /* us - see -1 above */ - spin_unlock_irqrestore(&sem->wait.lock, flags); - - schedule(); - - spin_lock_irqsave(&sem->wait.lock, flags); - tsk->state = TASK_INTERRUPTIBLE; - } - remove_wait_queue_locked(&sem->wait, &wait); - wake_up_locked(&sem->wait); - spin_unlock_irqrestore(&sem->wait.lock, flags); - - tsk->state = TASK_RUNNING; - return retval; -} - -/* - * Trylock failed - make sure we correct for - * having decremented the count. - * - * We could have done the trylock with a - * single "cmpxchg" without failure cases, - * but then it wouldn't work on a 386. - */ -int __down_trylock(struct semaphore * sem) -{ - int sleepers; - unsigned long flags; - - spin_lock_irqsave(&sem->wait.lock, flags); - sleepers = sem->sleepers + 1; - sem->sleepers = 0; - - /* - * Add "everybody else" and us into it. They aren't - * playing, because we own the spinlock in the - * wait_queue_head. - */ - if (!atomic_add_negative(sleepers, &sem->count)) { - wake_up_locked(&sem->wait); - } - - spin_unlock_irqrestore(&sem->wait.lock, flags); - return 1; -} - - diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c deleted file mode 100644 index 0aa526298e9..00000000000 --- a/arch/x86_64/kernel/setup.c +++ /dev/null @@ -1,1274 +0,0 @@ -/* - * linux/arch/x86-64/kernel/setup.c - * - * Copyright (C) 1995 Linus Torvalds - * - * Nov 2001 Dave Jones <davej@suse.de> - * Forked from i386 setup code. - * - * $Id$ - */ - -/* - * This file handles the architecture-dependent parts of initialization - */ - -#include <linux/errno.h> -#include <linux/sched.h> -#include <linux/kernel.h> -#include <linux/mm.h> -#include <linux/stddef.h> -#include <linux/unistd.h> -#include <linux/ptrace.h> -#include <linux/slab.h> -#include <linux/user.h> -#include <linux/a.out.h> -#include <linux/tty.h> -#include <linux/ioport.h> -#include <linux/delay.h> -#include <linux/config.h> -#include <linux/init.h> -#include <linux/initrd.h> -#include <linux/highmem.h> -#include <linux/bootmem.h> -#include <linux/module.h> -#include <asm/processor.h> -#include <linux/console.h> -#include <linux/seq_file.h> -#include <linux/root_dev.h> -#include <linux/pci.h> -#include <linux/acpi.h> -#include <linux/kallsyms.h> -#include <linux/edd.h> -#include <linux/mmzone.h> -#include <linux/kexec.h> - -#include <asm/mtrr.h> -#include <asm/uaccess.h> -#include <asm/system.h> -#include <asm/io.h> -#include <asm/smp.h> -#include <asm/msr.h> -#include <asm/desc.h> -#include <video/edid.h> -#include <asm/e820.h> -#include <asm/dma.h> -#include <asm/mpspec.h> -#include <asm/mmu_context.h> -#include <asm/bootsetup.h> -#include <asm/proto.h> -#include <asm/setup.h> -#include <asm/mach_apic.h> -#include <asm/numa.h> - -/* - * Machine setup.. - */ - -struct cpuinfo_x86 boot_cpu_data; - -unsigned long mmu_cr4_features; - -int acpi_disabled; -EXPORT_SYMBOL(acpi_disabled); -#ifdef CONFIG_ACPI_BOOT -extern int __initdata acpi_ht; -extern acpi_interrupt_flags acpi_sci_flags; -int __initdata acpi_force = 0; -#endif - -int acpi_numa __initdata; - -/* Boot loader ID as an integer, for the benefit of proc_dointvec */ -int bootloader_type; - -unsigned long saved_video_mode; - -#ifdef CONFIG_SWIOTLB -int swiotlb; -EXPORT_SYMBOL(swiotlb); -#endif - -/* - * Setup options - */ -struct drive_info_struct { char dummy[32]; } drive_info; -struct screen_info screen_info; -struct sys_desc_table_struct { - unsigned short length; - unsigned char table[0]; -}; - -struct edid_info edid_info; -struct e820map e820; - -extern int root_mountflags; -extern char _text, _etext, _edata, _end; - -char command_line[COMMAND_LINE_SIZE]; - -struct resource standard_io_resources[] = { - { .name = "dma1", .start = 0x00, .end = 0x1f, - .flags = IORESOURCE_BUSY | IORESOURCE_IO }, - { .name = "pic1", .start = 0x20, .end = 0x21, - .flags = IORESOURCE_BUSY | IORESOURCE_IO }, - { .name = "timer0", .start = 0x40, .end = 0x43, - .flags = IORESOURCE_BUSY | IORESOURCE_IO }, - { .name = "timer1", .start = 0x50, .end = 0x53, - .flags = IORESOURCE_BUSY | IORESOURCE_IO }, - { .name = "keyboard", .start = 0x60, .end = 0x6f, - .flags = IORESOURCE_BUSY | IORESOURCE_IO }, - { .name = "dma page reg", .start = 0x80, .end = 0x8f, - .flags = IORESOURCE_BUSY | IORESOURCE_IO }, - { .name = "pic2", .start = 0xa0, .end = 0xa1, - .flags = IORESOURCE_BUSY | IORESOURCE_IO }, - { .name = "dma2", .start = 0xc0, .end = 0xdf, - .flags = IORESOURCE_BUSY | IORESOURCE_IO }, - { .name = "fpu", .start = 0xf0, .end = 0xff, - .flags = IORESOURCE_BUSY | IORESOURCE_IO } -}; - -#define STANDARD_IO_RESOURCES \ - (sizeof standard_io_resources / sizeof standard_io_resources[0]) - -#define IORESOURCE_RAM (IORESOURCE_BUSY | IORESOURCE_MEM) - -struct resource data_resource = { - .name = "Kernel data", - .start = 0, - .end = 0, - .flags = IORESOURCE_RAM, -}; -struct resource code_resource = { - .name = "Kernel code", - .start = 0, - .end = 0, - .flags = IORESOURCE_RAM, -}; - -#define IORESOURCE_ROM (IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM) - -static struct resource system_rom_resource = { - .name = "System ROM", - .start = 0xf0000, - .end = 0xfffff, - .flags = IORESOURCE_ROM, -}; - -static struct resource extension_rom_resource = { - .name = "Extension ROM", - .start = 0xe0000, - .end = 0xeffff, - .flags = IORESOURCE_ROM, -}; - -static struct resource adapter_rom_resources[] = { - { .name = "Adapter ROM", .start = 0xc8000, .end = 0, - .flags = IORESOURCE_ROM }, - { .name = "Adapter ROM", .start = 0, .end = 0, - .flags = IORESOURCE_ROM }, - { .name = "Adapter ROM", .start = 0, .end = 0, - .flags = IORESOURCE_ROM }, - { .name = "Adapter ROM", .start = 0, .end = 0, - .flags = IORESOURCE_ROM }, - { .name = "Adapter ROM", .start = 0, .end = 0, - .flags = IORESOURCE_ROM }, - { .name = "Adapter ROM", .start = 0, .end = 0, - .flags = IORESOURCE_ROM } -}; - -#define ADAPTER_ROM_RESOURCES \ - (sizeof adapter_rom_resources / sizeof adapter_rom_resources[0]) - -static struct resource video_rom_resource = { - .name = "Video ROM", - .start = 0xc0000, - .end = 0xc7fff, - .flags = IORESOURCE_ROM, -}; - -static struct resource video_ram_resource = { - .name = "Video RAM area", - .start = 0xa0000, - .end = 0xbffff, - .flags = IORESOURCE_RAM, -}; - -#define romsignature(x) (*(unsigned short *)(x) == 0xaa55) - -static int __init romchecksum(unsigned char *rom, unsigned long length) -{ - unsigned char *p, sum = 0; - - for (p = rom; p < rom + length; p++) - sum += *p; - return sum == 0; -} - -static void __init probe_roms(void) -{ - unsigned long start, length, upper; - unsigned char *rom; - int i; - - /* video rom */ - upper = adapter_rom_resources[0].start; - for (start = video_rom_resource.start; start < upper; start += 2048) { - rom = isa_bus_to_virt(start); - if (!romsignature(rom)) - continue; - - video_rom_resource.start = start; - - /* 0 < length <= 0x7f * 512, historically */ - length = rom[2] * 512; - - /* if checksum okay, trust length byte */ - if (length && romchecksum(rom, length)) - video_rom_resource.end = start + length - 1; - - request_resource(&iomem_resource, &video_rom_resource); - break; - } - - start = (video_rom_resource.end + 1 + 2047) & ~2047UL; - if (start < upper) - start = upper; - - /* system rom */ - request_resource(&iomem_resource, &system_rom_resource); - upper = system_rom_resource.start; - - /* check for extension rom (ignore length byte!) */ - rom = isa_bus_to_virt(extension_rom_resource.start); - if (romsignature(rom)) { - length = extension_rom_resource.end - extension_rom_resource.start + 1; - if (romchecksum(rom, length)) { - request_resource(&iomem_resource, &extension_rom_resource); - upper = extension_rom_resource.start; - } - } - - /* check for adapter roms on 2k boundaries */ - for (i = 0; i < ADAPTER_ROM_RESOURCES && start < upper; start += 2048) { - rom = isa_bus_to_virt(start); - if (!romsignature(rom)) - continue; - - /* 0 < length <= 0x7f * 512, historically */ - length = rom[2] * 512; - - /* but accept any length that fits if checksum okay */ - if (!length || start + length > upper || !romchecksum(rom, length)) - continue; - - adapter_rom_resources[i].start = start; - adapter_rom_resources[i].end = start + length - 1; - request_resource(&iomem_resource, &adapter_rom_resources[i]); - - start = adapter_rom_resources[i++].end & ~2047UL; - } -} - -static __init void parse_cmdline_early (char ** cmdline_p) -{ - char c = ' ', *to = command_line, *from = COMMAND_LINE; - int len = 0; - - /* Save unparsed command line copy for /proc/cmdline */ - memcpy(saved_command_line, COMMAND_LINE, COMMAND_LINE_SIZE); - saved_command_line[COMMAND_LINE_SIZE-1] = '\0'; - - for (;;) { - if (c != ' ') - goto next_char; - -#ifdef CONFIG_SMP - /* - * If the BIOS enumerates physical processors before logical, - * maxcpus=N at enumeration-time can be used to disable HT. - */ - else if (!memcmp(from, "maxcpus=", 8)) { - extern unsigned int maxcpus; - - maxcpus = simple_strtoul(from + 8, NULL, 0); - } -#endif -#ifdef CONFIG_ACPI_BOOT - /* "acpi=off" disables both ACPI table parsing and interpreter init */ - if (!memcmp(from, "acpi=off", 8)) - disable_acpi(); - - if (!memcmp(from, "acpi=force", 10)) { - /* add later when we do DMI horrors: */ - acpi_force = 1; - acpi_disabled = 0; - } - - /* acpi=ht just means: do ACPI MADT parsing - at bootup, but don't enable the full ACPI interpreter */ - if (!memcmp(from, "acpi=ht", 7)) { - if (!acpi_force) - disable_acpi(); - acpi_ht = 1; - } - else if (!memcmp(from, "pci=noacpi", 10)) - acpi_disable_pci(); - else if (!memcmp(from, "acpi=noirq", 10)) - acpi_noirq_set(); - - else if (!memcmp(from, "acpi_sci=edge", 13)) - acpi_sci_flags.trigger = 1; - else if (!memcmp(from, "acpi_sci=level", 14)) - acpi_sci_flags.trigger = 3; - else if (!memcmp(from, "acpi_sci=high", 13)) - acpi_sci_flags.polarity = 1; - else if (!memcmp(from, "acpi_sci=low", 12)) - acpi_sci_flags.polarity = 3; - - /* acpi=strict disables out-of-spec workarounds */ - else if (!memcmp(from, "acpi=strict", 11)) { - acpi_strict = 1; - } -#ifdef CONFIG_X86_IO_APIC - else if (!memcmp(from, "acpi_skip_timer_override", 24)) - acpi_skip_timer_override = 1; -#endif -#endif - - if (!memcmp(from, "nolapic", 7) || - !memcmp(from, "disableapic", 11)) - disable_apic = 1; - - if (!memcmp(from, "noapic", 6)) - skip_ioapic_setup = 1; - - if (!memcmp(from, "apic", 4)) { - skip_ioapic_setup = 0; - ioapic_force = 1; - } - - if (!memcmp(from, "mem=", 4)) - parse_memopt(from+4, &from); - -#ifdef CONFIG_NUMA - if (!memcmp(from, "numa=", 5)) - numa_setup(from+5); -#endif - -#ifdef CONFIG_GART_IOMMU - if (!memcmp(from,"iommu=",6)) { - iommu_setup(from+6); - } -#endif - - if (!memcmp(from,"oops=panic", 10)) - panic_on_oops = 1; - - if (!memcmp(from, "noexec=", 7)) - nonx_setup(from + 7); - -#ifdef CONFIG_KEXEC - /* crashkernel=size@addr specifies the location to reserve for - * a crash kernel. By reserving this memory we guarantee - * that linux never set's it up as a DMA target. - * Useful for holding code to do something appropriate - * after a kernel panic. - */ - else if (!memcmp(from, "crashkernel=", 12)) { - unsigned long size, base; - size = memparse(from+12, &from); - if (*from == '@') { - base = memparse(from+1, &from); - /* FIXME: Do I want a sanity check - * to validate the memory range? - */ - crashk_res.start = base; - crashk_res.end = base + size - 1; - } - } -#endif - - next_char: - c = *(from++); - if (!c) - break; - if (COMMAND_LINE_SIZE <= ++len) - break; - *(to++) = c; - } - *to = '\0'; - *cmdline_p = command_line; -} - -#ifndef CONFIG_NUMA -static void __init -contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn) -{ - unsigned long bootmap_size, bootmap; - - memory_present(0, start_pfn, end_pfn); - bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT; - bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size); - if (bootmap == -1L) - panic("Cannot find bootmem map of size %ld\n",bootmap_size); - bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn); - e820_bootmem_free(NODE_DATA(0), 0, end_pfn << PAGE_SHIFT); - reserve_bootmem(bootmap, bootmap_size); -} -#endif - -/* Use inline assembly to define this because the nops are defined - as inline assembly strings in the include files and we cannot - get them easily into strings. */ -asm("\t.data\nk8nops: " - K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6 - K8_NOP7 K8_NOP8); - -extern unsigned char k8nops[]; -static unsigned char *k8_nops[ASM_NOP_MAX+1] = { - NULL, - k8nops, - k8nops + 1, - k8nops + 1 + 2, - k8nops + 1 + 2 + 3, - k8nops + 1 + 2 + 3 + 4, - k8nops + 1 + 2 + 3 + 4 + 5, - k8nops + 1 + 2 + 3 + 4 + 5 + 6, - k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, -}; - -/* Replace instructions with better alternatives for this CPU type. - - This runs before SMP is initialized to avoid SMP problems with - self modifying code. This implies that assymetric systems where - APs have less capabilities than the boot processor are not handled. - In this case boot with "noreplacement". */ -void apply_alternatives(void *start, void *end) -{ - struct alt_instr *a; - int diff, i, k; - for (a = start; (void *)a < end; a++) { - if (!boot_cpu_has(a->cpuid)) - continue; - - BUG_ON(a->replacementlen > a->instrlen); - __inline_memcpy(a->instr, a->replacement, a->replacementlen); - diff = a->instrlen - a->replacementlen; - - /* Pad the rest with nops */ - for (i = a->replacementlen; diff > 0; diff -= k, i += k) { - k = diff; - if (k > ASM_NOP_MAX) - k = ASM_NOP_MAX; - __inline_memcpy(a->instr + i, k8_nops[k], k); - } - } -} - -static int no_replacement __initdata = 0; - -void __init alternative_instructions(void) -{ - extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; - if (no_replacement) - return; - apply_alternatives(__alt_instructions, __alt_instructions_end); -} - -static int __init noreplacement_setup(char *s) -{ - no_replacement = 1; - return 0; -} - -__setup("noreplacement", noreplacement_setup); - -#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) -struct edd edd; -#ifdef CONFIG_EDD_MODULE -EXPORT_SYMBOL(edd); -#endif -/** - * copy_edd() - Copy the BIOS EDD information - * from boot_params into a safe place. - * - */ -static inline void copy_edd(void) -{ - memcpy(edd.mbr_signature, EDD_MBR_SIGNATURE, sizeof(edd.mbr_signature)); - memcpy(edd.edd_info, EDD_BUF, sizeof(edd.edd_info)); - edd.mbr_signature_nr = EDD_MBR_SIG_NR; - edd.edd_info_nr = EDD_NR; -} -#else -static inline void copy_edd(void) -{ -} -#endif - -#define EBDA_ADDR_POINTER 0x40E -static void __init reserve_ebda_region(void) -{ - unsigned int addr; - /** - * there is a real-mode segmented pointer pointing to the - * 4K EBDA area at 0x40E - */ - addr = *(unsigned short *)phys_to_virt(EBDA_ADDR_POINTER); - addr <<= 4; - if (addr) - reserve_bootmem_generic(addr, PAGE_SIZE); -} - -void __init setup_arch(char **cmdline_p) -{ - unsigned long kernel_end; - - ROOT_DEV = old_decode_dev(ORIG_ROOT_DEV); - drive_info = DRIVE_INFO; - screen_info = SCREEN_INFO; - edid_info = EDID_INFO; - saved_video_mode = SAVED_VIDEO_MODE; - bootloader_type = LOADER_TYPE; - -#ifdef CONFIG_BLK_DEV_RAM - rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK; - rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0); - rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0); -#endif - setup_memory_region(); - copy_edd(); - - if (!MOUNT_ROOT_RDONLY) - root_mountflags &= ~MS_RDONLY; - init_mm.start_code = (unsigned long) &_text; - init_mm.end_code = (unsigned long) &_etext; - init_mm.end_data = (unsigned long) &_edata; - init_mm.brk = (unsigned long) &_end; - - code_resource.start = virt_to_phys(&_text); - code_resource.end = virt_to_phys(&_etext)-1; - data_resource.start = virt_to_phys(&_etext); - data_resource.end = virt_to_phys(&_edata)-1; - - parse_cmdline_early(cmdline_p); - - early_identify_cpu(&boot_cpu_data); - - /* - * partially used pages are not usable - thus - * we are rounding upwards: - */ - end_pfn = e820_end_of_ram(); - - check_efer(); - - init_memory_mapping(0, (end_pfn_map << PAGE_SHIFT)); - -#ifdef CONFIG_ACPI_BOOT - /* - * Initialize the ACPI boot-time table parser (gets the RSDP and SDT). - * Call this early for SRAT node setup. - */ - acpi_boot_table_init(); -#endif - -#ifdef CONFIG_ACPI_NUMA - /* - * Parse SRAT to discover nodes. - */ - acpi_numa_init(); -#endif - -#ifdef CONFIG_NUMA - numa_initmem_init(0, end_pfn); -#else - contig_initmem_init(0, end_pfn); -#endif - - /* Reserve direct mapping */ - reserve_bootmem_generic(table_start << PAGE_SHIFT, - (table_end - table_start) << PAGE_SHIFT); - - /* reserve kernel */ - kernel_end = round_up(__pa_symbol(&_end),PAGE_SIZE); - reserve_bootmem_generic(HIGH_MEMORY, kernel_end - HIGH_MEMORY); - - /* - * reserve physical page 0 - it's a special BIOS page on many boxes, - * enabling clean reboots, SMP operation, laptop functions. - */ - reserve_bootmem_generic(0, PAGE_SIZE); - - /* reserve ebda region */ - reserve_ebda_region(); - -#ifdef CONFIG_SMP - /* - * But first pinch a few for the stack/trampoline stuff - * FIXME: Don't need the extra page at 4K, but need to fix - * trampoline before removing it. (see the GDT stuff) - */ - reserve_bootmem_generic(PAGE_SIZE, PAGE_SIZE); - - /* Reserve SMP trampoline */ - reserve_bootmem_generic(SMP_TRAMPOLINE_BASE, PAGE_SIZE); -#endif - -#ifdef CONFIG_ACPI_SLEEP - /* - * Reserve low memory region for sleep support. - */ - acpi_reserve_bootmem(); -#endif -#ifdef CONFIG_X86_LOCAL_APIC - /* - * Find and reserve possible boot-time SMP configuration: - */ - find_smp_config(); -#endif -#ifdef CONFIG_BLK_DEV_INITRD - if (LOADER_TYPE && INITRD_START) { - if (INITRD_START + INITRD_SIZE <= (end_pfn << PAGE_SHIFT)) { - reserve_bootmem_generic(INITRD_START, INITRD_SIZE); - initrd_start = - INITRD_START ? INITRD_START + PAGE_OFFSET : 0; - initrd_end = initrd_start+INITRD_SIZE; - } - else { - printk(KERN_ERR "initrd extends beyond end of memory " - "(0x%08lx > 0x%08lx)\ndisabling initrd\n", - (unsigned long)(INITRD_START + INITRD_SIZE), - (unsigned long)(end_pfn << PAGE_SHIFT)); - initrd_start = 0; - } - } -#endif - - sparse_init(); - -#ifdef CONFIG_KEXEC - if (crashk_res.start != crashk_res.end) { - reserve_bootmem(crashk_res.start, - crashk_res.end - crashk_res.start + 1); - } -#endif - paging_init(); - - check_ioapic(); - -#ifdef CONFIG_ACPI_BOOT - /* - * Read APIC and some other early information from ACPI tables. - */ - acpi_boot_init(); -#endif - -#ifdef CONFIG_X86_LOCAL_APIC - /* - * get boot-time SMP configuration: - */ - if (smp_found_config) - get_smp_config(); - init_apic_mappings(); -#endif - - /* - * Request address space for all standard RAM and ROM resources - * and also for regions reported as reserved by the e820. - */ - probe_roms(); - e820_reserve_resources(); - - request_resource(&iomem_resource, &video_ram_resource); - - { - unsigned i; - /* request I/O space for devices used on all i[345]86 PCs */ - for (i = 0; i < STANDARD_IO_RESOURCES; i++) - request_resource(&ioport_resource, &standard_io_resources[i]); - } - - e820_setup_gap(); - -#ifdef CONFIG_GART_IOMMU - iommu_hole_init(); -#endif - -#ifdef CONFIG_VT -#if defined(CONFIG_VGA_CONSOLE) - conswitchp = &vga_con; -#elif defined(CONFIG_DUMMY_CONSOLE) - conswitchp = &dummy_con; -#endif -#endif -} - -static int __cpuinit get_model_name(struct cpuinfo_x86 *c) -{ - unsigned int *v; - - if (c->extended_cpuid_level < 0x80000004) - return 0; - - v = (unsigned int *) c->x86_model_id; - cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]); - cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]); - cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]); - c->x86_model_id[48] = 0; - return 1; -} - - -static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) -{ - unsigned int n, dummy, eax, ebx, ecx, edx; - - n = c->extended_cpuid_level; - - if (n >= 0x80000005) { - cpuid(0x80000005, &dummy, &ebx, &ecx, &edx); - printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n", - edx>>24, edx&0xFF, ecx>>24, ecx&0xFF); - c->x86_cache_size=(ecx>>24)+(edx>>24); - /* On K8 L1 TLB is inclusive, so don't count it */ - c->x86_tlbsize = 0; - } - - if (n >= 0x80000006) { - cpuid(0x80000006, &dummy, &ebx, &ecx, &edx); - ecx = cpuid_ecx(0x80000006); - c->x86_cache_size = ecx >> 16; - c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff); - - printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n", - c->x86_cache_size, ecx & 0xFF); - } - - if (n >= 0x80000007) - cpuid(0x80000007, &dummy, &dummy, &dummy, &c->x86_power); - if (n >= 0x80000008) { - cpuid(0x80000008, &eax, &dummy, &dummy, &dummy); - c->x86_virt_bits = (eax >> 8) & 0xff; - c->x86_phys_bits = eax & 0xff; - } -} - -/* - * On a AMD dual core setup the lower bits of the APIC id distingush the cores. - * Assumes number of cores is a power of two. - */ -static void __init amd_detect_cmp(struct cpuinfo_x86 *c) -{ -#ifdef CONFIG_SMP - int cpu = smp_processor_id(); - int node = 0; - unsigned bits; - - bits = 0; - while ((1 << bits) < c->x86_num_cores) - bits++; - - /* Low order bits define the core id (index of core in socket) */ - cpu_core_id[cpu] = phys_proc_id[cpu] & ((1 << bits)-1); - /* Convert the APIC ID into the socket ID */ - phys_proc_id[cpu] >>= bits; - -#ifdef CONFIG_NUMA - /* When an ACPI SRAT table is available use the mappings from SRAT - instead. */ - if (acpi_numa <= 0) { - node = phys_proc_id[cpu]; - if (!node_online(node)) - node = first_node(node_online_map); - cpu_to_node[cpu] = node; - } else { - node = cpu_to_node[cpu]; - } -#endif - - printk(KERN_INFO "CPU %d(%d) -> Node %d -> Core %d\n", - cpu, c->x86_num_cores, node, cpu_core_id[cpu]); -#endif -} - -static int __init init_amd(struct cpuinfo_x86 *c) -{ - int r; - int level; - - /* Bit 31 in normal CPUID used for nonstandard 3DNow ID; - 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */ - clear_bit(0*32+31, &c->x86_capability); - - /* C-stepping K8? */ - level = cpuid_eax(1); - if ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58) - set_bit(X86_FEATURE_K8_C, &c->x86_capability); - - r = get_model_name(c); - if (!r) { - switch (c->x86) { - case 15: - /* Should distinguish Models here, but this is only - a fallback anyways. */ - strcpy(c->x86_model_id, "Hammer"); - break; - } - } - display_cacheinfo(c); - - if (c->extended_cpuid_level >= 0x80000008) { - c->x86_num_cores = (cpuid_ecx(0x80000008) & 0xff) + 1; - if (c->x86_num_cores & (c->x86_num_cores - 1)) - c->x86_num_cores = 1; - - amd_detect_cmp(c); - } - - return r; -} - -static void __cpuinit detect_ht(struct cpuinfo_x86 *c) -{ -#ifdef CONFIG_SMP - u32 eax, ebx, ecx, edx; - int index_msb, tmp; - int cpu = smp_processor_id(); - - if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY)) - return; - - cpuid(1, &eax, &ebx, &ecx, &edx); - smp_num_siblings = (ebx & 0xff0000) >> 16; - - if (smp_num_siblings == 1) { - printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); - } else if (smp_num_siblings > 1) { - index_msb = 31; - /* - * At this point we only support two siblings per - * processor package. - */ - if (smp_num_siblings > NR_CPUS) { - printk(KERN_WARNING "CPU: Unsupported number of the siblings %d", smp_num_siblings); - smp_num_siblings = 1; - return; - } - tmp = smp_num_siblings; - while ((tmp & 0x80000000 ) == 0) { - tmp <<=1 ; - index_msb--; - } - if (smp_num_siblings & (smp_num_siblings - 1)) - index_msb++; - phys_proc_id[cpu] = phys_pkg_id(index_msb); - - printk(KERN_INFO "CPU: Physical Processor ID: %d\n", - phys_proc_id[cpu]); - - smp_num_siblings = smp_num_siblings / c->x86_num_cores; - - tmp = smp_num_siblings; - index_msb = 31; - while ((tmp & 0x80000000) == 0) { - tmp <<=1 ; - index_msb--; - } - if (smp_num_siblings & (smp_num_siblings - 1)) - index_msb++; - - cpu_core_id[cpu] = phys_pkg_id(index_msb); - - if (c->x86_num_cores > 1) - printk(KERN_INFO "CPU: Processor Core ID: %d\n", - cpu_core_id[cpu]); - } -#endif -} - -/* - * find out the number of processor cores on the die - */ -static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c) -{ - unsigned int eax; - - if (c->cpuid_level < 4) - return 1; - - __asm__("cpuid" - : "=a" (eax) - : "0" (4), "c" (0) - : "bx", "dx"); - - if (eax & 0x1f) - return ((eax >> 26) + 1); - else - return 1; -} - -static void __cpuinit init_intel(struct cpuinfo_x86 *c) -{ - /* Cache sizes */ - unsigned n; - - init_intel_cacheinfo(c); - n = c->extended_cpuid_level; - if (n >= 0x80000008) { - unsigned eax = cpuid_eax(0x80000008); - c->x86_virt_bits = (eax >> 8) & 0xff; - c->x86_phys_bits = eax & 0xff; - } - - if (c->x86 == 15) - c->x86_cache_alignment = c->x86_clflush_size * 2; - if (c->x86 >= 15) - set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability); - c->x86_num_cores = intel_num_cpu_cores(c); -} - -void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) -{ - char *v = c->x86_vendor_id; - - if (!strcmp(v, "AuthenticAMD")) - c->x86_vendor = X86_VENDOR_AMD; - else if (!strcmp(v, "GenuineIntel")) - c->x86_vendor = X86_VENDOR_INTEL; - else - c->x86_vendor = X86_VENDOR_UNKNOWN; -} - -struct cpu_model_info { - int vendor; - int family; - char *model_names[16]; -}; - -/* Do some early cpuid on the boot CPU to get some parameter that are - needed before check_bugs. Everything advanced is in identify_cpu - below. */ -void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) -{ - u32 tfms; - - c->loops_per_jiffy = loops_per_jiffy; - c->x86_cache_size = -1; - c->x86_vendor = X86_VENDOR_UNKNOWN; - c->x86_model = c->x86_mask = 0; /* So far unknown... */ - c->x86_vendor_id[0] = '\0'; /* Unset */ - c->x86_model_id[0] = '\0'; /* Unset */ - c->x86_clflush_size = 64; - c->x86_cache_alignment = c->x86_clflush_size; - c->x86_num_cores = 1; - c->extended_cpuid_level = 0; - memset(&c->x86_capability, 0, sizeof c->x86_capability); - - /* Get vendor name */ - cpuid(0x00000000, (unsigned int *)&c->cpuid_level, - (unsigned int *)&c->x86_vendor_id[0], - (unsigned int *)&c->x86_vendor_id[8], - (unsigned int *)&c->x86_vendor_id[4]); - - get_cpu_vendor(c); - - /* Initialize the standard set of capabilities */ - /* Note that the vendor-specific code below might override */ - - /* Intel-defined flags: level 0x00000001 */ - if (c->cpuid_level >= 0x00000001) { - __u32 misc; - cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4], - &c->x86_capability[0]); - c->x86 = (tfms >> 8) & 0xf; - c->x86_model = (tfms >> 4) & 0xf; - c->x86_mask = tfms & 0xf; - if (c->x86 == 0xf) { - c->x86 += (tfms >> 20) & 0xff; - c->x86_model += ((tfms >> 16) & 0xF) << 4; - } - if (c->x86_capability[0] & (1<<19)) - c->x86_clflush_size = ((misc >> 8) & 0xff) * 8; - } else { - /* Have CPUID level 0 only - unheard of */ - c->x86 = 4; - } - -#ifdef CONFIG_SMP - phys_proc_id[smp_processor_id()] = (cpuid_ebx(1) >> 24) & 0xff; -#endif -} - -/* - * This does the hard work of actually picking apart the CPU stuff... - */ -void __cpuinit identify_cpu(struct cpuinfo_x86 *c) -{ - int i; - u32 xlvl; - - early_identify_cpu(c); - - /* AMD-defined flags: level 0x80000001 */ - xlvl = cpuid_eax(0x80000000); - c->extended_cpuid_level = xlvl; - if ((xlvl & 0xffff0000) == 0x80000000) { - if (xlvl >= 0x80000001) { - c->x86_capability[1] = cpuid_edx(0x80000001); - c->x86_capability[6] = cpuid_ecx(0x80000001); - } - if (xlvl >= 0x80000004) - get_model_name(c); /* Default name */ - } - - /* Transmeta-defined flags: level 0x80860001 */ - xlvl = cpuid_eax(0x80860000); - if ((xlvl & 0xffff0000) == 0x80860000) { - /* Don't set x86_cpuid_level here for now to not confuse. */ - if (xlvl >= 0x80860001) - c->x86_capability[2] = cpuid_edx(0x80860001); - } - - /* - * Vendor-specific initialization. In this section we - * canonicalize the feature flags, meaning if there are - * features a certain CPU supports which CPUID doesn't - * tell us, CPUID claiming incorrect flags, or other bugs, - * we handle them here. - * - * At the end of this section, c->x86_capability better - * indicate the features this CPU genuinely supports! - */ - switch (c->x86_vendor) { - case X86_VENDOR_AMD: - init_amd(c); - break; - - case X86_VENDOR_INTEL: - init_intel(c); - break; - - case X86_VENDOR_UNKNOWN: - default: - display_cacheinfo(c); - break; - } - - select_idle_routine(c); - detect_ht(c); - - /* - * On SMP, boot_cpu_data holds the common feature set between - * all CPUs; so make sure that we indicate which features are - * common between the CPUs. The first time this routine gets - * executed, c == &boot_cpu_data. - */ - if (c != &boot_cpu_data) { - /* AND the already accumulated flags with these */ - for (i = 0 ; i < NCAPINTS ; i++) - boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; - } - -#ifdef CONFIG_X86_MCE - mcheck_init(c); -#endif - if (c == &boot_cpu_data) - mtrr_bp_init(); - else - mtrr_ap_init(); -#ifdef CONFIG_NUMA - numa_add_cpu(smp_processor_id()); -#endif -} - - -void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) -{ - if (c->x86_model_id[0]) - printk("%s", c->x86_model_id); - - if (c->x86_mask || c->cpuid_level >= 0) - printk(" stepping %02x\n", c->x86_mask); - else - printk("\n"); -} - -/* - * Get CPU information for use by the procfs. - */ - -static int show_cpuinfo(struct seq_file *m, void *v) -{ - struct cpuinfo_x86 *c = v; - - /* - * These flag bits must match the definitions in <asm/cpufeature.h>. - * NULL means this bit is undefined or reserved; either way it doesn't - * have meaning as far as Linux is concerned. Note that it's important - * to realize there is a difference between this table and CPUID -- if - * applications want to get the raw CPUID data, they should access - * /dev/cpu/<cpu_nr>/cpuid instead. - */ - static char *x86_cap_flags[] = { - /* Intel-defined */ - "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce", - "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov", - "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx", - "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", NULL, - - /* AMD-defined */ - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, "nx", NULL, "mmxext", NULL, - NULL, "fxsr_opt", NULL, NULL, NULL, "lm", "3dnowext", "3dnow", - - /* Transmeta-defined */ - "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - - /* Other (Linux-defined) */ - "cxmmx", NULL, "cyrix_arr", "centaur_mcr", NULL, - "constant_tsc", NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - - /* Intel-defined (#2) */ - "pni", NULL, NULL, "monitor", "ds_cpl", NULL, NULL, "est", - "tm2", NULL, "cid", NULL, NULL, "cx16", "xtpr", NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - - /* VIA/Cyrix/Centaur-defined */ - NULL, NULL, "rng", "rng_en", NULL, NULL, "ace", "ace_en", - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - - /* AMD-defined (#2) */ - "lahf_lm", "cmp_legacy", NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - }; - static char *x86_power_flags[] = { - "ts", /* temperature sensor */ - "fid", /* frequency id control */ - "vid", /* voltage id control */ - "ttp", /* thermal trip */ - "tm", - "stc" - }; - - -#ifdef CONFIG_SMP - if (!cpu_online(c-cpu_data)) - return 0; -#endif - - seq_printf(m,"processor\t: %u\n" - "vendor_id\t: %s\n" - "cpu family\t: %d\n" - "model\t\t: %d\n" - "model name\t: %s\n", - (unsigned)(c-cpu_data), - c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown", - c->x86, - (int)c->x86_model, - c->x86_model_id[0] ? c->x86_model_id : "unknown"); - - if (c->x86_mask || c->cpuid_level >= 0) - seq_printf(m, "stepping\t: %d\n", c->x86_mask); - else - seq_printf(m, "stepping\t: unknown\n"); - - if (cpu_has(c,X86_FEATURE_TSC)) { - seq_printf(m, "cpu MHz\t\t: %u.%03u\n", - cpu_khz / 1000, (cpu_khz % 1000)); - } - - /* Cache size */ - if (c->x86_cache_size >= 0) - seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size); - -#ifdef CONFIG_SMP - if (smp_num_siblings * c->x86_num_cores > 1) { - int cpu = c - cpu_data; - seq_printf(m, "physical id\t: %d\n", phys_proc_id[cpu]); - seq_printf(m, "siblings\t: %d\n", - c->x86_num_cores * smp_num_siblings); - seq_printf(m, "core id\t\t: %d\n", cpu_core_id[cpu]); - seq_printf(m, "cpu cores\t: %d\n", c->x86_num_cores); - } -#endif - - seq_printf(m, - "fpu\t\t: yes\n" - "fpu_exception\t: yes\n" - "cpuid level\t: %d\n" - "wp\t\t: yes\n" - "flags\t\t:", - c->cpuid_level); - - { - int i; - for ( i = 0 ; i < 32*NCAPINTS ; i++ ) - if ( test_bit(i, &c->x86_capability) && - x86_cap_flags[i] != NULL ) - seq_printf(m, " %s", x86_cap_flags[i]); - } - - seq_printf(m, "\nbogomips\t: %lu.%02lu\n", - c->loops_per_jiffy/(500000/HZ), - (c->loops_per_jiffy/(5000/HZ)) % 100); - - if (c->x86_tlbsize > 0) - seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize); - seq_printf(m, "clflush size\t: %d\n", c->x86_clflush_size); - seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment); - - seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n", - c->x86_phys_bits, c->x86_virt_bits); - - seq_printf(m, "power management:"); - { - unsigned i; - for (i = 0; i < 32; i++) - if (c->x86_power & (1 << i)) { - if (i < ARRAY_SIZE(x86_power_flags)) - seq_printf(m, " %s", x86_power_flags[i]); - else - seq_printf(m, " [%d]", i); - } - } - - seq_printf(m, "\n\n"); - - return 0; -} - -static void *c_start(struct seq_file *m, loff_t *pos) -{ - return *pos < NR_CPUS ? cpu_data + *pos : NULL; -} - -static void *c_next(struct seq_file *m, void *v, loff_t *pos) -{ - ++*pos; - return c_start(m, pos); -} - -static void c_stop(struct seq_file *m, void *v) -{ -} - -struct seq_operations cpuinfo_op = { - .start =c_start, - .next = c_next, - .stop = c_stop, - .show = show_cpuinfo, -}; diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c deleted file mode 100644 index 34082c1cc41..00000000000 --- a/arch/x86_64/kernel/setup64.c +++ /dev/null @@ -1,283 +0,0 @@ -/* - * X86-64 specific CPU setup. - * Copyright (C) 1995 Linus Torvalds - * Copyright 2001, 2002, 2003 SuSE Labs / Andi Kleen. - * See setup.c for older changelog. - * $Id: setup64.c,v 1.12 2002/03/21 10:09:17 ak Exp $ - */ -#include <linux/config.h> -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/string.h> -#include <linux/bootmem.h> -#include <linux/bitops.h> -#include <linux/module.h> -#include <asm/bootsetup.h> -#include <asm/pda.h> -#include <asm/pgtable.h> -#include <asm/processor.h> -#include <asm/desc.h> -#include <asm/atomic.h> -#include <asm/mmu_context.h> -#include <asm/smp.h> -#include <asm/i387.h> -#include <asm/percpu.h> -#include <asm/proto.h> -#include <asm/sections.h> - -char x86_boot_params[BOOT_PARAM_SIZE] __initdata = {0,}; - -cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; - -struct x8664_pda cpu_pda[NR_CPUS] __cacheline_aligned; - -struct desc_ptr idt_descr = { 256 * 16, (unsigned long) idt_table }; - -char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned"))); - -unsigned long __supported_pte_mask = ~0UL; -static int do_not_nx __initdata = 0; - -/* noexec=on|off -Control non executable mappings for 64bit processes. - -on Enable(default) -off Disable -*/ -int __init nonx_setup(char *str) -{ - if (!strncmp(str, "on", 2)) { - __supported_pte_mask |= _PAGE_NX; - do_not_nx = 0; - } else if (!strncmp(str, "off", 3)) { - do_not_nx = 1; - __supported_pte_mask &= ~_PAGE_NX; - } - return 0; -} -__setup("noexec=", nonx_setup); /* parsed early actually */ - -int force_personality32 = READ_IMPLIES_EXEC; - -/* noexec32=on|off -Control non executable heap for 32bit processes. -To control the stack too use noexec=off - -on PROT_READ does not imply PROT_EXEC for 32bit processes -off PROT_READ implies PROT_EXEC (default) -*/ -static int __init nonx32_setup(char *str) -{ - if (!strcmp(str, "on")) - force_personality32 &= ~READ_IMPLIES_EXEC; - else if (!strcmp(str, "off")) - force_personality32 |= READ_IMPLIES_EXEC; - return 0; -} -__setup("noexec32=", nonx32_setup); - -/* - * Great future plan: - * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data. - * Always point %gs to its beginning - */ -void __init setup_per_cpu_areas(void) -{ - int i; - unsigned long size; - - /* Copy section for each CPU (we discard the original) */ - size = ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES); -#ifdef CONFIG_MODULES - if (size < PERCPU_ENOUGH_ROOM) - size = PERCPU_ENOUGH_ROOM; -#endif - - for (i = 0; i < NR_CPUS; i++) { - char *ptr; - - if (!NODE_DATA(cpu_to_node(i))) { - printk("cpu with no node %d, num_online_nodes %d\n", - i, num_online_nodes()); - ptr = alloc_bootmem(size); - } else { - ptr = alloc_bootmem_node(NODE_DATA(cpu_to_node(i)), size); - } - if (!ptr) - panic("Cannot allocate cpu data for CPU %d\n", i); - cpu_pda[i].data_offset = ptr - __per_cpu_start; - memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); - } -} - -void pda_init(int cpu) -{ - struct x8664_pda *pda = &cpu_pda[cpu]; - - /* Setup up data that may be needed in __get_free_pages early */ - asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0)); - wrmsrl(MSR_GS_BASE, cpu_pda + cpu); - - pda->me = pda; - pda->cpunumber = cpu; - pda->irqcount = -1; - pda->kernelstack = - (unsigned long)stack_thread_info() - PDA_STACKOFFSET + THREAD_SIZE; - pda->active_mm = &init_mm; - pda->mmu_state = 0; - - if (cpu == 0) { - /* others are initialized in smpboot.c */ - pda->pcurrent = &init_task; - pda->irqstackptr = boot_cpu_stack; - } else { - pda->irqstackptr = (char *) - __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER); - if (!pda->irqstackptr) - panic("cannot allocate irqstack for cpu %d", cpu); - } - - asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt))); - - pda->irqstackptr += IRQSTACKSIZE-64; -} - -char boot_exception_stacks[N_EXCEPTION_STACKS * EXCEPTION_STKSZ] -__attribute__((section(".bss.page_aligned"))); - -/* May not be marked __init: used by software suspend */ -void syscall_init(void) -{ - /* - * LSTAR and STAR live in a bit strange symbiosis. - * They both write to the same internal register. STAR allows to set CS/DS - * but only a 32bit target. LSTAR sets the 64bit rip. - */ - wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32); - wrmsrl(MSR_LSTAR, system_call); - -#ifdef CONFIG_IA32_EMULATION - syscall32_cpu_init (); -#endif - - /* Flags to clear on syscall */ - wrmsrl(MSR_SYSCALL_MASK, EF_TF|EF_DF|EF_IE|0x3000); -} - -void __cpuinit check_efer(void) -{ - unsigned long efer; - - rdmsrl(MSR_EFER, efer); - if (!(efer & EFER_NX) || do_not_nx) { - __supported_pte_mask &= ~_PAGE_NX; - } -} - -/* - * cpu_init() initializes state that is per-CPU. Some data is already - * initialized (naturally) in the bootstrap process, such as the GDT - * and IDT. We reload them nevertheless, this function acts as a - * 'CPU state barrier', nothing should get across. - * A lot of state is already set up in PDA init. - */ -void __cpuinit cpu_init (void) -{ - int cpu = stack_smp_processor_id(); - struct tss_struct *t = &per_cpu(init_tss, cpu); - unsigned long v; - char *estacks = NULL; - struct task_struct *me; - int i; - - /* CPU 0 is initialised in head64.c */ - if (cpu != 0) { - pda_init(cpu); - } else - estacks = boot_exception_stacks; - - me = current; - - if (cpu_test_and_set(cpu, cpu_initialized)) - panic("CPU#%d already initialized!\n", cpu); - - printk("Initializing CPU#%d\n", cpu); - - clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); - - /* - * Initialize the per-CPU GDT with the boot GDT, - * and set up the GDT descriptor: - */ - if (cpu) { - memcpy(cpu_gdt_table[cpu], cpu_gdt_table[0], GDT_SIZE); - } - - cpu_gdt_descr[cpu].size = GDT_SIZE; - cpu_gdt_descr[cpu].address = (unsigned long)cpu_gdt_table[cpu]; - asm volatile("lgdt %0" :: "m" (cpu_gdt_descr[cpu])); - asm volatile("lidt %0" :: "m" (idt_descr)); - - memcpy(me->thread.tls_array, cpu_gdt_table[cpu], GDT_ENTRY_TLS_ENTRIES * 8); - - /* - * Delete NT - */ - - asm volatile("pushfq ; popq %%rax ; btr $14,%%rax ; pushq %%rax ; popfq" ::: "eax"); - - syscall_init(); - - wrmsrl(MSR_FS_BASE, 0); - wrmsrl(MSR_KERNEL_GS_BASE, 0); - barrier(); - - check_efer(); - - /* - * set up and load the per-CPU TSS - */ - for (v = 0; v < N_EXCEPTION_STACKS; v++) { - if (cpu) { - estacks = (char *)__get_free_pages(GFP_ATOMIC, - EXCEPTION_STACK_ORDER); - if (!estacks) - panic("Cannot allocate exception stack %ld %d\n", - v, cpu); - } - estacks += EXCEPTION_STKSZ; - t->ist[v] = (unsigned long)estacks; - } - - t->io_bitmap_base = offsetof(struct tss_struct, io_bitmap); - /* - * <= is required because the CPU will access up to - * 8 bits beyond the end of the IO permission bitmap. - */ - for (i = 0; i <= IO_BITMAP_LONGS; i++) - t->io_bitmap[i] = ~0UL; - - atomic_inc(&init_mm.mm_count); - me->active_mm = &init_mm; - if (me->mm) - BUG(); - enter_lazy_tlb(&init_mm, me); - - set_tss_desc(cpu, t); - load_TR_desc(); - load_LDT(&init_mm.context); - - /* - * Clear all 6 debug registers: - */ - - set_debug(0UL, 0); - set_debug(0UL, 1); - set_debug(0UL, 2); - set_debug(0UL, 3); - set_debug(0UL, 6); - set_debug(0UL, 7); - - fpu_init(); -} diff --git a/arch/x86_64/kernel/signal.c b/arch/x86_64/kernel/signal.c deleted file mode 100644 index 98590a989f3..00000000000 --- a/arch/x86_64/kernel/signal.c +++ /dev/null @@ -1,495 +0,0 @@ -/* - * linux/arch/x86_64/kernel/signal.c - * - * Copyright (C) 1991, 1992 Linus Torvalds - * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs - * - * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson - * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes - * 2000-2002 x86-64 support by Andi Kleen - * - * $Id: signal.c,v 1.18 2001/10/17 22:30:37 ak Exp $ - */ - -#include <linux/sched.h> -#include <linux/mm.h> -#include <linux/smp.h> -#include <linux/smp_lock.h> -#include <linux/kernel.h> -#include <linux/signal.h> -#include <linux/errno.h> -#include <linux/wait.h> -#include <linux/ptrace.h> -#include <linux/unistd.h> -#include <linux/stddef.h> -#include <linux/personality.h> -#include <linux/compiler.h> -#include <asm/ucontext.h> -#include <asm/uaccess.h> -#include <asm/i387.h> -#include <asm/proto.h> -#include <asm/ia32_unistd.h> - -/* #define DEBUG_SIG 1 */ - -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - -int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, - sigset_t *set, struct pt_regs * regs); -int ia32_setup_frame(int sig, struct k_sigaction *ka, - sigset_t *set, struct pt_regs * regs); - -asmlinkage long -sys_rt_sigsuspend(sigset_t __user *unewset, size_t sigsetsize, struct pt_regs *regs) -{ - sigset_t saveset, newset; - - /* XXX: Don't preclude handling different sized sigset_t's. */ - if (sigsetsize != sizeof(sigset_t)) - return -EINVAL; - - if (copy_from_user(&newset, unewset, sizeof(newset))) - return -EFAULT; - sigdelsetmask(&newset, ~_BLOCKABLE); - - spin_lock_irq(¤t->sighand->siglock); - saveset = current->blocked; - current->blocked = newset; - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); -#ifdef DEBUG_SIG - printk("rt_sigsuspend savset(%lx) newset(%lx) regs(%p) rip(%lx)\n", - saveset, newset, regs, regs->rip); -#endif - regs->rax = -EINTR; - while (1) { - current->state = TASK_INTERRUPTIBLE; - schedule(); - if (do_signal(regs, &saveset)) - return -EINTR; - } -} - -asmlinkage long -sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, - struct pt_regs *regs) -{ - return do_sigaltstack(uss, uoss, regs->rsp); -} - - -/* - * Do a signal return; undo the signal stack. - */ - -struct rt_sigframe -{ - char __user *pretcode; - struct ucontext uc; - struct siginfo info; -}; - -static int -restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, unsigned long *prax) -{ - unsigned int err = 0; - - /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; - -#define COPY(x) err |= __get_user(regs->x, &sc->x) - - COPY(rdi); COPY(rsi); COPY(rbp); COPY(rsp); COPY(rbx); - COPY(rdx); COPY(rcx); COPY(rip); - COPY(r8); - COPY(r9); - COPY(r10); - COPY(r11); - COPY(r12); - COPY(r13); - COPY(r14); - COPY(r15); - - { - unsigned int tmpflags; - err |= __get_user(tmpflags, &sc->eflags); - regs->eflags = (regs->eflags & ~0x40DD5) | (tmpflags & 0x40DD5); - regs->orig_rax = -1; /* disable syscall checks */ - } - - { - struct _fpstate __user * buf; - err |= __get_user(buf, &sc->fpstate); - - if (buf) { - if (!access_ok(VERIFY_READ, buf, sizeof(*buf))) - goto badframe; - err |= restore_i387(buf); - } else { - struct task_struct *me = current; - if (used_math()) { - clear_fpu(me); - clear_used_math(); - } - } - } - - err |= __get_user(*prax, &sc->rax); - return err; - -badframe: - return 1; -} - -asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) -{ - struct rt_sigframe __user *frame; - sigset_t set; - unsigned long eax; - - frame = (struct rt_sigframe __user *)(regs->rsp - 8); - if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) { - goto badframe; - } - if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) { - goto badframe; - } - - sigdelsetmask(&set, ~_BLOCKABLE); - spin_lock_irq(¤t->sighand->siglock); - current->blocked = set; - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); - - if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &eax)) - goto badframe; - -#ifdef DEBUG_SIG - printk("%d sigreturn rip:%lx rsp:%lx frame:%p rax:%lx\n",current->pid,regs.rip,regs.rsp,frame,eax); -#endif - - if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->rsp) == -EFAULT) - goto badframe; - - return eax; - -badframe: - signal_fault(regs,frame,"sigreturn"); - return 0; -} - -/* - * Set up a signal frame. - */ - -static inline int -setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, unsigned long mask, struct task_struct *me) -{ - int err = 0; - - err |= __put_user(0, &sc->gs); - err |= __put_user(0, &sc->fs); - - err |= __put_user(regs->rdi, &sc->rdi); - err |= __put_user(regs->rsi, &sc->rsi); - err |= __put_user(regs->rbp, &sc->rbp); - err |= __put_user(regs->rsp, &sc->rsp); - err |= __put_user(regs->rbx, &sc->rbx); - err |= __put_user(regs->rdx, &sc->rdx); - err |= __put_user(regs->rcx, &sc->rcx); - err |= __put_user(regs->rax, &sc->rax); - err |= __put_user(regs->r8, &sc->r8); - err |= __put_user(regs->r9, &sc->r9); - err |= __put_user(regs->r10, &sc->r10); - err |= __put_user(regs->r11, &sc->r11); - err |= __put_user(regs->r12, &sc->r12); - err |= __put_user(regs->r13, &sc->r13); - err |= __put_user(regs->r14, &sc->r14); - err |= __put_user(regs->r15, &sc->r15); - err |= __put_user(me->thread.trap_no, &sc->trapno); - err |= __put_user(me->thread.error_code, &sc->err); - err |= __put_user(regs->rip, &sc->rip); - err |= __put_user(regs->eflags, &sc->eflags); - err |= __put_user(mask, &sc->oldmask); - err |= __put_user(me->thread.cr2, &sc->cr2); - - return err; -} - -/* - * Determine which stack to use.. - */ - -static void __user * -get_stack(struct k_sigaction *ka, struct pt_regs *regs, unsigned long size) -{ - unsigned long rsp; - - /* Default to using normal stack - redzone*/ - rsp = regs->rsp - 128; - - /* This is the X/Open sanctioned signal stack switching. */ - /* RED-PEN: redzone on that stack? */ - if (ka->sa.sa_flags & SA_ONSTACK) { - if (sas_ss_flags(rsp) == 0) - rsp = current->sas_ss_sp + current->sas_ss_size; - } - - return (void __user *)round_down(rsp - size, 16); -} - -static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, - sigset_t *set, struct pt_regs * regs) -{ - struct rt_sigframe __user *frame; - struct _fpstate __user *fp = NULL; - int err = 0; - struct task_struct *me = current; - - if (used_math()) { - fp = get_stack(ka, regs, sizeof(struct _fpstate)); - frame = (void __user *)round_down( - (unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8; - - if (!access_ok(VERIFY_WRITE, fp, sizeof(struct _fpstate))) - goto give_sigsegv; - - if (save_i387(fp) < 0) - err |= -1; - } else - frame = get_stack(ka, regs, sizeof(struct rt_sigframe)) - 8; - - if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) - goto give_sigsegv; - - if (ka->sa.sa_flags & SA_SIGINFO) { - err |= copy_siginfo_to_user(&frame->info, info); - if (err) - goto give_sigsegv; - } - - /* Create the ucontext. */ - err |= __put_user(0, &frame->uc.uc_flags); - err |= __put_user(0, &frame->uc.uc_link); - err |= __put_user(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp); - err |= __put_user(sas_ss_flags(regs->rsp), - &frame->uc.uc_stack.ss_flags); - err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size); - err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, set->sig[0], me); - err |= __put_user(fp, &frame->uc.uc_mcontext.fpstate); - if (sizeof(*set) == 16) { - __put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]); - __put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]); - } else - err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); - - /* Set up to return from userspace. If provided, use a stub - already in userspace. */ - /* x86-64 should always use SA_RESTORER. */ - if (ka->sa.sa_flags & SA_RESTORER) { - err |= __put_user(ka->sa.sa_restorer, &frame->pretcode); - } else { - /* could use a vstub here */ - goto give_sigsegv; - } - - if (err) - goto give_sigsegv; - -#ifdef DEBUG_SIG - printk("%d old rip %lx old rsp %lx old rax %lx\n", current->pid,regs->rip,regs->rsp,regs->rax); -#endif - - /* Set up registers for signal handler */ - { - struct exec_domain *ed = current_thread_info()->exec_domain; - if (unlikely(ed && ed->signal_invmap && sig < 32)) - sig = ed->signal_invmap[sig]; - } - regs->rdi = sig; - /* In case the signal handler was declared without prototypes */ - regs->rax = 0; - - /* This also works for non SA_SIGINFO handlers because they expect the - next argument after the signal number on the stack. */ - regs->rsi = (unsigned long)&frame->info; - regs->rdx = (unsigned long)&frame->uc; - regs->rip = (unsigned long) ka->sa.sa_handler; - - regs->rsp = (unsigned long)frame; - - set_fs(USER_DS); - regs->eflags &= ~TF_MASK; - if (test_thread_flag(TIF_SINGLESTEP)) - ptrace_notify(SIGTRAP); -#ifdef DEBUG_SIG - printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n", - current->comm, current->pid, frame, regs->rip, frame->pretcode); -#endif - - return 1; - -give_sigsegv: - force_sigsegv(sig, current); - return 0; -} - -/* - * OK, we're invoking a handler - */ - -static int -handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, - sigset_t *oldset, struct pt_regs *regs) -{ - int ret; - -#ifdef DEBUG_SIG - printk("handle_signal pid:%d sig:%lu rip:%lx rsp:%lx regs=%p\n", - current->pid, sig, - regs->rip, regs->rsp, regs); -#endif - - /* Are we from a system call? */ - if ((long)regs->orig_rax >= 0) { - /* If so, check system call restarting.. */ - switch (regs->rax) { - case -ERESTART_RESTARTBLOCK: - case -ERESTARTNOHAND: - regs->rax = -EINTR; - break; - - case -ERESTARTSYS: - if (!(ka->sa.sa_flags & SA_RESTART)) { - regs->rax = -EINTR; - break; - } - /* fallthrough */ - case -ERESTARTNOINTR: - regs->rax = regs->orig_rax; - regs->rip -= 2; - break; - } - } - - /* - * If TF is set due to a debugger (PT_DTRACE), clear the TF - * flag so that register information in the sigcontext is - * correct. - */ - if (unlikely(regs->eflags & TF_MASK)) { - if (likely(current->ptrace & PT_DTRACE)) { - current->ptrace &= ~PT_DTRACE; - regs->eflags &= ~TF_MASK; - } - } - -#ifdef CONFIG_IA32_EMULATION - if (test_thread_flag(TIF_IA32)) { - if (ka->sa.sa_flags & SA_SIGINFO) - ret = ia32_setup_rt_frame(sig, ka, info, oldset, regs); - else - ret = ia32_setup_frame(sig, ka, oldset, regs); - } else -#endif - ret = setup_rt_frame(sig, ka, info, oldset, regs); - - if (ret && !(ka->sa.sa_flags & SA_NODEFER)) { - spin_lock_irq(¤t->sighand->siglock); - sigorsets(¤t->blocked,¤t->blocked,&ka->sa.sa_mask); - sigaddset(¤t->blocked,sig); - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); - } - - return ret; -} - -/* - * Note that 'init' is a special process: it doesn't get signals it doesn't - * want to handle. Thus you cannot kill init even with a SIGKILL even by - * mistake. - */ -int do_signal(struct pt_regs *regs, sigset_t *oldset) -{ - struct k_sigaction ka; - siginfo_t info; - int signr; - - /* - * We want the common case to go fast, which - * is why we may in certain cases get here from - * kernel mode. Just return without doing anything - * if so. - */ - if (!user_mode(regs)) - return 1; - - if (try_to_freeze()) - goto no_signal; - - if (!oldset) - oldset = ¤t->blocked; - - signr = get_signal_to_deliver(&info, &ka, regs, NULL); - if (signr > 0) { - /* Reenable any watchpoints before delivering the - * signal to user space. The processor register will - * have been cleared if the watchpoint triggered - * inside the kernel. - */ - if (current->thread.debugreg7) - set_debugreg(current->thread.debugreg7, 7); - - /* Whee! Actually deliver the signal. */ - return handle_signal(signr, &info, &ka, oldset, regs); - } - - no_signal: - /* Did we come from a system call? */ - if ((long)regs->orig_rax >= 0) { - /* Restart the system call - no handlers present */ - long res = regs->rax; - if (res == -ERESTARTNOHAND || - res == -ERESTARTSYS || - res == -ERESTARTNOINTR) { - regs->rax = regs->orig_rax; - regs->rip -= 2; - } - if (regs->rax == (unsigned long)-ERESTART_RESTARTBLOCK) { - regs->rax = test_thread_flag(TIF_IA32) ? - __NR_ia32_restart_syscall : - __NR_restart_syscall; - regs->rip -= 2; - } - } - return 0; -} - -void do_notify_resume(struct pt_regs *regs, sigset_t *oldset, __u32 thread_info_flags) -{ -#ifdef DEBUG_SIG - printk("do_notify_resume flags:%x rip:%lx rsp:%lx caller:%lx pending:%lx\n", - thread_info_flags, regs->rip, regs->rsp, __builtin_return_address(0),signal_pending(current)); -#endif - - /* Pending single-step? */ - if (thread_info_flags & _TIF_SINGLESTEP) { - regs->eflags |= TF_MASK; - clear_thread_flag(TIF_SINGLESTEP); - } - - /* deal with pending signal delivery */ - if (thread_info_flags & _TIF_SIGPENDING) - do_signal(regs,oldset); -} - -void signal_fault(struct pt_regs *regs, void __user *frame, char *where) -{ - struct task_struct *me = current; - if (exception_trace) - printk("%s[%d] bad frame in %s frame:%p rip:%lx rsp:%lx orax:%lx\n", - me->comm,me->pid,where,frame,regs->rip,regs->rsp,regs->orig_rax); - - force_sig(SIGSEGV, me); -} diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c deleted file mode 100644 index 6ee74db5230..00000000000 --- a/arch/x86_64/kernel/smp.c +++ /dev/null @@ -1,449 +0,0 @@ -/* - * Intel SMP support routines. - * - * (c) 1995 Alan Cox, Building #3 <alan@redhat.com> - * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com> - * (c) 2002,2003 Andi Kleen, SuSE Labs. - * - * This code is released under the GNU General Public License version 2 or - * later. - */ - -#include <linux/init.h> - -#include <linux/mm.h> -#include <linux/irq.h> -#include <linux/delay.h> -#include <linux/spinlock.h> -#include <linux/smp_lock.h> -#include <linux/smp.h> -#include <linux/kernel_stat.h> -#include <linux/mc146818rtc.h> -#include <linux/interrupt.h> - -#include <asm/mtrr.h> -#include <asm/pgalloc.h> -#include <asm/tlbflush.h> -#include <asm/mach_apic.h> -#include <asm/mmu_context.h> -#include <asm/proto.h> -#include <asm/apicdef.h> - -/* - * Smarter SMP flushing macros. - * c/o Linus Torvalds. - * - * These mean you can really definitely utterly forget about - * writing to user space from interrupts. (Its not allowed anyway). - * - * Optimizations Manfred Spraul <manfred@colorfullife.com> - */ - -static cpumask_t flush_cpumask; -static struct mm_struct * flush_mm; -static unsigned long flush_va; -static DEFINE_SPINLOCK(tlbstate_lock); -#define FLUSH_ALL -1ULL - -/* - * We cannot call mmdrop() because we are in interrupt context, - * instead update mm->cpu_vm_mask. - */ -static inline void leave_mm (unsigned long cpu) -{ - if (read_pda(mmu_state) == TLBSTATE_OK) - BUG(); - clear_bit(cpu, &read_pda(active_mm)->cpu_vm_mask); - load_cr3(swapper_pg_dir); -} - -/* - * - * The flush IPI assumes that a thread switch happens in this order: - * [cpu0: the cpu that switches] - * 1) switch_mm() either 1a) or 1b) - * 1a) thread switch to a different mm - * 1a1) clear_bit(cpu, &old_mm->cpu_vm_mask); - * Stop ipi delivery for the old mm. This is not synchronized with - * the other cpus, but smp_invalidate_interrupt ignore flush ipis - * for the wrong mm, and in the worst case we perform a superfluous - * tlb flush. - * 1a2) set cpu mmu_state to TLBSTATE_OK - * Now the smp_invalidate_interrupt won't call leave_mm if cpu0 - * was in lazy tlb mode. - * 1a3) update cpu active_mm - * Now cpu0 accepts tlb flushes for the new mm. - * 1a4) set_bit(cpu, &new_mm->cpu_vm_mask); - * Now the other cpus will send tlb flush ipis. - * 1a4) change cr3. - * 1b) thread switch without mm change - * cpu active_mm is correct, cpu0 already handles - * flush ipis. - * 1b1) set cpu mmu_state to TLBSTATE_OK - * 1b2) test_and_set the cpu bit in cpu_vm_mask. - * Atomically set the bit [other cpus will start sending flush ipis], - * and test the bit. - * 1b3) if the bit was 0: leave_mm was called, flush the tlb. - * 2) switch %%esp, ie current - * - * The interrupt must handle 2 special cases: - * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm. - * - the cpu performs speculative tlb reads, i.e. even if the cpu only - * runs in kernel space, the cpu could load tlb entries for user space - * pages. - * - * The good news is that cpu mmu_state is local to each cpu, no - * write/read ordering problems. - */ - -/* - * TLB flush IPI: - * - * 1) Flush the tlb entries if the cpu uses the mm that's being flushed. - * 2) Leave the mm if we are in the lazy tlb mode. - */ - -asmlinkage void smp_invalidate_interrupt (void) -{ - unsigned long cpu; - - cpu = get_cpu(); - - if (!cpu_isset(cpu, flush_cpumask)) - goto out; - /* - * This was a BUG() but until someone can quote me the - * line from the intel manual that guarantees an IPI to - * multiple CPUs is retried _only_ on the erroring CPUs - * its staying as a return - * - * BUG(); - */ - - if (flush_mm == read_pda(active_mm)) { - if (read_pda(mmu_state) == TLBSTATE_OK) { - if (flush_va == FLUSH_ALL) - local_flush_tlb(); - else - __flush_tlb_one(flush_va); - } else - leave_mm(cpu); - } -out: - ack_APIC_irq(); - cpu_clear(cpu, flush_cpumask); - put_cpu_no_resched(); -} - -static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm, - unsigned long va) -{ - cpumask_t tmp; - /* - * A couple of (to be removed) sanity checks: - * - * - we do not send IPIs to not-yet booted CPUs. - * - current CPU must not be in mask - * - mask must exist :) - */ - BUG_ON(cpus_empty(cpumask)); - cpus_and(tmp, cpumask, cpu_online_map); - BUG_ON(!cpus_equal(tmp, cpumask)); - BUG_ON(cpu_isset(smp_processor_id(), cpumask)); - if (!mm) - BUG(); - - /* - * I'm not happy about this global shared spinlock in the - * MM hot path, but we'll see how contended it is. - * Temporarily this turns IRQs off, so that lockups are - * detected by the NMI watchdog. - */ - spin_lock(&tlbstate_lock); - - flush_mm = mm; - flush_va = va; - cpus_or(flush_cpumask, cpumask, flush_cpumask); - - /* - * We have to send the IPI only to - * CPUs affected. - */ - send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR); - - while (!cpus_empty(flush_cpumask)) - mb(); /* nothing. lockup detection does not belong here */; - - flush_mm = NULL; - flush_va = 0; - spin_unlock(&tlbstate_lock); -} - -void flush_tlb_current_task(void) -{ - struct mm_struct *mm = current->mm; - cpumask_t cpu_mask; - - preempt_disable(); - cpu_mask = mm->cpu_vm_mask; - cpu_clear(smp_processor_id(), cpu_mask); - - local_flush_tlb(); - if (!cpus_empty(cpu_mask)) - flush_tlb_others(cpu_mask, mm, FLUSH_ALL); - preempt_enable(); -} - -void flush_tlb_mm (struct mm_struct * mm) -{ - cpumask_t cpu_mask; - - preempt_disable(); - cpu_mask = mm->cpu_vm_mask; - cpu_clear(smp_processor_id(), cpu_mask); - - if (current->active_mm == mm) { - if (current->mm) - local_flush_tlb(); - else - leave_mm(smp_processor_id()); - } - if (!cpus_empty(cpu_mask)) - flush_tlb_others(cpu_mask, mm, FLUSH_ALL); - - preempt_enable(); -} - -void flush_tlb_page(struct vm_area_struct * vma, unsigned long va) -{ - struct mm_struct *mm = vma->vm_mm; - cpumask_t cpu_mask; - - preempt_disable(); - cpu_mask = mm->cpu_vm_mask; - cpu_clear(smp_processor_id(), cpu_mask); - - if (current->active_mm == mm) { - if(current->mm) - __flush_tlb_one(va); - else - leave_mm(smp_processor_id()); - } - - if (!cpus_empty(cpu_mask)) - flush_tlb_others(cpu_mask, mm, va); - - preempt_enable(); -} - -static void do_flush_tlb_all(void* info) -{ - unsigned long cpu = smp_processor_id(); - - __flush_tlb_all(); - if (read_pda(mmu_state) == TLBSTATE_LAZY) - leave_mm(cpu); -} - -void flush_tlb_all(void) -{ - on_each_cpu(do_flush_tlb_all, NULL, 1, 1); -} - -void smp_kdb_stop(void) -{ - send_IPI_allbutself(KDB_VECTOR); -} - -/* - * this function sends a 'reschedule' IPI to another CPU. - * it goes straight through and wastes no time serializing - * anything. Worst case is that we lose a reschedule ... - */ - -void smp_send_reschedule(int cpu) -{ - send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR); -} - -/* - * Structure and data for smp_call_function(). This is designed to minimise - * static memory requirements. It also looks cleaner. - */ -static DEFINE_SPINLOCK(call_lock); - -struct call_data_struct { - void (*func) (void *info); - void *info; - atomic_t started; - atomic_t finished; - int wait; -}; - -static struct call_data_struct * call_data; - -void lock_ipi_call_lock(void) -{ - spin_lock_irq(&call_lock); -} - -void unlock_ipi_call_lock(void) -{ - spin_unlock_irq(&call_lock); -} - -/* - * this function sends a 'generic call function' IPI to all other CPUs - * in the system. - */ -static void __smp_call_function (void (*func) (void *info), void *info, - int nonatomic, int wait) -{ - struct call_data_struct data; - int cpus = num_online_cpus()-1; - - if (!cpus) - return; - - data.func = func; - data.info = info; - atomic_set(&data.started, 0); - data.wait = wait; - if (wait) - atomic_set(&data.finished, 0); - - call_data = &data; - wmb(); - /* Send a message to all other CPUs and wait for them to respond */ - send_IPI_allbutself(CALL_FUNCTION_VECTOR); - - /* Wait for response */ - while (atomic_read(&data.started) != cpus) - cpu_relax(); - - if (!wait) - return; - - while (atomic_read(&data.finished) != cpus) - cpu_relax(); -} - -/* - * smp_call_function - run a function on all other CPUs. - * @func: The function to run. This must be fast and non-blocking. - * @info: An arbitrary pointer to pass to the function. - * @nonatomic: currently unused. - * @wait: If true, wait (atomically) until function has completed on other - * CPUs. - * - * Returns 0 on success, else a negative status code. Does not return until - * remote CPUs are nearly ready to execute func or are or have executed. - * - * You must not call this function with disabled interrupts or from a - * hardware interrupt handler or from a bottom half handler. - * Actually there are a few legal cases, like panic. - */ -int smp_call_function (void (*func) (void *info), void *info, int nonatomic, - int wait) -{ - spin_lock(&call_lock); - __smp_call_function(func,info,nonatomic,wait); - spin_unlock(&call_lock); - return 0; -} - -void smp_stop_cpu(void) -{ - /* - * Remove this CPU: - */ - cpu_clear(smp_processor_id(), cpu_online_map); - local_irq_disable(); - disable_local_APIC(); - local_irq_enable(); -} - -static void smp_really_stop_cpu(void *dummy) -{ - smp_stop_cpu(); - for (;;) - asm("hlt"); -} - -void smp_send_stop(void) -{ - int nolock = 0; - if (reboot_force) - return; - /* Don't deadlock on the call lock in panic */ - if (!spin_trylock(&call_lock)) { - /* ignore locking because we have paniced anyways */ - nolock = 1; - } - __smp_call_function(smp_really_stop_cpu, NULL, 0, 0); - if (!nolock) - spin_unlock(&call_lock); - - local_irq_disable(); - disable_local_APIC(); - local_irq_enable(); -} - -/* - * Reschedule call back. Nothing to do, - * all the work is done automatically when - * we return from the interrupt. - */ -asmlinkage void smp_reschedule_interrupt(void) -{ - ack_APIC_irq(); -} - -asmlinkage void smp_call_function_interrupt(void) -{ - void (*func) (void *info) = call_data->func; - void *info = call_data->info; - int wait = call_data->wait; - - ack_APIC_irq(); - /* - * Notify initiating CPU that I've grabbed the data and am - * about to execute the function - */ - mb(); - atomic_inc(&call_data->started); - /* - * At this point the info structure may be out of scope unless wait==1 - */ - irq_enter(); - (*func)(info); - irq_exit(); - if (wait) { - mb(); - atomic_inc(&call_data->finished); - } -} - -int safe_smp_processor_id(void) -{ - int apicid, i; - - if (disable_apic) - return 0; - - apicid = hard_smp_processor_id(); - if (x86_cpu_to_apicid[apicid] == apicid) - return apicid; - - for (i = 0; i < NR_CPUS; ++i) { - if (x86_cpu_to_apicid[i] == apicid) - return i; - } - - /* No entries in x86_cpu_to_apicid? Either no MPS|ACPI, - * or called too early. Either way, we must be CPU 0. */ - if (x86_cpu_to_apicid[0] == BAD_APICID) - return 0; - - return 0; /* Should not happen */ -} diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c deleted file mode 100644 index 6d23354443c..00000000000 --- a/arch/x86_64/kernel/smpboot.c +++ /dev/null @@ -1,1203 +0,0 @@ -/* - * x86 SMP booting functions - * - * (c) 1995 Alan Cox, Building #3 <alan@redhat.com> - * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com> - * Copyright 2001 Andi Kleen, SuSE Labs. - * - * Much of the core SMP work is based on previous work by Thomas Radke, to - * whom a great many thanks are extended. - * - * Thanks to Intel for making available several different Pentium, - * Pentium Pro and Pentium-II/Xeon MP machines. - * Original development of Linux SMP code supported by Caldera. - * - * This code is released under the GNU General Public License version 2 - * - * Fixes - * Felix Koop : NR_CPUS used properly - * Jose Renau : Handle single CPU case. - * Alan Cox : By repeated request 8) - Total BogoMIP report. - * Greg Wright : Fix for kernel stacks panic. - * Erich Boleyn : MP v1.4 and additional changes. - * Matthias Sattler : Changes for 2.1 kernel map. - * Michel Lespinasse : Changes for 2.1 kernel map. - * Michael Chastain : Change trampoline.S to gnu as. - * Alan Cox : Dumb bug: 'B' step PPro's are fine - * Ingo Molnar : Added APIC timers, based on code - * from Jose Renau - * Ingo Molnar : various cleanups and rewrites - * Tigran Aivazian : fixed "0.00 in /proc/uptime on SMP" bug. - * Maciej W. Rozycki : Bits for genuine 82489DX APICs - * Andi Kleen : Changed for SMP boot into long mode. - * Rusty Russell : Hacked into shape for new "hotplug" boot process. - * Andi Kleen : Converted to new state machine. - * Various cleanups. - * Probably mostly hotplug CPU ready now. - * Ashok Raj : CPU hotplug support - */ - - -#include <linux/config.h> -#include <linux/init.h> - -#include <linux/mm.h> -#include <linux/kernel_stat.h> -#include <linux/smp_lock.h> -#include <linux/irq.h> -#include <linux/bootmem.h> -#include <linux/thread_info.h> -#include <linux/module.h> - -#include <linux/delay.h> -#include <linux/mc146818rtc.h> -#include <asm/mtrr.h> -#include <asm/pgalloc.h> -#include <asm/desc.h> -#include <asm/kdebug.h> -#include <asm/tlbflush.h> -#include <asm/proto.h> -#include <asm/nmi.h> - -/* Number of siblings per CPU package */ -int smp_num_siblings = 1; -/* Package ID of each logical CPU */ -u8 phys_proc_id[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; -u8 cpu_core_id[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; -EXPORT_SYMBOL(phys_proc_id); -EXPORT_SYMBOL(cpu_core_id); - -/* Bitmask of currently online CPUs */ -cpumask_t cpu_online_map; - -EXPORT_SYMBOL(cpu_online_map); - -/* - * Private maps to synchronize booting between AP and BP. - * Probably not needed anymore, but it makes for easier debugging. -AK - */ -cpumask_t cpu_callin_map; -cpumask_t cpu_callout_map; - -cpumask_t cpu_possible_map; -EXPORT_SYMBOL(cpu_possible_map); - -/* Per CPU bogomips and other parameters */ -struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned; - -/* Set when the idlers are all forked */ -int smp_threads_ready; - -cpumask_t cpu_sibling_map[NR_CPUS] __cacheline_aligned; -cpumask_t cpu_core_map[NR_CPUS] __cacheline_aligned; -EXPORT_SYMBOL(cpu_core_map); - -/* - * Trampoline 80x86 program as an array. - */ - -extern unsigned char trampoline_data[]; -extern unsigned char trampoline_end[]; - -/* State of each CPU */ -DEFINE_PER_CPU(int, cpu_state) = { 0 }; - -/* - * Store all idle threads, this can be reused instead of creating - * a new thread. Also avoids complicated thread destroy functionality - * for idle threads. - */ -struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ; - -#define get_idle_for_cpu(x) (idle_thread_array[(x)]) -#define set_idle_for_cpu(x,p) (idle_thread_array[(x)] = (p)) - -/* - * Currently trivial. Write the real->protected mode - * bootstrap into the page concerned. The caller - * has made sure it's suitably aligned. - */ - -static unsigned long __cpuinit setup_trampoline(void) -{ - void *tramp = __va(SMP_TRAMPOLINE_BASE); - memcpy(tramp, trampoline_data, trampoline_end - trampoline_data); - return virt_to_phys(tramp); -} - -/* - * The bootstrap kernel entry code has set these up. Save them for - * a given CPU - */ - -static void __cpuinit smp_store_cpu_info(int id) -{ - struct cpuinfo_x86 *c = cpu_data + id; - - *c = boot_cpu_data; - identify_cpu(c); - print_cpu_info(c); -} - -/* - * New Funky TSC sync algorithm borrowed from IA64. - * Main advantage is that it doesn't reset the TSCs fully and - * in general looks more robust and it works better than my earlier - * attempts. I believe it was written by David Mosberger. Some minor - * adjustments for x86-64 by me -AK - * - * Original comment reproduced below. - * - * Synchronize TSC of the current (slave) CPU with the TSC of the - * MASTER CPU (normally the time-keeper CPU). We use a closed loop to - * eliminate the possibility of unaccounted-for errors (such as - * getting a machine check in the middle of a calibration step). The - * basic idea is for the slave to ask the master what itc value it has - * and to read its own itc before and after the master responds. Each - * iteration gives us three timestamps: - * - * slave master - * - * t0 ---\ - * ---\ - * ---> - * tm - * /--- - * /--- - * t1 <--- - * - * - * The goal is to adjust the slave's TSC such that tm falls exactly - * half-way between t0 and t1. If we achieve this, the clocks are - * synchronized provided the interconnect between the slave and the - * master is symmetric. Even if the interconnect were asymmetric, we - * would still know that the synchronization error is smaller than the - * roundtrip latency (t0 - t1). - * - * When the interconnect is quiet and symmetric, this lets us - * synchronize the TSC to within one or two cycles. However, we can - * only *guarantee* that the synchronization is accurate to within a - * round-trip time, which is typically in the range of several hundred - * cycles (e.g., ~500 cycles). In practice, this means that the TSCs - * are usually almost perfectly synchronized, but we shouldn't assume - * that the accuracy is much better than half a micro second or so. - * - * [there are other errors like the latency of RDTSC and of the - * WRMSR. These can also account to hundreds of cycles. So it's - * probably worse. It claims 153 cycles error on a dual Opteron, - * but I suspect the numbers are actually somewhat worse -AK] - */ - -#define MASTER 0 -#define SLAVE (SMP_CACHE_BYTES/8) - -/* Intentionally don't use cpu_relax() while TSC synchronization - because we don't want to go into funky power save modi or cause - hypervisors to schedule us away. Going to sleep would likely affect - latency and low latency is the primary objective here. -AK */ -#define no_cpu_relax() barrier() - -static __cpuinitdata DEFINE_SPINLOCK(tsc_sync_lock); -static volatile __cpuinitdata unsigned long go[SLAVE + 1]; -static int notscsync __cpuinitdata; - -#undef DEBUG_TSC_SYNC - -#define NUM_ROUNDS 64 /* magic value */ -#define NUM_ITERS 5 /* likewise */ - -/* Callback on boot CPU */ -static __cpuinit void sync_master(void *arg) -{ - unsigned long flags, i; - - if (smp_processor_id() != 0) - return; - - go[MASTER] = 0; - - local_irq_save(flags); - { - for (i = 0; i < NUM_ROUNDS*NUM_ITERS; ++i) { - while (!go[MASTER]) - no_cpu_relax(); - go[MASTER] = 0; - rdtscll(go[SLAVE]); - } - } - local_irq_restore(flags); -} - -/* - * Return the number of cycles by which our tsc differs from the tsc - * on the master (time-keeper) CPU. A positive number indicates our - * tsc is ahead of the master, negative that it is behind. - */ -static inline long -get_delta(long *rt, long *master) -{ - unsigned long best_t0 = 0, best_t1 = ~0UL, best_tm = 0; - unsigned long tcenter, t0, t1, tm; - int i; - - for (i = 0; i < NUM_ITERS; ++i) { - rdtscll(t0); - go[MASTER] = 1; - while (!(tm = go[SLAVE])) - no_cpu_relax(); - go[SLAVE] = 0; - rdtscll(t1); - - if (t1 - t0 < best_t1 - best_t0) - best_t0 = t0, best_t1 = t1, best_tm = tm; - } - - *rt = best_t1 - best_t0; - *master = best_tm - best_t0; - - /* average best_t0 and best_t1 without overflow: */ - tcenter = (best_t0/2 + best_t1/2); - if (best_t0 % 2 + best_t1 % 2 == 2) - ++tcenter; - return tcenter - best_tm; -} - -static __cpuinit void sync_tsc(void) -{ - int i, done = 0; - long delta, adj, adjust_latency = 0; - unsigned long flags, rt, master_time_stamp, bound; -#ifdef DEBUG_TSC_SYNC - static struct syncdebug { - long rt; /* roundtrip time */ - long master; /* master's timestamp */ - long diff; /* difference between midpoint and master's timestamp */ - long lat; /* estimate of tsc adjustment latency */ - } t[NUM_ROUNDS] __cpuinitdata; -#endif - - go[MASTER] = 1; - - smp_call_function(sync_master, NULL, 1, 0); - - while (go[MASTER]) /* wait for master to be ready */ - no_cpu_relax(); - - spin_lock_irqsave(&tsc_sync_lock, flags); - { - for (i = 0; i < NUM_ROUNDS; ++i) { - delta = get_delta(&rt, &master_time_stamp); - if (delta == 0) { - done = 1; /* let's lock on to this... */ - bound = rt; - } - - if (!done) { - unsigned long t; - if (i > 0) { - adjust_latency += -delta; - adj = -delta + adjust_latency/4; - } else - adj = -delta; - - rdtscll(t); - wrmsrl(MSR_IA32_TSC, t + adj); - } -#ifdef DEBUG_TSC_SYNC - t[i].rt = rt; - t[i].master = master_time_stamp; - t[i].diff = delta; - t[i].lat = adjust_latency/4; -#endif - } - } - spin_unlock_irqrestore(&tsc_sync_lock, flags); - -#ifdef DEBUG_TSC_SYNC - for (i = 0; i < NUM_ROUNDS; ++i) - printk("rt=%5ld master=%5ld diff=%5ld adjlat=%5ld\n", - t[i].rt, t[i].master, t[i].diff, t[i].lat); -#endif - - printk(KERN_INFO - "CPU %d: synchronized TSC with CPU %u (last diff %ld cycles, " - "maxerr %lu cycles)\n", - smp_processor_id(), boot_cpu_id, delta, rt); -} - -static void __cpuinit tsc_sync_wait(void) -{ - if (notscsync || !cpu_has_tsc) - return; - printk(KERN_INFO "CPU %d: Syncing TSC to CPU %u.\n", smp_processor_id(), - boot_cpu_id); - sync_tsc(); -} - -static __init int notscsync_setup(char *s) -{ - notscsync = 1; - return 0; -} -__setup("notscsync", notscsync_setup); - -static atomic_t init_deasserted __cpuinitdata; - -/* - * Report back to the Boot Processor. - * Running on AP. - */ -void __cpuinit smp_callin(void) -{ - int cpuid, phys_id; - unsigned long timeout; - - /* - * If waken up by an INIT in an 82489DX configuration - * we may get here before an INIT-deassert IPI reaches - * our local APIC. We have to wait for the IPI or we'll - * lock up on an APIC access. - */ - while (!atomic_read(&init_deasserted)) - cpu_relax(); - - /* - * (This works even if the APIC is not enabled.) - */ - phys_id = GET_APIC_ID(apic_read(APIC_ID)); - cpuid = smp_processor_id(); - if (cpu_isset(cpuid, cpu_callin_map)) { - panic("smp_callin: phys CPU#%d, CPU#%d already present??\n", - phys_id, cpuid); - } - Dprintk("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id); - - /* - * STARTUP IPIs are fragile beasts as they might sometimes - * trigger some glue motherboard logic. Complete APIC bus - * silence for 1 second, this overestimates the time the - * boot CPU is spending to send the up to 2 STARTUP IPIs - * by a factor of two. This should be enough. - */ - - /* - * Waiting 2s total for startup (udelay is not yet working) - */ - timeout = jiffies + 2*HZ; - while (time_before(jiffies, timeout)) { - /* - * Has the boot CPU finished it's STARTUP sequence? - */ - if (cpu_isset(cpuid, cpu_callout_map)) - break; - cpu_relax(); - } - - if (!time_before(jiffies, timeout)) { - panic("smp_callin: CPU%d started up but did not get a callout!\n", - cpuid); - } - - /* - * the boot CPU has finished the init stage and is spinning - * on callin_map until we finish. We are free to set up this - * CPU, first the APIC. (this is probably redundant on most - * boards) - */ - - Dprintk("CALLIN, before setup_local_APIC().\n"); - setup_local_APIC(); - - /* - * Get our bogomips. - */ - calibrate_delay(); - Dprintk("Stack at about %p\n",&cpuid); - - disable_APIC_timer(); - - /* - * Save our processor parameters - */ - smp_store_cpu_info(cpuid); - - /* - * Allow the master to continue. - */ - cpu_set(cpuid, cpu_callin_map); -} - -static inline void set_cpu_sibling_map(int cpu) -{ - int i; - - if (smp_num_siblings > 1) { - for_each_cpu(i) { - if (cpu_core_id[cpu] == cpu_core_id[i]) { - cpu_set(i, cpu_sibling_map[cpu]); - cpu_set(cpu, cpu_sibling_map[i]); - } - } - } else { - cpu_set(cpu, cpu_sibling_map[cpu]); - } - - if (current_cpu_data.x86_num_cores > 1) { - for_each_cpu(i) { - if (phys_proc_id[cpu] == phys_proc_id[i]) { - cpu_set(i, cpu_core_map[cpu]); - cpu_set(cpu, cpu_core_map[i]); - } - } - } else { - cpu_core_map[cpu] = cpu_sibling_map[cpu]; - } -} - -/* - * Setup code on secondary processor (after comming out of the trampoline) - */ -void __cpuinit start_secondary(void) -{ - /* - * Dont put anything before smp_callin(), SMP - * booting is too fragile that we want to limit the - * things done here to the most necessary things. - */ - cpu_init(); - smp_callin(); - - /* otherwise gcc will move up the smp_processor_id before the cpu_init */ - barrier(); - - Dprintk("cpu %d: setting up apic clock\n", smp_processor_id()); - setup_secondary_APIC_clock(); - - Dprintk("cpu %d: enabling apic timer\n", smp_processor_id()); - - if (nmi_watchdog == NMI_IO_APIC) { - disable_8259A_irq(0); - enable_NMI_through_LVT0(NULL); - enable_8259A_irq(0); - } - - enable_APIC_timer(); - - /* - * The sibling maps must be set before turing the online map on for - * this cpu - */ - set_cpu_sibling_map(smp_processor_id()); - - /* - * We need to hold call_lock, so there is no inconsistency - * between the time smp_call_function() determines number of - * IPI receipients, and the time when the determination is made - * for which cpus receive the IPI in genapic_flat.c. Holding this - * lock helps us to not include this cpu in a currently in progress - * smp_call_function(). - */ - lock_ipi_call_lock(); - - /* - * Allow the master to continue. - */ - cpu_set(smp_processor_id(), cpu_online_map); - per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; - unlock_ipi_call_lock(); - - mb(); - - /* Wait for TSC sync to not schedule things before. - We still process interrupts, which could see an inconsistent - time in that window unfortunately. */ - tsc_sync_wait(); - - cpu_idle(); -} - -extern volatile unsigned long init_rsp; -extern void (*initial_code)(void); - -#ifdef APIC_DEBUG -static void inquire_remote_apic(int apicid) -{ - unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 }; - char *names[] = { "ID", "VERSION", "SPIV" }; - int timeout, status; - - printk(KERN_INFO "Inquiring remote APIC #%d...\n", apicid); - - for (i = 0; i < sizeof(regs) / sizeof(*regs); i++) { - printk("... APIC #%d %s: ", apicid, names[i]); - - /* - * Wait for idle. - */ - apic_wait_icr_idle(); - - apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(apicid)); - apic_write_around(APIC_ICR, APIC_DM_REMRD | regs[i]); - - timeout = 0; - do { - udelay(100); - status = apic_read(APIC_ICR) & APIC_ICR_RR_MASK; - } while (status == APIC_ICR_RR_INPROG && timeout++ < 1000); - - switch (status) { - case APIC_ICR_RR_VALID: - status = apic_read(APIC_RRR); - printk("%08x\n", status); - break; - default: - printk("failed\n"); - } - } -} -#endif - -/* - * Kick the secondary to wake up. - */ -static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int start_rip) -{ - unsigned long send_status = 0, accept_status = 0; - int maxlvt, timeout, num_starts, j; - - Dprintk("Asserting INIT.\n"); - - /* - * Turn INIT on target chip - */ - apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); - - /* - * Send IPI - */ - apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT - | APIC_DM_INIT); - - Dprintk("Waiting for send to finish...\n"); - timeout = 0; - do { - Dprintk("+"); - udelay(100); - send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; - } while (send_status && (timeout++ < 1000)); - - mdelay(10); - - Dprintk("Deasserting INIT.\n"); - - /* Target chip */ - apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); - - /* Send IPI */ - apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT); - - Dprintk("Waiting for send to finish...\n"); - timeout = 0; - do { - Dprintk("+"); - udelay(100); - send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; - } while (send_status && (timeout++ < 1000)); - - atomic_set(&init_deasserted, 1); - - /* - * Should we send STARTUP IPIs ? - * - * Determine this based on the APIC version. - * If we don't have an integrated APIC, don't send the STARTUP IPIs. - */ - if (APIC_INTEGRATED(apic_version[phys_apicid])) - num_starts = 2; - else - num_starts = 0; - - /* - * Run STARTUP IPI loop. - */ - Dprintk("#startup loops: %d.\n", num_starts); - - maxlvt = get_maxlvt(); - - for (j = 1; j <= num_starts; j++) { - Dprintk("Sending STARTUP #%d.\n",j); - apic_read_around(APIC_SPIV); - apic_write(APIC_ESR, 0); - apic_read(APIC_ESR); - Dprintk("After apic_write.\n"); - - /* - * STARTUP IPI - */ - - /* Target chip */ - apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); - - /* Boot on the stack */ - /* Kick the second */ - apic_write_around(APIC_ICR, APIC_DM_STARTUP - | (start_rip >> 12)); - - /* - * Give the other CPU some time to accept the IPI. - */ - udelay(300); - - Dprintk("Startup point 1.\n"); - - Dprintk("Waiting for send to finish...\n"); - timeout = 0; - do { - Dprintk("+"); - udelay(100); - send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; - } while (send_status && (timeout++ < 1000)); - - /* - * Give the other CPU some time to accept the IPI. - */ - udelay(200); - /* - * Due to the Pentium erratum 3AP. - */ - if (maxlvt > 3) { - apic_read_around(APIC_SPIV); - apic_write(APIC_ESR, 0); - } - accept_status = (apic_read(APIC_ESR) & 0xEF); - if (send_status || accept_status) - break; - } - Dprintk("After Startup.\n"); - - if (send_status) - printk(KERN_ERR "APIC never delivered???\n"); - if (accept_status) - printk(KERN_ERR "APIC delivery error (%lx).\n", accept_status); - - return (send_status | accept_status); -} - -struct create_idle { - struct task_struct *idle; - struct completion done; - int cpu; -}; - -void do_fork_idle(void *_c_idle) -{ - struct create_idle *c_idle = _c_idle; - - c_idle->idle = fork_idle(c_idle->cpu); - complete(&c_idle->done); -} - -/* - * Boot one CPU. - */ -static int __cpuinit do_boot_cpu(int cpu, int apicid) -{ - unsigned long boot_error; - int timeout; - unsigned long start_rip; - struct create_idle c_idle = { - .cpu = cpu, - .done = COMPLETION_INITIALIZER(c_idle.done), - }; - DECLARE_WORK(work, do_fork_idle, &c_idle); - - c_idle.idle = get_idle_for_cpu(cpu); - - if (c_idle.idle) { - c_idle.idle->thread.rsp = (unsigned long) (((struct pt_regs *) - (THREAD_SIZE + (unsigned long) c_idle.idle->thread_info)) - 1); - init_idle(c_idle.idle, cpu); - goto do_rest; - } - - /* - * During cold boot process, keventd thread is not spun up yet. - * When we do cpu hot-add, we create idle threads on the fly, we should - * not acquire any attributes from the calling context. Hence the clean - * way to create kernel_threads() is to do that from keventd(). - * We do the current_is_keventd() due to the fact that ACPI notifier - * was also queuing to keventd() and when the caller is already running - * in context of keventd(), we would end up with locking up the keventd - * thread. - */ - if (!keventd_up() || current_is_keventd()) - work.func(work.data); - else { - schedule_work(&work); - wait_for_completion(&c_idle.done); - } - - if (IS_ERR(c_idle.idle)) { - printk("failed fork for CPU %d\n", cpu); - return PTR_ERR(c_idle.idle); - } - - set_idle_for_cpu(cpu, c_idle.idle); - -do_rest: - - cpu_pda[cpu].pcurrent = c_idle.idle; - - start_rip = setup_trampoline(); - - init_rsp = c_idle.idle->thread.rsp; - per_cpu(init_tss,cpu).rsp0 = init_rsp; - initial_code = start_secondary; - clear_ti_thread_flag(c_idle.idle->thread_info, TIF_FORK); - - printk(KERN_INFO "Booting processor %d/%d APIC 0x%x\n", cpu, - cpus_weight(cpu_present_map), - apicid); - - /* - * This grunge runs the startup process for - * the targeted processor. - */ - - atomic_set(&init_deasserted, 0); - - Dprintk("Setting warm reset code and vector.\n"); - - CMOS_WRITE(0xa, 0xf); - local_flush_tlb(); - Dprintk("1.\n"); - *((volatile unsigned short *) phys_to_virt(0x469)) = start_rip >> 4; - Dprintk("2.\n"); - *((volatile unsigned short *) phys_to_virt(0x467)) = start_rip & 0xf; - Dprintk("3.\n"); - - /* - * Be paranoid about clearing APIC errors. - */ - if (APIC_INTEGRATED(apic_version[apicid])) { - apic_read_around(APIC_SPIV); - apic_write(APIC_ESR, 0); - apic_read(APIC_ESR); - } - - /* - * Status is now clean - */ - boot_error = 0; - - /* - * Starting actual IPI sequence... - */ - boot_error = wakeup_secondary_via_INIT(apicid, start_rip); - - if (!boot_error) { - /* - * allow APs to start initializing. - */ - Dprintk("Before Callout %d.\n", cpu); - cpu_set(cpu, cpu_callout_map); - Dprintk("After Callout %d.\n", cpu); - - /* - * Wait 5s total for a response - */ - for (timeout = 0; timeout < 50000; timeout++) { - if (cpu_isset(cpu, cpu_callin_map)) - break; /* It has booted */ - udelay(100); - } - - if (cpu_isset(cpu, cpu_callin_map)) { - /* number CPUs logically, starting from 1 (BSP is 0) */ - Dprintk("CPU has booted.\n"); - } else { - boot_error = 1; - if (*((volatile unsigned char *)phys_to_virt(SMP_TRAMPOLINE_BASE)) - == 0xA5) - /* trampoline started but...? */ - printk("Stuck ??\n"); - else - /* trampoline code not run */ - printk("Not responding.\n"); -#ifdef APIC_DEBUG - inquire_remote_apic(apicid); -#endif - } - } - if (boot_error) { - cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */ - clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */ - cpu_clear(cpu, cpu_present_map); - cpu_clear(cpu, cpu_possible_map); - x86_cpu_to_apicid[cpu] = BAD_APICID; - x86_cpu_to_log_apicid[cpu] = BAD_APICID; - return -EIO; - } - - return 0; -} - -cycles_t cacheflush_time; -unsigned long cache_decay_ticks; - -/* - * Cleanup possible dangling ends... - */ -static __cpuinit void smp_cleanup_boot(void) -{ - /* - * Paranoid: Set warm reset code and vector here back - * to default values. - */ - CMOS_WRITE(0, 0xf); - - /* - * Reset trampoline flag - */ - *((volatile int *) phys_to_virt(0x467)) = 0; - -#ifndef CONFIG_HOTPLUG_CPU - /* - * Free pages reserved for SMP bootup. - * When you add hotplug CPU support later remove this - * Note there is more work to be done for later CPU bootup. - */ - - free_page((unsigned long) __va(PAGE_SIZE)); - free_page((unsigned long) __va(SMP_TRAMPOLINE_BASE)); -#endif -} - -/* - * Fall back to non SMP mode after errors. - * - * RED-PEN audit/test this more. I bet there is more state messed up here. - */ -static __init void disable_smp(void) -{ - cpu_present_map = cpumask_of_cpu(0); - cpu_possible_map = cpumask_of_cpu(0); - if (smp_found_config) - phys_cpu_present_map = physid_mask_of_physid(boot_cpu_id); - else - phys_cpu_present_map = physid_mask_of_physid(0); - cpu_set(0, cpu_sibling_map[0]); - cpu_set(0, cpu_core_map[0]); -} - -/* - * Handle user cpus=... parameter. - */ -static __init void enforce_max_cpus(unsigned max_cpus) -{ - int i, k; - k = 0; - for (i = 0; i < NR_CPUS; i++) { - if (!cpu_possible(i)) - continue; - if (++k > max_cpus) { - cpu_clear(i, cpu_possible_map); - cpu_clear(i, cpu_present_map); - } - } -} - -#ifdef CONFIG_HOTPLUG_CPU -/* - * cpu_possible_map should be static, it cannot change as cpu's - * are onlined, or offlined. The reason is per-cpu data-structures - * are allocated by some modules at init time, and dont expect to - * do this dynamically on cpu arrival/departure. - * cpu_present_map on the other hand can change dynamically. - * In case when cpu_hotplug is not compiled, then we resort to current - * behaviour, which is cpu_possible == cpu_present. - * If cpu-hotplug is supported, then we need to preallocate for all - * those NR_CPUS, hence cpu_possible_map represents entire NR_CPUS range. - * - Ashok Raj - */ -static void prefill_possible_map(void) -{ - int i; - for (i = 0; i < NR_CPUS; i++) - cpu_set(i, cpu_possible_map); -} -#endif - -/* - * Various sanity checks. - */ -static int __init smp_sanity_check(unsigned max_cpus) -{ - if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) { - printk("weird, boot CPU (#%d) not listed by the BIOS.\n", - hard_smp_processor_id()); - physid_set(hard_smp_processor_id(), phys_cpu_present_map); - } - - /* - * If we couldn't find an SMP configuration at boot time, - * get out of here now! - */ - if (!smp_found_config) { - printk(KERN_NOTICE "SMP motherboard not detected.\n"); - disable_smp(); - if (APIC_init_uniprocessor()) - printk(KERN_NOTICE "Local APIC not detected." - " Using dummy APIC emulation.\n"); - return -1; - } - - /* - * Should not be necessary because the MP table should list the boot - * CPU too, but we do it for the sake of robustness anyway. - */ - if (!physid_isset(boot_cpu_id, phys_cpu_present_map)) { - printk(KERN_NOTICE "weird, boot CPU (#%d) not listed by the BIOS.\n", - boot_cpu_id); - physid_set(hard_smp_processor_id(), phys_cpu_present_map); - } - - /* - * If we couldn't find a local APIC, then get out of here now! - */ - if (APIC_INTEGRATED(apic_version[boot_cpu_id]) && !cpu_has_apic) { - printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n", - boot_cpu_id); - printk(KERN_ERR "... forcing use of dummy APIC emulation. (tell your hw vendor)\n"); - nr_ioapics = 0; - return -1; - } - - /* - * If SMP should be disabled, then really disable it! - */ - if (!max_cpus) { - printk(KERN_INFO "SMP mode deactivated, forcing use of dummy APIC emulation.\n"); - nr_ioapics = 0; - return -1; - } - - return 0; -} - -/* - * Prepare for SMP bootup. The MP table or ACPI has been read - * earlier. Just do some sanity checking here and enable APIC mode. - */ -void __init smp_prepare_cpus(unsigned int max_cpus) -{ - nmi_watchdog_default(); - current_cpu_data = boot_cpu_data; - current_thread_info()->cpu = 0; /* needed? */ - - enforce_max_cpus(max_cpus); - -#ifdef CONFIG_HOTPLUG_CPU - prefill_possible_map(); -#endif - - if (smp_sanity_check(max_cpus) < 0) { - printk(KERN_INFO "SMP disabled\n"); - disable_smp(); - return; - } - - - /* - * Switch from PIC to APIC mode. - */ - connect_bsp_APIC(); - setup_local_APIC(); - - if (GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_id) { - panic("Boot APIC ID in local APIC unexpected (%d vs %d)", - GET_APIC_ID(apic_read(APIC_ID)), boot_cpu_id); - /* Or can we switch back to PIC here? */ - } - - /* - * Now start the IO-APICs - */ - if (!skip_ioapic_setup && nr_ioapics) - setup_IO_APIC(); - else - nr_ioapics = 0; - - /* - * Set up local APIC timer on boot CPU. - */ - - setup_boot_APIC_clock(); -} - -/* - * Early setup to make printk work. - */ -void __init smp_prepare_boot_cpu(void) -{ - int me = smp_processor_id(); - cpu_set(me, cpu_online_map); - cpu_set(me, cpu_callout_map); - cpu_set(0, cpu_sibling_map[0]); - cpu_set(0, cpu_core_map[0]); - per_cpu(cpu_state, me) = CPU_ONLINE; -} - -/* - * Entry point to boot a CPU. - */ -int __cpuinit __cpu_up(unsigned int cpu) -{ - int err; - int apicid = cpu_present_to_apicid(cpu); - - WARN_ON(irqs_disabled()); - - Dprintk("++++++++++++++++++++=_---CPU UP %u\n", cpu); - - if (apicid == BAD_APICID || apicid == boot_cpu_id || - !physid_isset(apicid, phys_cpu_present_map)) { - printk("__cpu_up: bad cpu %d\n", cpu); - return -EINVAL; - } - - /* - * Already booted CPU? - */ - if (cpu_isset(cpu, cpu_callin_map)) { - Dprintk("do_boot_cpu %d Already started\n", cpu); - return -ENOSYS; - } - - per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; - /* Boot it! */ - err = do_boot_cpu(cpu, apicid); - if (err < 0) { - Dprintk("do_boot_cpu failed %d\n", err); - return err; - } - - /* Unleash the CPU! */ - Dprintk("waiting for cpu %d\n", cpu); - - while (!cpu_isset(cpu, cpu_online_map)) - cpu_relax(); - err = 0; - - return err; -} - -/* - * Finish the SMP boot. - */ -void __init smp_cpus_done(unsigned int max_cpus) -{ -#ifndef CONFIG_HOTPLUG_CPU - zap_low_mappings(); -#endif - smp_cleanup_boot(); - -#ifdef CONFIG_X86_IO_APIC - setup_ioapic_dest(); -#endif - - time_init_gtod(); - - check_nmi_watchdog(); -} - -#ifdef CONFIG_HOTPLUG_CPU - -static void remove_siblinginfo(int cpu) -{ - int sibling; - - for_each_cpu_mask(sibling, cpu_sibling_map[cpu]) - cpu_clear(cpu, cpu_sibling_map[sibling]); - for_each_cpu_mask(sibling, cpu_core_map[cpu]) - cpu_clear(cpu, cpu_core_map[sibling]); - cpus_clear(cpu_sibling_map[cpu]); - cpus_clear(cpu_core_map[cpu]); - phys_proc_id[cpu] = BAD_APICID; - cpu_core_id[cpu] = BAD_APICID; -} - -void remove_cpu_from_maps(void) -{ - int cpu = smp_processor_id(); - - cpu_clear(cpu, cpu_callout_map); - cpu_clear(cpu, cpu_callin_map); - clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */ -} - -int __cpu_disable(void) -{ - int cpu = smp_processor_id(); - - /* - * Perhaps use cpufreq to drop frequency, but that could go - * into generic code. - * - * We won't take down the boot processor on i386 due to some - * interrupts only being able to be serviced by the BSP. - * Especially so if we're not using an IOAPIC -zwane - */ - if (cpu == 0) - return -EBUSY; - - disable_APIC_timer(); - - /* - * HACK: - * Allow any queued timer interrupts to get serviced - * This is only a temporary solution until we cleanup - * fixup_irqs as we do for IA64. - */ - local_irq_enable(); - mdelay(1); - - local_irq_disable(); - remove_siblinginfo(cpu); - - /* It's now safe to remove this processor from the online map */ - cpu_clear(cpu, cpu_online_map); - remove_cpu_from_maps(); - fixup_irqs(cpu_online_map); - return 0; -} - -void __cpu_die(unsigned int cpu) -{ - /* We don't do anything here: idle task is faking death itself. */ - unsigned int i; - - for (i = 0; i < 10; i++) { - /* They ack this in play_dead by setting CPU_DEAD */ - if (per_cpu(cpu_state, cpu) == CPU_DEAD) { - printk ("CPU %d is now offline\n", cpu); - return; - } - msleep(100); - } - printk(KERN_ERR "CPU %u didn't die...\n", cpu); -} - -#else /* ... !CONFIG_HOTPLUG_CPU */ - -int __cpu_disable(void) -{ - return -ENOSYS; -} - -void __cpu_die(unsigned int cpu) -{ - /* We said "no" in __cpu_disable */ - BUG(); -} -#endif /* CONFIG_HOTPLUG_CPU */ diff --git a/arch/x86_64/kernel/suspend.c b/arch/x86_64/kernel/suspend.c deleted file mode 100644 index 0612640d91b..00000000000 --- a/arch/x86_64/kernel/suspend.c +++ /dev/null @@ -1,158 +0,0 @@ -/* - * Suspend support specific for i386. - * - * Distribute under GPLv2 - * - * Copyright (c) 2002 Pavel Machek <pavel@suse.cz> - * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org> - */ - -#include <linux/config.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/types.h> -#include <linux/spinlock.h> -#include <linux/poll.h> -#include <linux/delay.h> -#include <linux/sysrq.h> -#include <linux/proc_fs.h> -#include <linux/irq.h> -#include <linux/pm.h> -#include <linux/device.h> -#include <linux/suspend.h> -#include <asm/uaccess.h> -#include <asm/acpi.h> -#include <asm/tlbflush.h> -#include <asm/io.h> -#include <asm/proto.h> - -struct saved_context saved_context; - -unsigned long saved_context_eax, saved_context_ebx, saved_context_ecx, saved_context_edx; -unsigned long saved_context_esp, saved_context_ebp, saved_context_esi, saved_context_edi; -unsigned long saved_context_r08, saved_context_r09, saved_context_r10, saved_context_r11; -unsigned long saved_context_r12, saved_context_r13, saved_context_r14, saved_context_r15; -unsigned long saved_context_eflags; - -void __save_processor_state(struct saved_context *ctxt) -{ - kernel_fpu_begin(); - - /* - * descriptor tables - */ - asm volatile ("sgdt %0" : "=m" (ctxt->gdt_limit)); - asm volatile ("sidt %0" : "=m" (ctxt->idt_limit)); - asm volatile ("str %0" : "=m" (ctxt->tr)); - - /* XMM0..XMM15 should be handled by kernel_fpu_begin(). */ - /* EFER should be constant for kernel version, no need to handle it. */ - /* - * segment registers - */ - asm volatile ("movw %%ds, %0" : "=m" (ctxt->ds)); - asm volatile ("movw %%es, %0" : "=m" (ctxt->es)); - asm volatile ("movw %%fs, %0" : "=m" (ctxt->fs)); - asm volatile ("movw %%gs, %0" : "=m" (ctxt->gs)); - asm volatile ("movw %%ss, %0" : "=m" (ctxt->ss)); - - rdmsrl(MSR_FS_BASE, ctxt->fs_base); - rdmsrl(MSR_GS_BASE, ctxt->gs_base); - rdmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); - - /* - * control registers - */ - asm volatile ("movq %%cr0, %0" : "=r" (ctxt->cr0)); - asm volatile ("movq %%cr2, %0" : "=r" (ctxt->cr2)); - asm volatile ("movq %%cr3, %0" : "=r" (ctxt->cr3)); - asm volatile ("movq %%cr4, %0" : "=r" (ctxt->cr4)); - asm volatile ("movq %%cr8, %0" : "=r" (ctxt->cr8)); -} - -void save_processor_state(void) -{ - __save_processor_state(&saved_context); -} - -static void -do_fpu_end(void) -{ - /* restore FPU regs if necessary */ - /* Do it out of line so that gcc does not move cr0 load to some stupid place */ - kernel_fpu_end(); - mxcsr_feature_mask_init(); -} - -void __restore_processor_state(struct saved_context *ctxt) -{ - /* - * control registers - */ - asm volatile ("movq %0, %%cr8" :: "r" (ctxt->cr8)); - asm volatile ("movq %0, %%cr4" :: "r" (ctxt->cr4)); - asm volatile ("movq %0, %%cr3" :: "r" (ctxt->cr3)); - asm volatile ("movq %0, %%cr2" :: "r" (ctxt->cr2)); - asm volatile ("movq %0, %%cr0" :: "r" (ctxt->cr0)); - - /* - * now restore the descriptor tables to their proper values - * ltr is done i fix_processor_context(). - */ - asm volatile ("lgdt %0" :: "m" (ctxt->gdt_limit)); - asm volatile ("lidt %0" :: "m" (ctxt->idt_limit)); - - /* - * segment registers - */ - asm volatile ("movw %0, %%ds" :: "r" (ctxt->ds)); - asm volatile ("movw %0, %%es" :: "r" (ctxt->es)); - asm volatile ("movw %0, %%fs" :: "r" (ctxt->fs)); - load_gs_index(ctxt->gs); - asm volatile ("movw %0, %%ss" :: "r" (ctxt->ss)); - - wrmsrl(MSR_FS_BASE, ctxt->fs_base); - wrmsrl(MSR_GS_BASE, ctxt->gs_base); - wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); - - fix_processor_context(); - - do_fpu_end(); - mtrr_ap_init(); -} - -void restore_processor_state(void) -{ - __restore_processor_state(&saved_context); -} - -void fix_processor_context(void) -{ - int cpu = smp_processor_id(); - struct tss_struct *t = &per_cpu(init_tss, cpu); - - set_tss_desc(cpu,t); /* This just modifies memory; should not be neccessary. But... This is neccessary, because 386 hardware has concept of busy TSS or some similar stupidity. */ - - cpu_gdt_table[cpu][GDT_ENTRY_TSS].type = 9; - - syscall_init(); /* This sets MSR_*STAR and related */ - load_TR_desc(); /* This does ltr */ - load_LDT(¤t->active_mm->context); /* This does lldt */ - - /* - * Now maybe reload the debug registers - */ - if (current->thread.debugreg7){ - loaddebug(¤t->thread, 0); - loaddebug(¤t->thread, 1); - loaddebug(¤t->thread, 2); - loaddebug(¤t->thread, 3); - /* no 4 and 5 */ - loaddebug(¤t->thread, 6); - loaddebug(¤t->thread, 7); - } - -} - - diff --git a/arch/x86_64/kernel/suspend_asm.S b/arch/x86_64/kernel/suspend_asm.S deleted file mode 100644 index 53f8e165951..00000000000 --- a/arch/x86_64/kernel/suspend_asm.S +++ /dev/null @@ -1,104 +0,0 @@ -/* Copyright 2004,2005 Pavel Machek <pavel@suse.cz>, Andi Kleen <ak@suse.de>, Rafael J. Wysocki <rjw@sisk.pl> - * - * Distribute under GPLv2. - * - * swsusp_arch_resume may not use any stack, nor any variable that is - * not "NoSave" during copying pages: - * - * Its rewriting one kernel image with another. What is stack in "old" - * image could very well be data page in "new" image, and overwriting - * your own stack under you is bad idea. - */ - - .text -#include <linux/linkage.h> -#include <asm/segment.h> -#include <asm/page.h> -#include <asm/offset.h> - -ENTRY(swsusp_arch_suspend) - - movq %rsp, saved_context_esp(%rip) - movq %rax, saved_context_eax(%rip) - movq %rbx, saved_context_ebx(%rip) - movq %rcx, saved_context_ecx(%rip) - movq %rdx, saved_context_edx(%rip) - movq %rbp, saved_context_ebp(%rip) - movq %rsi, saved_context_esi(%rip) - movq %rdi, saved_context_edi(%rip) - movq %r8, saved_context_r08(%rip) - movq %r9, saved_context_r09(%rip) - movq %r10, saved_context_r10(%rip) - movq %r11, saved_context_r11(%rip) - movq %r12, saved_context_r12(%rip) - movq %r13, saved_context_r13(%rip) - movq %r14, saved_context_r14(%rip) - movq %r15, saved_context_r15(%rip) - pushfq ; popq saved_context_eflags(%rip) - - call swsusp_save - ret - -ENTRY(swsusp_arch_resume) - /* set up cr3 */ - leaq init_level4_pgt(%rip),%rax - subq $__START_KERNEL_map,%rax - movq %rax,%cr3 - - movq mmu_cr4_features(%rip), %rax - movq %rax, %rdx - andq $~(1<<7), %rdx # PGE - movq %rdx, %cr4; # turn off PGE - movq %cr3, %rcx; # flush TLB - movq %rcx, %cr3; - movq %rax, %cr4; # turn PGE back on - - movq pagedir_nosave(%rip), %rdx -loop: - testq %rdx, %rdx - jz done - - /* get addresses from the pbe and copy the page */ - movq pbe_address(%rdx), %rsi - movq pbe_orig_address(%rdx), %rdi - movq $512, %rcx - rep - movsq - - /* progress to the next pbe */ - movq pbe_next(%rdx), %rdx - jmp loop -done: - /* Flush TLB, including "global" things (vmalloc) */ - movq mmu_cr4_features(%rip), %rax - movq %rax, %rdx - andq $~(1<<7), %rdx; # PGE - movq %rdx, %cr4; # turn off PGE - movq %cr3, %rcx; # flush TLB - movq %rcx, %cr3 - movq %rax, %cr4; # turn PGE back on - - movl $24, %eax - movl %eax, %ds - - movq saved_context_esp(%rip), %rsp - movq saved_context_ebp(%rip), %rbp - /* Don't restore %rax, it must be 0 anyway */ - movq saved_context_ebx(%rip), %rbx - movq saved_context_ecx(%rip), %rcx - movq saved_context_edx(%rip), %rdx - movq saved_context_esi(%rip), %rsi - movq saved_context_edi(%rip), %rdi - movq saved_context_r08(%rip), %r8 - movq saved_context_r09(%rip), %r9 - movq saved_context_r10(%rip), %r10 - movq saved_context_r11(%rip), %r11 - movq saved_context_r12(%rip), %r12 - movq saved_context_r13(%rip), %r13 - movq saved_context_r14(%rip), %r14 - movq saved_context_r15(%rip), %r15 - pushq saved_context_eflags(%rip) ; popfq - - xorq %rax, %rax - - ret diff --git a/arch/x86_64/kernel/sys_x86_64.c b/arch/x86_64/kernel/sys_x86_64.c deleted file mode 100644 index cc7821c6885..00000000000 --- a/arch/x86_64/kernel/sys_x86_64.c +++ /dev/null @@ -1,170 +0,0 @@ -/* - * linux/arch/x86_64/kernel/sys_x86_64.c - */ - -#include <linux/errno.h> -#include <linux/sched.h> -#include <linux/syscalls.h> -#include <linux/mm.h> -#include <linux/smp.h> -#include <linux/smp_lock.h> -#include <linux/sem.h> -#include <linux/msg.h> -#include <linux/shm.h> -#include <linux/stat.h> -#include <linux/mman.h> -#include <linux/file.h> -#include <linux/utsname.h> -#include <linux/personality.h> - -#include <asm/uaccess.h> -#include <asm/ia32.h> - -/* - * sys_pipe() is the normal C calling standard for creating - * a pipe. It's not the way Unix traditionally does this, though. - */ -asmlinkage long sys_pipe(int __user *fildes) -{ - int fd[2]; - int error; - - error = do_pipe(fd); - if (!error) { - if (copy_to_user(fildes, fd, 2*sizeof(int))) - error = -EFAULT; - } - return error; -} - -asmlinkage long sys_mmap(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long off) -{ - long error; - struct file * file; - - error = -EINVAL; - if (off & ~PAGE_MASK) - goto out; - - error = -EBADF; - file = NULL; - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - goto out; - } - down_write(¤t->mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, off >> PAGE_SHIFT); - up_write(¤t->mm->mmap_sem); - - if (file) - fput(file); -out: - return error; -} - -static void find_start_end(unsigned long flags, unsigned long *begin, - unsigned long *end) -{ - if (!test_thread_flag(TIF_IA32) && (flags & MAP_32BIT)) { - /* This is usually used needed to map code in small - model, so it needs to be in the first 31bit. Limit - it to that. This means we need to move the - unmapped base down for this case. This can give - conflicts with the heap, but we assume that glibc - malloc knows how to fall back to mmap. Give it 1GB - of playground for now. -AK */ - *begin = 0x40000000; - *end = 0x80000000; - } else { - *begin = TASK_UNMAPPED_BASE; - *end = TASK_SIZE; - } -} - -unsigned long -arch_get_unmapped_area(struct file *filp, unsigned long addr, - unsigned long len, unsigned long pgoff, unsigned long flags) -{ - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; - unsigned long start_addr; - unsigned long begin, end; - - find_start_end(flags, &begin, &end); - - if (len > end) - return -ENOMEM; - - if (addr) { - addr = PAGE_ALIGN(addr); - vma = find_vma(mm, addr); - if (end - len >= addr && - (!vma || addr + len <= vma->vm_start)) - return addr; - } - if (((flags & MAP_32BIT) || test_thread_flag(TIF_IA32)) - && len <= mm->cached_hole_size) { - mm->cached_hole_size = 0; - mm->free_area_cache = begin; - } - addr = mm->free_area_cache; - if (addr < begin) - addr = begin; - start_addr = addr; - -full_search: - for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { - /* At this point: (!vma || addr < vma->vm_end). */ - if (end - len < addr) { - /* - * Start a new search - just in case we missed - * some holes. - */ - if (start_addr != begin) { - start_addr = addr = begin; - mm->cached_hole_size = 0; - goto full_search; - } - return -ENOMEM; - } - if (!vma || addr + len <= vma->vm_start) { - /* - * Remember the place where we stopped the search: - */ - mm->free_area_cache = addr + len; - return addr; - } - if (addr + mm->cached_hole_size < vma->vm_start) - mm->cached_hole_size = vma->vm_start - addr; - - addr = vma->vm_end; - } -} - -asmlinkage long sys_uname(struct new_utsname __user * name) -{ - int err; - down_read(&uts_sem); - err = copy_to_user(name, &system_utsname, sizeof (*name)); - up_read(&uts_sem); - if (personality(current->personality) == PER_LINUX32) - err |= copy_to_user(&name->machine, "i686", 5); - return err ? -EFAULT : 0; -} - -asmlinkage long sys_time64(long __user * tloc) -{ - struct timeval now; - int i; - - do_gettimeofday(&now); - i = now.tv_sec; - if (tloc) { - if (put_user(i,tloc)) - i = -EFAULT; - } - return i; -} diff --git a/arch/x86_64/kernel/syscall.c b/arch/x86_64/kernel/syscall.c deleted file mode 100644 index e263685f864..00000000000 --- a/arch/x86_64/kernel/syscall.c +++ /dev/null @@ -1,26 +0,0 @@ -/* System call table for x86-64. */ - -#include <linux/linkage.h> -#include <linux/sys.h> -#include <linux/cache.h> -#include <linux/config.h> - -#define __NO_STUBS - -#define __SYSCALL(nr, sym) extern asmlinkage void sym(void) ; -#undef _ASM_X86_64_UNISTD_H_ -#include <asm-x86_64/unistd.h> - -#undef __SYSCALL -#define __SYSCALL(nr, sym) [ nr ] = sym, -#undef _ASM_X86_64_UNISTD_H_ - -typedef void (*sys_call_ptr_t)(void); - -extern void sys_ni_syscall(void); - -sys_call_ptr_t sys_call_table[__NR_syscall_max+1] __cacheline_aligned = { - /* Smells like a like a compiler bug -- it doesn't work when the & below is removed. */ - [0 ... __NR_syscall_max] = &sys_ni_syscall, -#include <asm-x86_64/unistd.h> -}; diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c deleted file mode 100644 index 66bf6ddeb0c..00000000000 --- a/arch/x86_64/kernel/time.c +++ /dev/null @@ -1,1314 +0,0 @@ -/* - * linux/arch/x86-64/kernel/time.c - * - * "High Precision Event Timer" based timekeeping. - * - * Copyright (c) 1991,1992,1995 Linus Torvalds - * Copyright (c) 1994 Alan Modra - * Copyright (c) 1995 Markus Kuhn - * Copyright (c) 1996 Ingo Molnar - * Copyright (c) 1998 Andrea Arcangeli - * Copyright (c) 2002 Vojtech Pavlik - * Copyright (c) 2003 Andi Kleen - * RTC support code taken from arch/i386/kernel/timers/time_hpet.c - */ - -#include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/interrupt.h> -#include <linux/init.h> -#include <linux/mc146818rtc.h> -#include <linux/irq.h> -#include <linux/time.h> -#include <linux/ioport.h> -#include <linux/module.h> -#include <linux/device.h> -#include <linux/sysdev.h> -#include <linux/bcd.h> -#include <linux/kallsyms.h> -#include <linux/acpi.h> -#ifdef CONFIG_ACPI -#include <acpi/achware.h> /* for PM timer frequency */ -#endif -#include <asm/8253pit.h> -#include <asm/pgtable.h> -#include <asm/vsyscall.h> -#include <asm/timex.h> -#include <asm/proto.h> -#include <asm/hpet.h> -#include <asm/sections.h> -#include <linux/cpufreq.h> -#include <linux/hpet.h> -#ifdef CONFIG_X86_LOCAL_APIC -#include <asm/apic.h> -#endif - -u64 jiffies_64 = INITIAL_JIFFIES; - -EXPORT_SYMBOL(jiffies_64); - -#ifdef CONFIG_CPU_FREQ -static void cpufreq_delayed_get(void); -#endif -extern void i8254_timer_resume(void); -extern int using_apic_timer; - -DEFINE_SPINLOCK(rtc_lock); -DEFINE_SPINLOCK(i8253_lock); - -static int nohpet __initdata = 0; -static int notsc __initdata = 0; - -#undef HPET_HACK_ENABLE_DANGEROUS - -unsigned int cpu_khz; /* TSC clocks / usec, not used here */ -static unsigned long hpet_period; /* fsecs / HPET clock */ -unsigned long hpet_tick; /* HPET clocks / interrupt */ -static int hpet_use_timer; -unsigned long vxtime_hz = PIT_TICK_RATE; -int report_lost_ticks; /* command line option */ -unsigned long long monotonic_base; - -struct vxtime_data __vxtime __section_vxtime; /* for vsyscalls */ - -volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; -unsigned long __wall_jiffies __section_wall_jiffies = INITIAL_JIFFIES; -struct timespec __xtime __section_xtime; -struct timezone __sys_tz __section_sys_tz; - -static inline void rdtscll_sync(unsigned long *tsc) -{ -#ifdef CONFIG_SMP - sync_core(); -#endif - rdtscll(*tsc); -} - -/* - * do_gettimeoffset() returns microseconds since last timer interrupt was - * triggered by hardware. A memory read of HPET is slower than a register read - * of TSC, but much more reliable. It's also synchronized to the timer - * interrupt. Note that do_gettimeoffset() may return more than hpet_tick, if a - * timer interrupt has happened already, but vxtime.trigger wasn't updated yet. - * This is not a problem, because jiffies hasn't updated either. They are bound - * together by xtime_lock. - */ - -static inline unsigned int do_gettimeoffset_tsc(void) -{ - unsigned long t; - unsigned long x; - rdtscll_sync(&t); - if (t < vxtime.last_tsc) t = vxtime.last_tsc; /* hack */ - x = ((t - vxtime.last_tsc) * vxtime.tsc_quot) >> 32; - return x; -} - -static inline unsigned int do_gettimeoffset_hpet(void) -{ - /* cap counter read to one tick to avoid inconsistencies */ - unsigned long counter = hpet_readl(HPET_COUNTER) - vxtime.last; - return (min(counter,hpet_tick) * vxtime.quot) >> 32; -} - -unsigned int (*do_gettimeoffset)(void) = do_gettimeoffset_tsc; - -/* - * This version of gettimeofday() has microsecond resolution and better than - * microsecond precision, as we're using at least a 10 MHz (usually 14.31818 - * MHz) HPET timer. - */ - -void do_gettimeofday(struct timeval *tv) -{ - unsigned long seq, t; - unsigned int sec, usec; - - do { - seq = read_seqbegin(&xtime_lock); - - sec = xtime.tv_sec; - usec = xtime.tv_nsec / 1000; - - /* i386 does some correction here to keep the clock - monotonous even when ntpd is fixing drift. - But they didn't work for me, there is a non monotonic - clock anyways with ntp. - I dropped all corrections now until a real solution can - be found. Note when you fix it here you need to do the same - in arch/x86_64/kernel/vsyscall.c and export all needed - variables in vmlinux.lds. -AK */ - - t = (jiffies - wall_jiffies) * (1000000L / HZ) + - do_gettimeoffset(); - usec += t; - - } while (read_seqretry(&xtime_lock, seq)); - - tv->tv_sec = sec + usec / 1000000; - tv->tv_usec = usec % 1000000; -} - -EXPORT_SYMBOL(do_gettimeofday); - -/* - * settimeofday() first undoes the correction that gettimeofday would do - * on the time, and then saves it. This is ugly, but has been like this for - * ages already. - */ - -int do_settimeofday(struct timespec *tv) -{ - time_t wtm_sec, sec = tv->tv_sec; - long wtm_nsec, nsec = tv->tv_nsec; - - if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) - return -EINVAL; - - write_seqlock_irq(&xtime_lock); - - nsec -= do_gettimeoffset() * 1000 + - (jiffies - wall_jiffies) * (NSEC_PER_SEC/HZ); - - wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); - wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); - - set_normalized_timespec(&xtime, sec, nsec); - set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); - - time_adjust = 0; /* stop active adjtime() */ - time_status |= STA_UNSYNC; - time_maxerror = NTP_PHASE_LIMIT; - time_esterror = NTP_PHASE_LIMIT; - - write_sequnlock_irq(&xtime_lock); - clock_was_set(); - return 0; -} - -EXPORT_SYMBOL(do_settimeofday); - -unsigned long profile_pc(struct pt_regs *regs) -{ - unsigned long pc = instruction_pointer(regs); - - /* Assume the lock function has either no stack frame or only a single word. - This checks if the address on the stack looks like a kernel text address. - There is a small window for false hits, but in that case the tick - is just accounted to the spinlock function. - Better would be to write these functions in assembler again - and check exactly. */ - if (in_lock_functions(pc)) { - char *v = *(char **)regs->rsp; - if ((v >= _stext && v <= _etext) || - (v >= _sinittext && v <= _einittext) || - (v >= (char *)MODULES_VADDR && v <= (char *)MODULES_END)) - return (unsigned long)v; - return ((unsigned long *)regs->rsp)[1]; - } - return pc; -} -EXPORT_SYMBOL(profile_pc); - -/* - * In order to set the CMOS clock precisely, set_rtc_mmss has to be called 500 - * ms after the second nowtime has started, because when nowtime is written - * into the registers of the CMOS clock, it will jump to the next second - * precisely 500 ms later. Check the Motorola MC146818A or Dallas DS12887 data - * sheet for details. - */ - -static void set_rtc_mmss(unsigned long nowtime) -{ - int real_seconds, real_minutes, cmos_minutes; - unsigned char control, freq_select; - -/* - * IRQs are disabled when we're called from the timer interrupt, - * no need for spin_lock_irqsave() - */ - - spin_lock(&rtc_lock); - -/* - * Tell the clock it's being set and stop it. - */ - - control = CMOS_READ(RTC_CONTROL); - CMOS_WRITE(control | RTC_SET, RTC_CONTROL); - - freq_select = CMOS_READ(RTC_FREQ_SELECT); - CMOS_WRITE(freq_select | RTC_DIV_RESET2, RTC_FREQ_SELECT); - - cmos_minutes = CMOS_READ(RTC_MINUTES); - BCD_TO_BIN(cmos_minutes); - -/* - * since we're only adjusting minutes and seconds, don't interfere with hour - * overflow. This avoids messing with unknown time zones but requires your RTC - * not to be off by more than 15 minutes. Since we're calling it only when - * our clock is externally synchronized using NTP, this shouldn't be a problem. - */ - - real_seconds = nowtime % 60; - real_minutes = nowtime / 60; - if (((abs(real_minutes - cmos_minutes) + 15) / 30) & 1) - real_minutes += 30; /* correct for half hour time zone */ - real_minutes %= 60; - -#if 0 - /* AMD 8111 is a really bad time keeper and hits this regularly. - It probably was an attempt to avoid screwing up DST, but ignore - that for now. */ - if (abs(real_minutes - cmos_minutes) >= 30) { - printk(KERN_WARNING "time.c: can't update CMOS clock " - "from %d to %d\n", cmos_minutes, real_minutes); - } else -#endif - - { - BIN_TO_BCD(real_seconds); - BIN_TO_BCD(real_minutes); - CMOS_WRITE(real_seconds, RTC_SECONDS); - CMOS_WRITE(real_minutes, RTC_MINUTES); - } - -/* - * The following flags have to be released exactly in this order, otherwise the - * DS12887 (popular MC146818A clone with integrated battery and quartz) will - * not reset the oscillator and will not update precisely 500 ms later. You - * won't find this mentioned in the Dallas Semiconductor data sheets, but who - * believes data sheets anyway ... -- Markus Kuhn - */ - - CMOS_WRITE(control, RTC_CONTROL); - CMOS_WRITE(freq_select, RTC_FREQ_SELECT); - - spin_unlock(&rtc_lock); -} - - -/* monotonic_clock(): returns # of nanoseconds passed since time_init() - * Note: This function is required to return accurate - * time even in the absence of multiple timer ticks. - */ -unsigned long long monotonic_clock(void) -{ - unsigned long seq; - u32 last_offset, this_offset, offset; - unsigned long long base; - - if (vxtime.mode == VXTIME_HPET) { - do { - seq = read_seqbegin(&xtime_lock); - - last_offset = vxtime.last; - base = monotonic_base; - this_offset = hpet_readl(HPET_COUNTER); - - } while (read_seqretry(&xtime_lock, seq)); - offset = (this_offset - last_offset); - offset *=(NSEC_PER_SEC/HZ)/hpet_tick; - return base + offset; - }else{ - do { - seq = read_seqbegin(&xtime_lock); - - last_offset = vxtime.last_tsc; - base = monotonic_base; - } while (read_seqretry(&xtime_lock, seq)); - sync_core(); - rdtscll(this_offset); - offset = (this_offset - last_offset)*1000/cpu_khz; - return base + offset; - } - - -} -EXPORT_SYMBOL(monotonic_clock); - -static noinline void handle_lost_ticks(int lost, struct pt_regs *regs) -{ - static long lost_count; - static int warned; - - if (report_lost_ticks) { - printk(KERN_WARNING "time.c: Lost %d timer " - "tick(s)! ", lost); - print_symbol("rip %s)\n", regs->rip); - } - - if (lost_count == 1000 && !warned) { - printk(KERN_WARNING - "warning: many lost ticks.\n" - KERN_WARNING "Your time source seems to be instable or " - "some driver is hogging interupts\n"); - print_symbol("rip %s\n", regs->rip); - if (vxtime.mode == VXTIME_TSC && vxtime.hpet_address) { - printk(KERN_WARNING "Falling back to HPET\n"); - vxtime.last = hpet_readl(HPET_T0_CMP) - hpet_tick; - vxtime.mode = VXTIME_HPET; - do_gettimeoffset = do_gettimeoffset_hpet; - } - /* else should fall back to PIT, but code missing. */ - warned = 1; - } else - lost_count++; - -#ifdef CONFIG_CPU_FREQ - /* In some cases the CPU can change frequency without us noticing - (like going into thermal throttle) - Give cpufreq a change to catch up. */ - if ((lost_count+1) % 25 == 0) { - cpufreq_delayed_get(); - } -#endif -} - -static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) -{ - static unsigned long rtc_update = 0; - unsigned long tsc; - int delay, offset = 0, lost = 0; - -/* - * Here we are in the timer irq handler. We have irqs locally disabled (so we - * don't need spin_lock_irqsave()) but we don't know if the timer_bh is running - * on the other CPU, so we need a lock. We also need to lock the vsyscall - * variables, because both do_timer() and us change them -arca+vojtech - */ - - write_seqlock(&xtime_lock); - - if (vxtime.hpet_address) - offset = hpet_readl(HPET_COUNTER); - - if (hpet_use_timer) { - /* if we're using the hpet timer functionality, - * we can more accurately know the counter value - * when the timer interrupt occured. - */ - offset = hpet_readl(HPET_T0_CMP) - hpet_tick; - delay = hpet_readl(HPET_COUNTER) - offset; - } else { - spin_lock(&i8253_lock); - outb_p(0x00, 0x43); - delay = inb_p(0x40); - delay |= inb(0x40) << 8; - spin_unlock(&i8253_lock); - delay = LATCH - 1 - delay; - } - - rdtscll_sync(&tsc); - - if (vxtime.mode == VXTIME_HPET) { - if (offset - vxtime.last > hpet_tick) { - lost = (offset - vxtime.last) / hpet_tick - 1; - } - - monotonic_base += - (offset - vxtime.last)*(NSEC_PER_SEC/HZ) / hpet_tick; - - vxtime.last = offset; -#ifdef CONFIG_X86_PM_TIMER - } else if (vxtime.mode == VXTIME_PMTMR) { - lost = pmtimer_mark_offset(); -#endif - } else { - offset = (((tsc - vxtime.last_tsc) * - vxtime.tsc_quot) >> 32) - (USEC_PER_SEC / HZ); - - if (offset < 0) - offset = 0; - - if (offset > (USEC_PER_SEC / HZ)) { - lost = offset / (USEC_PER_SEC / HZ); - offset %= (USEC_PER_SEC / HZ); - } - - monotonic_base += (tsc - vxtime.last_tsc)*1000000/cpu_khz ; - - vxtime.last_tsc = tsc - vxtime.quot * delay / vxtime.tsc_quot; - - if ((((tsc - vxtime.last_tsc) * - vxtime.tsc_quot) >> 32) < offset) - vxtime.last_tsc = tsc - - (((long) offset << 32) / vxtime.tsc_quot) - 1; - } - - if (lost > 0) { - handle_lost_ticks(lost, regs); - jiffies += lost; - } - -/* - * Do the timer stuff. - */ - - do_timer(regs); -#ifndef CONFIG_SMP - update_process_times(user_mode(regs)); -#endif - -/* - * In the SMP case we use the local APIC timer interrupt to do the profiling, - * except when we simulate SMP mode on a uniprocessor system, in that case we - * have to call the local interrupt handler. - */ - -#ifndef CONFIG_X86_LOCAL_APIC - profile_tick(CPU_PROFILING, regs); -#else - if (!using_apic_timer) - smp_local_timer_interrupt(regs); -#endif - -/* - * If we have an externally synchronized Linux clock, then update CMOS clock - * accordingly every ~11 minutes. set_rtc_mmss() will be called in the jiffy - * closest to exactly 500 ms before the next second. If the update fails, we - * don't care, as it'll be updated on the next turn, and the problem (time way - * off) isn't likely to go away much sooner anyway. - */ - - if ((~time_status & STA_UNSYNC) && xtime.tv_sec > rtc_update && - abs(xtime.tv_nsec - 500000000) <= tick_nsec / 2) { - set_rtc_mmss(xtime.tv_sec); - rtc_update = xtime.tv_sec + 660; - } - - write_sequnlock(&xtime_lock); - - return IRQ_HANDLED; -} - -static unsigned int cyc2ns_scale; -#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ - -static inline void set_cyc2ns_scale(unsigned long cpu_mhz) -{ - cyc2ns_scale = (1000 << CYC2NS_SCALE_FACTOR)/cpu_mhz; -} - -static inline unsigned long long cycles_2_ns(unsigned long long cyc) -{ - return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR; -} - -unsigned long long sched_clock(void) -{ - unsigned long a = 0; - -#if 0 - /* Don't do a HPET read here. Using TSC always is much faster - and HPET may not be mapped yet when the scheduler first runs. - Disadvantage is a small drift between CPUs in some configurations, - but that should be tolerable. */ - if (__vxtime.mode == VXTIME_HPET) - return (hpet_readl(HPET_COUNTER) * vxtime.quot) >> 32; -#endif - - /* Could do CPU core sync here. Opteron can execute rdtsc speculatively, - which means it is not completely exact and may not be monotonous between - CPUs. But the errors should be too small to matter for scheduling - purposes. */ - - rdtscll(a); - return cycles_2_ns(a); -} - -unsigned long get_cmos_time(void) -{ - unsigned int timeout, year, mon, day, hour, min, sec; - unsigned char last, this; - unsigned long flags; - -/* - * The Linux interpretation of the CMOS clock register contents: When the - * Update-In-Progress (UIP) flag goes from 1 to 0, the RTC registers show the - * second which has precisely just started. Waiting for this can take up to 1 - * second, we timeout approximately after 2.4 seconds on a machine with - * standard 8.3 MHz ISA bus. - */ - - spin_lock_irqsave(&rtc_lock, flags); - - timeout = 1000000; - last = this = 0; - - while (timeout && last && !this) { - last = this; - this = CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP; - timeout--; - } - -/* - * Here we are safe to assume the registers won't change for a whole second, so - * we just go ahead and read them. - */ - - sec = CMOS_READ(RTC_SECONDS); - min = CMOS_READ(RTC_MINUTES); - hour = CMOS_READ(RTC_HOURS); - day = CMOS_READ(RTC_DAY_OF_MONTH); - mon = CMOS_READ(RTC_MONTH); - year = CMOS_READ(RTC_YEAR); - - spin_unlock_irqrestore(&rtc_lock, flags); - -/* - * We know that x86-64 always uses BCD format, no need to check the config - * register. - */ - - BCD_TO_BIN(sec); - BCD_TO_BIN(min); - BCD_TO_BIN(hour); - BCD_TO_BIN(day); - BCD_TO_BIN(mon); - BCD_TO_BIN(year); - -/* - * x86-64 systems only exists since 2002. - * This will work up to Dec 31, 2100 - */ - year += 2000; - - return mktime(year, mon, day, hour, min, sec); -} - -#ifdef CONFIG_CPU_FREQ - -/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency - changes. - - RED-PEN: On SMP we assume all CPUs run with the same frequency. It's - not that important because current Opteron setups do not support - scaling on SMP anyroads. - - Should fix up last_tsc too. Currently gettimeofday in the - first tick after the change will be slightly wrong. */ - -#include <linux/workqueue.h> - -static unsigned int cpufreq_delayed_issched = 0; -static unsigned int cpufreq_init = 0; -static struct work_struct cpufreq_delayed_get_work; - -static void handle_cpufreq_delayed_get(void *v) -{ - unsigned int cpu; - for_each_online_cpu(cpu) { - cpufreq_get(cpu); - } - cpufreq_delayed_issched = 0; -} - -/* if we notice lost ticks, schedule a call to cpufreq_get() as it tries - * to verify the CPU frequency the timing core thinks the CPU is running - * at is still correct. - */ -static void cpufreq_delayed_get(void) -{ - static int warned; - if (cpufreq_init && !cpufreq_delayed_issched) { - cpufreq_delayed_issched = 1; - if (!warned) { - warned = 1; - printk(KERN_DEBUG "Losing some ticks... checking if CPU frequency changed.\n"); - } - schedule_work(&cpufreq_delayed_get_work); - } -} - -static unsigned int ref_freq = 0; -static unsigned long loops_per_jiffy_ref = 0; - -static unsigned long cpu_khz_ref = 0; - -static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, - void *data) -{ - struct cpufreq_freqs *freq = data; - unsigned long *lpj, dummy; - - if (cpu_has(&cpu_data[freq->cpu], X86_FEATURE_CONSTANT_TSC)) - return 0; - - lpj = &dummy; - if (!(freq->flags & CPUFREQ_CONST_LOOPS)) -#ifdef CONFIG_SMP - lpj = &cpu_data[freq->cpu].loops_per_jiffy; -#else - lpj = &boot_cpu_data.loops_per_jiffy; -#endif - - if (!ref_freq) { - ref_freq = freq->old; - loops_per_jiffy_ref = *lpj; - cpu_khz_ref = cpu_khz; - } - if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || - (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) || - (val == CPUFREQ_RESUMECHANGE)) { - *lpj = - cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); - - cpu_khz = cpufreq_scale(cpu_khz_ref, ref_freq, freq->new); - if (!(freq->flags & CPUFREQ_CONST_LOOPS)) - vxtime.tsc_quot = (1000L << 32) / cpu_khz; - } - - set_cyc2ns_scale(cpu_khz_ref / 1000); - - return 0; -} - -static struct notifier_block time_cpufreq_notifier_block = { - .notifier_call = time_cpufreq_notifier -}; - -static int __init cpufreq_tsc(void) -{ - INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get, NULL); - if (!cpufreq_register_notifier(&time_cpufreq_notifier_block, - CPUFREQ_TRANSITION_NOTIFIER)) - cpufreq_init = 1; - return 0; -} - -core_initcall(cpufreq_tsc); - -#endif - -/* - * calibrate_tsc() calibrates the processor TSC in a very simple way, comparing - * it to the HPET timer of known frequency. - */ - -#define TICK_COUNT 100000000 - -static unsigned int __init hpet_calibrate_tsc(void) -{ - int tsc_start, hpet_start; - int tsc_now, hpet_now; - unsigned long flags; - - local_irq_save(flags); - local_irq_disable(); - - hpet_start = hpet_readl(HPET_COUNTER); - rdtscl(tsc_start); - - do { - local_irq_disable(); - hpet_now = hpet_readl(HPET_COUNTER); - sync_core(); - rdtscl(tsc_now); - local_irq_restore(flags); - } while ((tsc_now - tsc_start) < TICK_COUNT && - (hpet_now - hpet_start) < TICK_COUNT); - - return (tsc_now - tsc_start) * 1000000000L - / ((hpet_now - hpet_start) * hpet_period / 1000); -} - - -/* - * pit_calibrate_tsc() uses the speaker output (channel 2) of - * the PIT. This is better than using the timer interrupt output, - * because we can read the value of the speaker with just one inb(), - * where we need three i/o operations for the interrupt channel. - * We count how many ticks the TSC does in 50 ms. - */ - -static unsigned int __init pit_calibrate_tsc(void) -{ - unsigned long start, end; - unsigned long flags; - - spin_lock_irqsave(&i8253_lock, flags); - - outb((inb(0x61) & ~0x02) | 0x01, 0x61); - - outb(0xb0, 0x43); - outb((PIT_TICK_RATE / (1000 / 50)) & 0xff, 0x42); - outb((PIT_TICK_RATE / (1000 / 50)) >> 8, 0x42); - rdtscll(start); - sync_core(); - while ((inb(0x61) & 0x20) == 0); - sync_core(); - rdtscll(end); - - spin_unlock_irqrestore(&i8253_lock, flags); - - return (end - start) / 50; -} - -#ifdef CONFIG_HPET -static __init int late_hpet_init(void) -{ - struct hpet_data hd; - unsigned int ntimer; - - if (!vxtime.hpet_address) - return -1; - - memset(&hd, 0, sizeof (hd)); - - ntimer = hpet_readl(HPET_ID); - ntimer = (ntimer & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT; - ntimer++; - - /* - * Register with driver. - * Timer0 and Timer1 is used by platform. - */ - hd.hd_phys_address = vxtime.hpet_address; - hd.hd_address = (void *)fix_to_virt(FIX_HPET_BASE); - hd.hd_nirqs = ntimer; - hd.hd_flags = HPET_DATA_PLATFORM; - hpet_reserve_timer(&hd, 0); -#ifdef CONFIG_HPET_EMULATE_RTC - hpet_reserve_timer(&hd, 1); -#endif - hd.hd_irq[0] = HPET_LEGACY_8254; - hd.hd_irq[1] = HPET_LEGACY_RTC; - if (ntimer > 2) { - struct hpet *hpet; - struct hpet_timer *timer; - int i; - - hpet = (struct hpet *) fix_to_virt(FIX_HPET_BASE); - - for (i = 2, timer = &hpet->hpet_timers[2]; i < ntimer; - timer++, i++) - hd.hd_irq[i] = (timer->hpet_config & - Tn_INT_ROUTE_CNF_MASK) >> - Tn_INT_ROUTE_CNF_SHIFT; - - } - - hpet_alloc(&hd); - return 0; -} -fs_initcall(late_hpet_init); -#endif - -static int hpet_timer_stop_set_go(unsigned long tick) -{ - unsigned int cfg; - -/* - * Stop the timers and reset the main counter. - */ - - cfg = hpet_readl(HPET_CFG); - cfg &= ~(HPET_CFG_ENABLE | HPET_CFG_LEGACY); - hpet_writel(cfg, HPET_CFG); - hpet_writel(0, HPET_COUNTER); - hpet_writel(0, HPET_COUNTER + 4); - -/* - * Set up timer 0, as periodic with first interrupt to happen at hpet_tick, - * and period also hpet_tick. - */ - if (hpet_use_timer) { - hpet_writel(HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL | - HPET_TN_32BIT, HPET_T0_CFG); - hpet_writel(hpet_tick, HPET_T0_CMP); - hpet_writel(hpet_tick, HPET_T0_CMP); /* AK: why twice? */ - cfg |= HPET_CFG_LEGACY; - } -/* - * Go! - */ - - cfg |= HPET_CFG_ENABLE; - hpet_writel(cfg, HPET_CFG); - - return 0; -} - -static int hpet_init(void) -{ - unsigned int id; - - if (!vxtime.hpet_address) - return -1; - set_fixmap_nocache(FIX_HPET_BASE, vxtime.hpet_address); - __set_fixmap(VSYSCALL_HPET, vxtime.hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE); - -/* - * Read the period, compute tick and quotient. - */ - - id = hpet_readl(HPET_ID); - - if (!(id & HPET_ID_VENDOR) || !(id & HPET_ID_NUMBER)) - return -1; - - hpet_period = hpet_readl(HPET_PERIOD); - if (hpet_period < 100000 || hpet_period > 100000000) - return -1; - - hpet_tick = (1000000000L * (USEC_PER_SEC / HZ) + hpet_period / 2) / - hpet_period; - - hpet_use_timer = (id & HPET_ID_LEGSUP); - - return hpet_timer_stop_set_go(hpet_tick); -} - -static int hpet_reenable(void) -{ - return hpet_timer_stop_set_go(hpet_tick); -} - -void __init pit_init(void) -{ - unsigned long flags; - - spin_lock_irqsave(&i8253_lock, flags); - outb_p(0x34, 0x43); /* binary, mode 2, LSB/MSB, ch 0 */ - outb_p(LATCH & 0xff, 0x40); /* LSB */ - outb_p(LATCH >> 8, 0x40); /* MSB */ - spin_unlock_irqrestore(&i8253_lock, flags); -} - -int __init time_setup(char *str) -{ - report_lost_ticks = 1; - return 1; -} - -static struct irqaction irq0 = { - timer_interrupt, SA_INTERRUPT, CPU_MASK_NONE, "timer", NULL, NULL -}; - -extern void __init config_acpi_tables(void); - -void __init time_init(void) -{ - char *timename; - -#ifdef HPET_HACK_ENABLE_DANGEROUS - if (!vxtime.hpet_address) { - printk(KERN_WARNING "time.c: WARNING: Enabling HPET base " - "manually!\n"); - outl(0x800038a0, 0xcf8); - outl(0xff000001, 0xcfc); - outl(0x800038a0, 0xcf8); - vxtime.hpet_address = inl(0xcfc) & 0xfffffffe; - printk(KERN_WARNING "time.c: WARNING: Enabled HPET " - "at %#lx.\n", vxtime.hpet_address); - } -#endif - if (nohpet) - vxtime.hpet_address = 0; - - xtime.tv_sec = get_cmos_time(); - xtime.tv_nsec = 0; - - set_normalized_timespec(&wall_to_monotonic, - -xtime.tv_sec, -xtime.tv_nsec); - - if (!hpet_init()) - vxtime_hz = (1000000000000000L + hpet_period / 2) / - hpet_period; - - if (hpet_use_timer) { - cpu_khz = hpet_calibrate_tsc(); - timename = "HPET"; -#ifdef CONFIG_X86_PM_TIMER - } else if (pmtmr_ioport) { - vxtime_hz = PM_TIMER_FREQUENCY; - timename = "PM"; - pit_init(); - cpu_khz = pit_calibrate_tsc(); -#endif - } else { - pit_init(); - cpu_khz = pit_calibrate_tsc(); - timename = "PIT"; - } - - printk(KERN_INFO "time.c: Using %ld.%06ld MHz %s timer.\n", - vxtime_hz / 1000000, vxtime_hz % 1000000, timename); - printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n", - cpu_khz / 1000, cpu_khz % 1000); - vxtime.mode = VXTIME_TSC; - vxtime.quot = (1000000L << 32) / vxtime_hz; - vxtime.tsc_quot = (1000L << 32) / cpu_khz; - vxtime.hz = vxtime_hz; - rdtscll_sync(&vxtime.last_tsc); - setup_irq(0, &irq0); - - set_cyc2ns_scale(cpu_khz / 1000); - -#ifndef CONFIG_SMP - time_init_gtod(); -#endif -} - -/* - * Make an educated guess if the TSC is trustworthy and synchronized - * over all CPUs. - */ -static __init int unsynchronized_tsc(void) -{ -#ifdef CONFIG_SMP - if (oem_force_hpet_timer()) - return 1; - /* Intel systems are normally all synchronized. Exceptions - are handled in the OEM check above. */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) - return 0; - /* All in a single socket - should be synchronized */ - if (cpus_weight(cpu_core_map[0]) == num_online_cpus()) - return 0; -#endif - /* Assume multi socket systems are not synchronized */ - return num_online_cpus() > 1; -} - -/* - * Decide after all CPUs are booted what mode gettimeofday should use. - */ -void __init time_init_gtod(void) -{ - char *timetype; - - if (unsynchronized_tsc()) - notsc = 1; - if (vxtime.hpet_address && notsc) { - timetype = hpet_use_timer ? "HPET" : "PIT/HPET"; - vxtime.last = hpet_readl(HPET_T0_CMP) - hpet_tick; - vxtime.mode = VXTIME_HPET; - do_gettimeoffset = do_gettimeoffset_hpet; -#ifdef CONFIG_X86_PM_TIMER - /* Using PM for gettimeofday is quite slow, but we have no other - choice because the TSC is too unreliable on some systems. */ - } else if (pmtmr_ioport && !vxtime.hpet_address && notsc) { - timetype = "PM"; - do_gettimeoffset = do_gettimeoffset_pm; - vxtime.mode = VXTIME_PMTMR; - sysctl_vsyscall = 0; - printk(KERN_INFO "Disabling vsyscall due to use of PM timer\n"); -#endif - } else { - timetype = hpet_use_timer ? "HPET/TSC" : "PIT/TSC"; - vxtime.mode = VXTIME_TSC; - } - - printk(KERN_INFO "time.c: Using %s based timekeeping.\n", timetype); -} - -__setup("report_lost_ticks", time_setup); - -static long clock_cmos_diff; -static unsigned long sleep_start; - -static int timer_suspend(struct sys_device *dev, pm_message_t state) -{ - /* - * Estimate time zone so that set_time can update the clock - */ - long cmos_time = get_cmos_time(); - - clock_cmos_diff = -cmos_time; - clock_cmos_diff += get_seconds(); - sleep_start = cmos_time; - return 0; -} - -static int timer_resume(struct sys_device *dev) -{ - unsigned long flags; - unsigned long sec; - unsigned long ctime = get_cmos_time(); - unsigned long sleep_length = (ctime - sleep_start) * HZ; - - if (vxtime.hpet_address) - hpet_reenable(); - else - i8254_timer_resume(); - - sec = ctime + clock_cmos_diff; - write_seqlock_irqsave(&xtime_lock,flags); - xtime.tv_sec = sec; - xtime.tv_nsec = 0; - write_sequnlock_irqrestore(&xtime_lock,flags); - jiffies += sleep_length; - wall_jiffies += sleep_length; - return 0; -} - -static struct sysdev_class timer_sysclass = { - .resume = timer_resume, - .suspend = timer_suspend, - set_kset_name("timer"), -}; - - -/* XXX this driverfs stuff should probably go elsewhere later -john */ -static struct sys_device device_timer = { - .id = 0, - .cls = &timer_sysclass, -}; - -static int time_init_device(void) -{ - int error = sysdev_class_register(&timer_sysclass); - if (!error) - error = sysdev_register(&device_timer); - return error; -} - -device_initcall(time_init_device); - -#ifdef CONFIG_HPET_EMULATE_RTC -/* HPET in LegacyReplacement Mode eats up RTC interrupt line. When, HPET - * is enabled, we support RTC interrupt functionality in software. - * RTC has 3 kinds of interrupts: - * 1) Update Interrupt - generate an interrupt, every sec, when RTC clock - * is updated - * 2) Alarm Interrupt - generate an interrupt at a specific time of day - * 3) Periodic Interrupt - generate periodic interrupt, with frequencies - * 2Hz-8192Hz (2Hz-64Hz for non-root user) (all freqs in powers of 2) - * (1) and (2) above are implemented using polling at a frequency of - * 64 Hz. The exact frequency is a tradeoff between accuracy and interrupt - * overhead. (DEFAULT_RTC_INT_FREQ) - * For (3), we use interrupts at 64Hz or user specified periodic - * frequency, whichever is higher. - */ -#include <linux/rtc.h> - -extern irqreturn_t rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs); - -#define DEFAULT_RTC_INT_FREQ 64 -#define RTC_NUM_INTS 1 - -static unsigned long UIE_on; -static unsigned long prev_update_sec; - -static unsigned long AIE_on; -static struct rtc_time alarm_time; - -static unsigned long PIE_on; -static unsigned long PIE_freq = DEFAULT_RTC_INT_FREQ; -static unsigned long PIE_count; - -static unsigned long hpet_rtc_int_freq; /* RTC interrupt frequency */ - -int is_hpet_enabled(void) -{ - return vxtime.hpet_address != 0; -} - -/* - * Timer 1 for RTC, we do not use periodic interrupt feature, - * even if HPET supports periodic interrupts on Timer 1. - * The reason being, to set up a periodic interrupt in HPET, we need to - * stop the main counter. And if we do that everytime someone diables/enables - * RTC, we will have adverse effect on main kernel timer running on Timer 0. - * So, for the time being, simulate the periodic interrupt in software. - * - * hpet_rtc_timer_init() is called for the first time and during subsequent - * interuppts reinit happens through hpet_rtc_timer_reinit(). - */ -int hpet_rtc_timer_init(void) -{ - unsigned int cfg, cnt; - unsigned long flags; - - if (!is_hpet_enabled()) - return 0; - /* - * Set the counter 1 and enable the interrupts. - */ - if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ)) - hpet_rtc_int_freq = PIE_freq; - else - hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ; - - local_irq_save(flags); - cnt = hpet_readl(HPET_COUNTER); - cnt += ((hpet_tick*HZ)/hpet_rtc_int_freq); - hpet_writel(cnt, HPET_T1_CMP); - local_irq_restore(flags); - - cfg = hpet_readl(HPET_T1_CFG); - cfg |= HPET_TN_ENABLE | HPET_TN_SETVAL | HPET_TN_32BIT; - hpet_writel(cfg, HPET_T1_CFG); - - return 1; -} - -static void hpet_rtc_timer_reinit(void) -{ - unsigned int cfg, cnt; - - if (!(PIE_on | AIE_on | UIE_on)) - return; - - if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ)) - hpet_rtc_int_freq = PIE_freq; - else - hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ; - - /* It is more accurate to use the comparator value than current count.*/ - cnt = hpet_readl(HPET_T1_CMP); - cnt += hpet_tick*HZ/hpet_rtc_int_freq; - hpet_writel(cnt, HPET_T1_CMP); - - cfg = hpet_readl(HPET_T1_CFG); - cfg |= HPET_TN_ENABLE | HPET_TN_SETVAL | HPET_TN_32BIT; - hpet_writel(cfg, HPET_T1_CFG); - - return; -} - -/* - * The functions below are called from rtc driver. - * Return 0 if HPET is not being used. - * Otherwise do the necessary changes and return 1. - */ -int hpet_mask_rtc_irq_bit(unsigned long bit_mask) -{ - if (!is_hpet_enabled()) - return 0; - - if (bit_mask & RTC_UIE) - UIE_on = 0; - if (bit_mask & RTC_PIE) - PIE_on = 0; - if (bit_mask & RTC_AIE) - AIE_on = 0; - - return 1; -} - -int hpet_set_rtc_irq_bit(unsigned long bit_mask) -{ - int timer_init_reqd = 0; - - if (!is_hpet_enabled()) - return 0; - - if (!(PIE_on | AIE_on | UIE_on)) - timer_init_reqd = 1; - - if (bit_mask & RTC_UIE) { - UIE_on = 1; - } - if (bit_mask & RTC_PIE) { - PIE_on = 1; - PIE_count = 0; - } - if (bit_mask & RTC_AIE) { - AIE_on = 1; - } - - if (timer_init_reqd) - hpet_rtc_timer_init(); - - return 1; -} - -int hpet_set_alarm_time(unsigned char hrs, unsigned char min, unsigned char sec) -{ - if (!is_hpet_enabled()) - return 0; - - alarm_time.tm_hour = hrs; - alarm_time.tm_min = min; - alarm_time.tm_sec = sec; - - return 1; -} - -int hpet_set_periodic_freq(unsigned long freq) -{ - if (!is_hpet_enabled()) - return 0; - - PIE_freq = freq; - PIE_count = 0; - - return 1; -} - -int hpet_rtc_dropped_irq(void) -{ - if (!is_hpet_enabled()) - return 0; - - return 1; -} - -irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs) -{ - struct rtc_time curr_time; - unsigned long rtc_int_flag = 0; - int call_rtc_interrupt = 0; - - hpet_rtc_timer_reinit(); - - if (UIE_on | AIE_on) { - rtc_get_rtc_time(&curr_time); - } - if (UIE_on) { - if (curr_time.tm_sec != prev_update_sec) { - /* Set update int info, call real rtc int routine */ - call_rtc_interrupt = 1; - rtc_int_flag = RTC_UF; - prev_update_sec = curr_time.tm_sec; - } - } - if (PIE_on) { - PIE_count++; - if (PIE_count >= hpet_rtc_int_freq/PIE_freq) { - /* Set periodic int info, call real rtc int routine */ - call_rtc_interrupt = 1; - rtc_int_flag |= RTC_PF; - PIE_count = 0; - } - } - if (AIE_on) { - if ((curr_time.tm_sec == alarm_time.tm_sec) && - (curr_time.tm_min == alarm_time.tm_min) && - (curr_time.tm_hour == alarm_time.tm_hour)) { - /* Set alarm int info, call real rtc int routine */ - call_rtc_interrupt = 1; - rtc_int_flag |= RTC_AF; - } - } - if (call_rtc_interrupt) { - rtc_int_flag |= (RTC_IRQF | (RTC_NUM_INTS << 8)); - rtc_interrupt(rtc_int_flag, dev_id, regs); - } - return IRQ_HANDLED; -} -#endif - - - -static int __init nohpet_setup(char *s) -{ - nohpet = 1; - return 0; -} - -__setup("nohpet", nohpet_setup); - - -static int __init notsc_setup(char *s) -{ - notsc = 1; - return 0; -} - -__setup("notsc", notsc_setup); - - diff --git a/arch/x86_64/kernel/trampoline.S b/arch/x86_64/kernel/trampoline.S deleted file mode 100644 index 6d9c9a8e7d0..00000000000 --- a/arch/x86_64/kernel/trampoline.S +++ /dev/null @@ -1,64 +0,0 @@ -/* - * - * Trampoline.S Derived from Setup.S by Linus Torvalds - * - * 4 Jan 1997 Michael Chastain: changed to gnu as. - * - * Entry: CS:IP point to the start of our code, we are - * in real mode with no stack, but the rest of the - * trampoline page to make our stack and everything else - * is a mystery. - * - * In fact we don't actually need a stack so we don't - * set one up. - * - * On entry to trampoline_data, the processor is in real mode - * with 16-bit addressing and 16-bit data. CS has some value - * and IP is zero. Thus, data addresses need to be absolute - * (no relocation) and are taken with regard to r_base. - * - * If you work on this file, check the object module with objdump - * --full-contents --reloc to make sure there are no relocation - * entries. For the GDT entry we do hand relocation in smpboot.c - * because of 64bit linker limitations. - */ - -#include <linux/linkage.h> -#include <asm/segment.h> -#include <asm/page.h> - -.data - -.code16 - -ENTRY(trampoline_data) -r_base = . - wbinvd - mov %cs, %ax # Code and data in the same place - mov %ax, %ds - - cli # We should be safe anyway - - movl $0xA5A5A5A5, trampoline_data - r_base - # write marker for master knows we're running - - lidt idt_48 - r_base # load idt with 0, 0 - lgdt gdt_48 - r_base # load gdt with whatever is appropriate - - xor %ax, %ax - inc %ax # protected mode (PE) bit - lmsw %ax # into protected mode - # flaush prefetch and jump to startup_32 in arch/x86_64/kernel/head.S - ljmpl $__KERNEL32_CS, $(startup_32-__START_KERNEL_map) - - # Careful these need to be in the same 64K segment as the above; -idt_48: - .word 0 # idt limit = 0 - .word 0, 0 # idt base = 0L - -gdt_48: - .short __KERNEL32_CS + 7 # gdt limit - .long cpu_gdt_table-__START_KERNEL_map - -.globl trampoline_end -trampoline_end: diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c deleted file mode 100644 index 6ead433a388..00000000000 --- a/arch/x86_64/kernel/traps.c +++ /dev/null @@ -1,957 +0,0 @@ -/* - * linux/arch/x86-64/traps.c - * - * Copyright (C) 1991, 1992 Linus Torvalds - * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs - * - * Pentium III FXSR, SSE support - * Gareth Hughes <gareth@valinux.com>, May 2000 - * - * $Id: traps.c,v 1.36 2002/03/24 11:09:10 ak Exp $ - */ - -/* - * 'Traps.c' handles hardware traps and faults after we have saved some - * state in 'entry.S'. - */ -#include <linux/config.h> -#include <linux/sched.h> -#include <linux/kernel.h> -#include <linux/string.h> -#include <linux/errno.h> -#include <linux/ptrace.h> -#include <linux/timer.h> -#include <linux/mm.h> -#include <linux/init.h> -#include <linux/delay.h> -#include <linux/spinlock.h> -#include <linux/interrupt.h> -#include <linux/module.h> -#include <linux/moduleparam.h> -#include <linux/nmi.h> - -#include <asm/system.h> -#include <asm/uaccess.h> -#include <asm/io.h> -#include <asm/atomic.h> -#include <asm/debugreg.h> -#include <asm/desc.h> -#include <asm/i387.h> -#include <asm/kdebug.h> -#include <asm/processor.h> - -#include <asm/smp.h> -#include <asm/pgalloc.h> -#include <asm/pda.h> -#include <asm/proto.h> -#include <asm/nmi.h> - -#include <linux/irq.h> - - -extern struct gate_struct idt_table[256]; - -asmlinkage void divide_error(void); -asmlinkage void debug(void); -asmlinkage void nmi(void); -asmlinkage void int3(void); -asmlinkage void overflow(void); -asmlinkage void bounds(void); -asmlinkage void invalid_op(void); -asmlinkage void device_not_available(void); -asmlinkage void double_fault(void); -asmlinkage void coprocessor_segment_overrun(void); -asmlinkage void invalid_TSS(void); -asmlinkage void segment_not_present(void); -asmlinkage void stack_segment(void); -asmlinkage void general_protection(void); -asmlinkage void page_fault(void); -asmlinkage void coprocessor_error(void); -asmlinkage void simd_coprocessor_error(void); -asmlinkage void reserved(void); -asmlinkage void alignment_check(void); -asmlinkage void machine_check(void); -asmlinkage void spurious_interrupt_bug(void); -asmlinkage void call_debug(void); - -struct notifier_block *die_chain; -static DEFINE_SPINLOCK(die_notifier_lock); - -int register_die_notifier(struct notifier_block *nb) -{ - int err = 0; - unsigned long flags; - spin_lock_irqsave(&die_notifier_lock, flags); - err = notifier_chain_register(&die_chain, nb); - spin_unlock_irqrestore(&die_notifier_lock, flags); - return err; -} - -static inline void conditional_sti(struct pt_regs *regs) -{ - if (regs->eflags & X86_EFLAGS_IF) - local_irq_enable(); -} - -static int kstack_depth_to_print = 10; - -#ifdef CONFIG_KALLSYMS -#include <linux/kallsyms.h> -int printk_address(unsigned long address) -{ - unsigned long offset = 0, symsize; - const char *symname; - char *modname; - char *delim = ":"; - char namebuf[128]; - - symname = kallsyms_lookup(address, &symsize, &offset, &modname, namebuf); - if (!symname) - return printk("[<%016lx>]", address); - if (!modname) - modname = delim = ""; - return printk("<%016lx>{%s%s%s%s%+ld}", - address,delim,modname,delim,symname,offset); -} -#else -int printk_address(unsigned long address) -{ - return printk("[<%016lx>]", address); -} -#endif - -static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, - unsigned *usedp, const char **idp) -{ - static const char ids[N_EXCEPTION_STACKS][8] = { - [DEBUG_STACK - 1] = "#DB", - [NMI_STACK - 1] = "NMI", - [DOUBLEFAULT_STACK - 1] = "#DF", - [STACKFAULT_STACK - 1] = "#SS", - [MCE_STACK - 1] = "#MC", - }; - unsigned k; - - for (k = 0; k < N_EXCEPTION_STACKS; k++) { - unsigned long end; - - end = per_cpu(init_tss, cpu).ist[k]; - if (stack >= end) - continue; - if (stack >= end - EXCEPTION_STKSZ) { - if (*usedp & (1U << k)) - break; - *usedp |= 1U << k; - *idp = ids[k]; - return (unsigned long *)end; - } - } - return NULL; -} - -/* - * x86-64 can have upto three kernel stacks: - * process stack - * interrupt stack - * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack - */ - -void show_trace(unsigned long *stack) -{ - unsigned long addr; - const unsigned cpu = safe_smp_processor_id(); - unsigned long *irqstack_end = (unsigned long *)cpu_pda[cpu].irqstackptr; - int i; - unsigned used = 0; - - printk("\nCall Trace:"); - -#define HANDLE_STACK(cond) \ - do while (cond) { \ - addr = *stack++; \ - if (kernel_text_address(addr)) { \ - /* \ - * If the address is either in the text segment of the \ - * kernel, or in the region which contains vmalloc'ed \ - * memory, it *may* be the address of a calling \ - * routine; if so, print it so that someone tracing \ - * down the cause of the crash will be able to figure \ - * out the call path that was taken. \ - */ \ - i += printk_address(addr); \ - if (i > 50) { \ - printk("\n "); \ - i = 0; \ - } \ - else \ - i += printk(" "); \ - } \ - } while (0) - - for(i = 0; ; ) { - const char *id; - unsigned long *estack_end; - estack_end = in_exception_stack(cpu, (unsigned long)stack, - &used, &id); - - if (estack_end) { - i += printk(" <%s> ", id); - HANDLE_STACK (stack < estack_end); - i += printk(" <EOE> "); - stack = (unsigned long *) estack_end[-2]; - continue; - } - if (irqstack_end) { - unsigned long *irqstack; - irqstack = irqstack_end - - (IRQSTACKSIZE - 64) / sizeof(*irqstack); - - if (stack >= irqstack && stack < irqstack_end) { - i += printk(" <IRQ> "); - HANDLE_STACK (stack < irqstack_end); - stack = (unsigned long *) (irqstack_end[-1]); - irqstack_end = NULL; - i += printk(" <EOI> "); - continue; - } - } - break; - } - - HANDLE_STACK (((long) stack & (THREAD_SIZE-1)) != 0); -#undef HANDLE_STACK - printk("\n"); -} - -void show_stack(struct task_struct *tsk, unsigned long * rsp) -{ - unsigned long *stack; - int i; - const int cpu = safe_smp_processor_id(); - unsigned long *irqstack_end = (unsigned long *) (cpu_pda[cpu].irqstackptr); - unsigned long *irqstack = (unsigned long *) (cpu_pda[cpu].irqstackptr - IRQSTACKSIZE); - - // debugging aid: "show_stack(NULL, NULL);" prints the - // back trace for this cpu. - - if (rsp == NULL) { - if (tsk) - rsp = (unsigned long *)tsk->thread.rsp; - else - rsp = (unsigned long *)&rsp; - } - - stack = rsp; - for(i=0; i < kstack_depth_to_print; i++) { - if (stack >= irqstack && stack <= irqstack_end) { - if (stack == irqstack_end) { - stack = (unsigned long *) (irqstack_end[-1]); - printk(" <EOI> "); - } - } else { - if (((long) stack & (THREAD_SIZE-1)) == 0) - break; - } - if (i && ((i % 4) == 0)) - printk("\n "); - printk("%016lx ", *stack++); - touch_nmi_watchdog(); - } - show_trace((unsigned long *)rsp); -} - -/* - * The architecture-independent dump_stack generator - */ -void dump_stack(void) -{ - unsigned long dummy; - show_trace(&dummy); -} - -EXPORT_SYMBOL(dump_stack); - -void show_registers(struct pt_regs *regs) -{ - int i; - int in_kernel = !user_mode(regs); - unsigned long rsp; - const int cpu = safe_smp_processor_id(); - struct task_struct *cur = cpu_pda[cpu].pcurrent; - - rsp = regs->rsp; - - printk("CPU %d ", cpu); - __show_regs(regs); - printk("Process %s (pid: %d, threadinfo %p, task %p)\n", - cur->comm, cur->pid, cur->thread_info, cur); - - /* - * When in-kernel, we also print out the stack and code at the - * time of the fault.. - */ - if (in_kernel) { - - printk("Stack: "); - show_stack(NULL, (unsigned long*)rsp); - - printk("\nCode: "); - if(regs->rip < PAGE_OFFSET) - goto bad; - - for(i=0;i<20;i++) - { - unsigned char c; - if(__get_user(c, &((unsigned char*)regs->rip)[i])) { -bad: - printk(" Bad RIP value."); - break; - } - printk("%02x ", c); - } - } - printk("\n"); -} - -void handle_BUG(struct pt_regs *regs) -{ - struct bug_frame f; - char tmp; - - if (user_mode(regs)) - return; - if (__copy_from_user(&f, (struct bug_frame *) regs->rip, - sizeof(struct bug_frame))) - return; - if ((unsigned long)f.filename < __PAGE_OFFSET || - f.ud2[0] != 0x0f || f.ud2[1] != 0x0b) - return; - if (__get_user(tmp, f.filename)) - f.filename = "unmapped filename"; - printk("----------- [cut here ] --------- [please bite here ] ---------\n"); - printk(KERN_ALERT "Kernel BUG at %.50s:%d\n", f.filename, f.line); -} - -#ifdef CONFIG_BUG -void out_of_line_bug(void) -{ - BUG(); -} -#endif - -static DEFINE_SPINLOCK(die_lock); -static int die_owner = -1; - -void oops_begin(void) -{ - int cpu = safe_smp_processor_id(); - /* racy, but better than risking deadlock. */ - local_irq_disable(); - if (!spin_trylock(&die_lock)) { - if (cpu == die_owner) - /* nested oops. should stop eventually */; - else - spin_lock(&die_lock); - } - die_owner = cpu; - console_verbose(); - bust_spinlocks(1); -} - -void oops_end(void) -{ - die_owner = -1; - bust_spinlocks(0); - spin_unlock(&die_lock); - if (panic_on_oops) - panic("Oops"); -} - -void __die(const char * str, struct pt_regs * regs, long err) -{ - static int die_counter; - printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff,++die_counter); -#ifdef CONFIG_PREEMPT - printk("PREEMPT "); -#endif -#ifdef CONFIG_SMP - printk("SMP "); -#endif -#ifdef CONFIG_DEBUG_PAGEALLOC - printk("DEBUG_PAGEALLOC"); -#endif - printk("\n"); - notify_die(DIE_OOPS, (char *)str, regs, err, 255, SIGSEGV); - show_registers(regs); - /* Executive summary in case the oops scrolled away */ - printk(KERN_ALERT "RIP "); - printk_address(regs->rip); - printk(" RSP <%016lx>\n", regs->rsp); -} - -void die(const char * str, struct pt_regs * regs, long err) -{ - oops_begin(); - handle_BUG(regs); - __die(str, regs, err); - oops_end(); - do_exit(SIGSEGV); -} -static inline void die_if_kernel(const char * str, struct pt_regs * regs, long err) -{ - if (!(regs->eflags & VM_MASK) && (regs->cs == __KERNEL_CS)) - die(str, regs, err); -} - -void die_nmi(char *str, struct pt_regs *regs) -{ - oops_begin(); - /* - * We are in trouble anyway, lets at least try - * to get a message out. - */ - printk(str, safe_smp_processor_id()); - show_registers(regs); - if (panic_on_timeout || panic_on_oops) - panic("nmi watchdog"); - printk("console shuts up ...\n"); - oops_end(); - do_exit(SIGSEGV); -} - -static void do_trap(int trapnr, int signr, char *str, - struct pt_regs * regs, long error_code, siginfo_t *info) -{ - conditional_sti(regs); - -#ifdef CONFIG_CHECKING - { - unsigned long gs; - struct x8664_pda *pda = cpu_pda + safe_smp_processor_id(); - rdmsrl(MSR_GS_BASE, gs); - if (gs != (unsigned long)pda) { - wrmsrl(MSR_GS_BASE, pda); - printk("%s: wrong gs %lx expected %p rip %lx\n", str, gs, pda, - regs->rip); - } - } -#endif - - if (user_mode(regs)) { - struct task_struct *tsk = current; - - if (exception_trace && unhandled_signal(tsk, signr)) - printk(KERN_INFO - "%s[%d] trap %s rip:%lx rsp:%lx error:%lx\n", - tsk->comm, tsk->pid, str, - regs->rip,regs->rsp,error_code); - - tsk->thread.error_code = error_code; - tsk->thread.trap_no = trapnr; - if (info) - force_sig_info(signr, info, tsk); - else - force_sig(signr, tsk); - return; - } - - - /* kernel trap */ - { - const struct exception_table_entry *fixup; - fixup = search_exception_tables(regs->rip); - if (fixup) { - regs->rip = fixup->fixup; - } else - die(str, regs, error_code); - return; - } -} - -#define DO_ERROR(trapnr, signr, str, name) \ -asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ -{ \ - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ - == NOTIFY_STOP) \ - return; \ - do_trap(trapnr, signr, str, regs, error_code, NULL); \ -} - -#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ -asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ -{ \ - siginfo_t info; \ - info.si_signo = signr; \ - info.si_errno = 0; \ - info.si_code = sicode; \ - info.si_addr = (void __user *)siaddr; \ - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ - == NOTIFY_STOP) \ - return; \ - do_trap(trapnr, signr, str, regs, error_code, &info); \ -} - -DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->rip) -DO_ERROR( 4, SIGSEGV, "overflow", overflow) -DO_ERROR( 5, SIGSEGV, "bounds", bounds) -DO_ERROR_INFO( 6, SIGILL, "invalid operand", invalid_op, ILL_ILLOPN, regs->rip) -DO_ERROR( 7, SIGSEGV, "device not available", device_not_available) -DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) -DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) -DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) -DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0) -DO_ERROR(18, SIGSEGV, "reserved", reserved) -DO_ERROR(12, SIGBUS, "stack segment", stack_segment) -DO_ERROR( 8, SIGSEGV, "double fault", double_fault) - -asmlinkage void do_general_protection(struct pt_regs * regs, long error_code) -{ - conditional_sti(regs); - -#ifdef CONFIG_CHECKING - { - unsigned long gs; - struct x8664_pda *pda = cpu_pda + safe_smp_processor_id(); - rdmsrl(MSR_GS_BASE, gs); - if (gs != (unsigned long)pda) { - wrmsrl(MSR_GS_BASE, pda); - oops_in_progress++; - printk("general protection handler: wrong gs %lx expected %p\n", gs, pda); - oops_in_progress--; - } - } -#endif - - if (user_mode(regs)) { - struct task_struct *tsk = current; - - if (exception_trace && unhandled_signal(tsk, SIGSEGV)) - printk(KERN_INFO - "%s[%d] general protection rip:%lx rsp:%lx error:%lx\n", - tsk->comm, tsk->pid, - regs->rip,regs->rsp,error_code); - - tsk->thread.error_code = error_code; - tsk->thread.trap_no = 13; - force_sig(SIGSEGV, tsk); - return; - } - - /* kernel gp */ - { - const struct exception_table_entry *fixup; - fixup = search_exception_tables(regs->rip); - if (fixup) { - regs->rip = fixup->fixup; - return; - } - if (notify_die(DIE_GPF, "general protection fault", regs, - error_code, 13, SIGSEGV) == NOTIFY_STOP) - return; - die("general protection fault", regs, error_code); - } -} - -static void mem_parity_error(unsigned char reason, struct pt_regs * regs) -{ - printk("Uhhuh. NMI received. Dazed and confused, but trying to continue\n"); - printk("You probably have a hardware problem with your RAM chips\n"); - - /* Clear and disable the memory parity error line. */ - reason = (reason & 0xf) | 4; - outb(reason, 0x61); -} - -static void io_check_error(unsigned char reason, struct pt_regs * regs) -{ - printk("NMI: IOCK error (debug interrupt?)\n"); - show_registers(regs); - - /* Re-enable the IOCK line, wait for a few seconds */ - reason = (reason & 0xf) | 8; - outb(reason, 0x61); - mdelay(2000); - reason &= ~8; - outb(reason, 0x61); -} - -static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs) -{ printk("Uhhuh. NMI received for unknown reason %02x.\n", reason); - printk("Dazed and confused, but trying to continue\n"); - printk("Do you have a strange power saving mode enabled?\n"); -} - -/* Runs on IST stack. This code must keep interrupts off all the time. - Nested NMIs are prevented by the CPU. */ -asmlinkage void default_do_nmi(struct pt_regs *regs) -{ - unsigned char reason = 0; - int cpu; - - cpu = smp_processor_id(); - - /* Only the BSP gets external NMIs from the system. */ - if (!cpu) - reason = get_nmi_reason(); - - if (!(reason & 0xc0)) { - if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 0, SIGINT) - == NOTIFY_STOP) - return; -#ifdef CONFIG_X86_LOCAL_APIC - /* - * Ok, so this is none of the documented NMI sources, - * so it must be the NMI watchdog. - */ - if (nmi_watchdog > 0) { - nmi_watchdog_tick(regs,reason); - return; - } -#endif - unknown_nmi_error(reason, regs); - return; - } - if (notify_die(DIE_NMI, "nmi", regs, reason, 0, SIGINT) == NOTIFY_STOP) - return; - - /* AK: following checks seem to be broken on modern chipsets. FIXME */ - - if (reason & 0x80) - mem_parity_error(reason, regs); - if (reason & 0x40) - io_check_error(reason, regs); -} - -asmlinkage void do_int3(struct pt_regs * regs, long error_code) -{ - if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) { - return; - } - do_trap(3, SIGTRAP, "int3", regs, error_code, NULL); - return; -} - -/* Help handler running on IST stack to switch back to user stack - for scheduling or signal handling. The actual stack switch is done in - entry.S */ -asmlinkage struct pt_regs *sync_regs(struct pt_regs *eregs) -{ - struct pt_regs *regs = eregs; - /* Did already sync */ - if (eregs == (struct pt_regs *)eregs->rsp) - ; - /* Exception from user space */ - else if (user_mode(eregs)) - regs = ((struct pt_regs *)current->thread.rsp0) - 1; - /* Exception from kernel and interrupts are enabled. Move to - kernel process stack. */ - else if (eregs->eflags & X86_EFLAGS_IF) - regs = (struct pt_regs *)(eregs->rsp -= sizeof(struct pt_regs)); - if (eregs != regs) - *regs = *eregs; - return regs; -} - -/* runs on IST stack. */ -asmlinkage void do_debug(struct pt_regs * regs, unsigned long error_code) -{ - unsigned long condition; - struct task_struct *tsk = current; - siginfo_t info; - -#ifdef CONFIG_CHECKING - { - /* RED-PEN interaction with debugger - could destroy gs */ - unsigned long gs; - struct x8664_pda *pda = cpu_pda + safe_smp_processor_id(); - rdmsrl(MSR_GS_BASE, gs); - if (gs != (unsigned long)pda) { - wrmsrl(MSR_GS_BASE, pda); - printk("debug handler: wrong gs %lx expected %p\n", gs, pda); - } - } -#endif - - get_debugreg(condition, 6); - - if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, - SIGTRAP) == NOTIFY_STOP) - return; - - conditional_sti(regs); - - /* Mask out spurious debug traps due to lazy DR7 setting */ - if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { - if (!tsk->thread.debugreg7) { - goto clear_dr7; - } - } - - tsk->thread.debugreg6 = condition; - - /* Mask out spurious TF errors due to lazy TF clearing */ - if (condition & DR_STEP) { - /* - * The TF error should be masked out only if the current - * process is not traced and if the TRAP flag has been set - * previously by a tracing process (condition detected by - * the PT_DTRACE flag); remember that the i386 TRAP flag - * can be modified by the process itself in user mode, - * allowing programs to debug themselves without the ptrace() - * interface. - */ - if (!user_mode(regs)) - goto clear_TF_reenable; - /* - * Was the TF flag set by a debugger? If so, clear it now, - * so that register information is correct. - */ - if (tsk->ptrace & PT_DTRACE) { - regs->eflags &= ~TF_MASK; - tsk->ptrace &= ~PT_DTRACE; - } - } - - /* Ok, finally something we can handle */ - tsk->thread.trap_no = 1; - tsk->thread.error_code = error_code; - info.si_signo = SIGTRAP; - info.si_errno = 0; - info.si_code = TRAP_BRKPT; - if (!user_mode(regs)) - goto clear_dr7; - - info.si_addr = (void __user *)regs->rip; - force_sig_info(SIGTRAP, &info, tsk); -clear_dr7: - set_debugreg(0UL, 7); - return; - -clear_TF_reenable: - set_tsk_thread_flag(tsk, TIF_SINGLESTEP); - regs->eflags &= ~TF_MASK; -} - -static int kernel_math_error(struct pt_regs *regs, char *str) -{ - const struct exception_table_entry *fixup; - fixup = search_exception_tables(regs->rip); - if (fixup) { - regs->rip = fixup->fixup; - return 1; - } - notify_die(DIE_GPF, str, regs, 0, 16, SIGFPE); - /* Illegal floating point operation in the kernel */ - die(str, regs, 0); - return 0; -} - -/* - * Note that we play around with the 'TS' bit in an attempt to get - * the correct behaviour even in the presence of the asynchronous - * IRQ13 behaviour - */ -asmlinkage void do_coprocessor_error(struct pt_regs *regs) -{ - void __user *rip = (void __user *)(regs->rip); - struct task_struct * task; - siginfo_t info; - unsigned short cwd, swd; - - conditional_sti(regs); - if (!user_mode(regs) && - kernel_math_error(regs, "kernel x87 math error")) - return; - - /* - * Save the info for the exception handler and clear the error. - */ - task = current; - save_init_fpu(task); - task->thread.trap_no = 16; - task->thread.error_code = 0; - info.si_signo = SIGFPE; - info.si_errno = 0; - info.si_code = __SI_FAULT; - info.si_addr = rip; - /* - * (~cwd & swd) will mask out exceptions that are not set to unmasked - * status. 0x3f is the exception bits in these regs, 0x200 is the - * C1 reg you need in case of a stack fault, 0x040 is the stack - * fault bit. We should only be taking one exception at a time, - * so if this combination doesn't produce any single exception, - * then we have a bad program that isn't synchronizing its FPU usage - * and it will suffer the consequences since we won't be able to - * fully reproduce the context of the exception - */ - cwd = get_fpu_cwd(task); - swd = get_fpu_swd(task); - switch (((~cwd) & swd & 0x3f) | (swd & 0x240)) { - case 0x000: - default: - break; - case 0x001: /* Invalid Op */ - case 0x041: /* Stack Fault */ - case 0x241: /* Stack Fault | Direction */ - info.si_code = FPE_FLTINV; - break; - case 0x002: /* Denormalize */ - case 0x010: /* Underflow */ - info.si_code = FPE_FLTUND; - break; - case 0x004: /* Zero Divide */ - info.si_code = FPE_FLTDIV; - break; - case 0x008: /* Overflow */ - info.si_code = FPE_FLTOVF; - break; - case 0x020: /* Precision */ - info.si_code = FPE_FLTRES; - break; - } - force_sig_info(SIGFPE, &info, task); -} - -asmlinkage void bad_intr(void) -{ - printk("bad interrupt"); -} - -asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs) -{ - void __user *rip = (void __user *)(regs->rip); - struct task_struct * task; - siginfo_t info; - unsigned short mxcsr; - - conditional_sti(regs); - if (!user_mode(regs) && - kernel_math_error(regs, "kernel simd math error")) - return; - - /* - * Save the info for the exception handler and clear the error. - */ - task = current; - save_init_fpu(task); - task->thread.trap_no = 19; - task->thread.error_code = 0; - info.si_signo = SIGFPE; - info.si_errno = 0; - info.si_code = __SI_FAULT; - info.si_addr = rip; - /* - * The SIMD FPU exceptions are handled a little differently, as there - * is only a single status/control register. Thus, to determine which - * unmasked exception was caught we must mask the exception mask bits - * at 0x1f80, and then use these to mask the exception bits at 0x3f. - */ - mxcsr = get_fpu_mxcsr(task); - switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) { - case 0x000: - default: - break; - case 0x001: /* Invalid Op */ - info.si_code = FPE_FLTINV; - break; - case 0x002: /* Denormalize */ - case 0x010: /* Underflow */ - info.si_code = FPE_FLTUND; - break; - case 0x004: /* Zero Divide */ - info.si_code = FPE_FLTDIV; - break; - case 0x008: /* Overflow */ - info.si_code = FPE_FLTOVF; - break; - case 0x020: /* Precision */ - info.si_code = FPE_FLTRES; - break; - } - force_sig_info(SIGFPE, &info, task); -} - -asmlinkage void do_spurious_interrupt_bug(struct pt_regs * regs) -{ -} - -asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) -{ -} - -/* - * 'math_state_restore()' saves the current math information in the - * old math state array, and gets the new ones from the current task - * - * Careful.. There are problems with IBM-designed IRQ13 behaviour. - * Don't touch unless you *really* know how it works. - */ -asmlinkage void math_state_restore(void) -{ - struct task_struct *me = current; - clts(); /* Allow maths ops (or we recurse) */ - - if (!used_math()) - init_fpu(me); - restore_fpu_checking(&me->thread.i387.fxsave); - me->thread_info->status |= TS_USEDFPU; -} - -void do_call_debug(struct pt_regs *regs) -{ - notify_die(DIE_CALL, "debug call", regs, 0, 255, SIGINT); -} - -void __init trap_init(void) -{ - set_intr_gate(0,÷_error); - set_intr_gate_ist(1,&debug,DEBUG_STACK); - set_intr_gate_ist(2,&nmi,NMI_STACK); - set_system_gate(3,&int3); - set_system_gate(4,&overflow); /* int4-5 can be called from all */ - set_system_gate(5,&bounds); - set_intr_gate(6,&invalid_op); - set_intr_gate(7,&device_not_available); - set_intr_gate_ist(8,&double_fault, DOUBLEFAULT_STACK); - set_intr_gate(9,&coprocessor_segment_overrun); - set_intr_gate(10,&invalid_TSS); - set_intr_gate(11,&segment_not_present); - set_intr_gate_ist(12,&stack_segment,STACKFAULT_STACK); - set_intr_gate(13,&general_protection); - set_intr_gate(14,&page_fault); - set_intr_gate(15,&spurious_interrupt_bug); - set_intr_gate(16,&coprocessor_error); - set_intr_gate(17,&alignment_check); -#ifdef CONFIG_X86_MCE - set_intr_gate_ist(18,&machine_check, MCE_STACK); -#endif - set_intr_gate(19,&simd_coprocessor_error); - -#ifdef CONFIG_IA32_EMULATION - set_system_gate(IA32_SYSCALL_VECTOR, ia32_syscall); -#endif - - set_intr_gate(KDB_VECTOR, call_debug); - - /* - * Should be a barrier for any external CPU state. - */ - cpu_init(); -} - - -/* Actual parsing is done early in setup.c. */ -static int __init oops_dummy(char *s) -{ - panic_on_oops = 1; - return -1; -} -__setup("oops=", oops_dummy); - -static int __init kstack_setup(char *s) -{ - kstack_depth_to_print = simple_strtoul(s,NULL,0); - return 0; -} -__setup("kstack=", kstack_setup); - diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S deleted file mode 100644 index 2a94f9b60b2..00000000000 --- a/arch/x86_64/kernel/vmlinux.lds.S +++ /dev/null @@ -1,212 +0,0 @@ -/* ld script to make x86-64 Linux kernel - * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>; - */ - -#define LOAD_OFFSET __START_KERNEL_map - -#include <asm-generic/vmlinux.lds.h> -#include <asm/page.h> -#include <linux/config.h> - -OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64") -OUTPUT_ARCH(i386:x86-64) -ENTRY(phys_startup_64) -jiffies_64 = jiffies; -SECTIONS -{ - . = __START_KERNEL; - phys_startup_64 = startup_64 - LOAD_OFFSET; - _text = .; /* Text and read-only data */ - .text : AT(ADDR(.text) - LOAD_OFFSET) { - *(.text) - SCHED_TEXT - LOCK_TEXT - *(.fixup) - *(.gnu.warning) - } = 0x9090 - /* out-of-line lock text */ - .text.lock : AT(ADDR(.text.lock) - LOAD_OFFSET) { *(.text.lock) } - - _etext = .; /* End of text section */ - - . = ALIGN(16); /* Exception table */ - __start___ex_table = .; - __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { *(__ex_table) } - __stop___ex_table = .; - - RODATA - - /* Data */ - .data : AT(ADDR(.data) - LOAD_OFFSET) { - *(.data) - CONSTRUCTORS - } - - _edata = .; /* End of data section */ - - __bss_start = .; /* BSS */ - .bss : AT(ADDR(.bss) - LOAD_OFFSET) { - *(.bss.page_aligned) - *(.bss) - } - __bss_end = .; - - . = ALIGN(PAGE_SIZE); - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); - .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) { - *(.data.cacheline_aligned) - } - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); - .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { - *(.data.read_mostly) - } - -#define VSYSCALL_ADDR (-10*1024*1024) -#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095)) -#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095)) - -#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR) -#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET) - -#define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR) -#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET) - - . = VSYSCALL_ADDR; - .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { *(.vsyscall_0) } - __vsyscall_0 = VSYSCALL_VIRT_ADDR; - - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); - .xtime_lock : AT(VLOAD(.xtime_lock)) { *(.xtime_lock) } - xtime_lock = VVIRT(.xtime_lock); - - .vxtime : AT(VLOAD(.vxtime)) { *(.vxtime) } - vxtime = VVIRT(.vxtime); - - .wall_jiffies : AT(VLOAD(.wall_jiffies)) { *(.wall_jiffies) } - wall_jiffies = VVIRT(.wall_jiffies); - - .sys_tz : AT(VLOAD(.sys_tz)) { *(.sys_tz) } - sys_tz = VVIRT(.sys_tz); - - .sysctl_vsyscall : AT(VLOAD(.sysctl_vsyscall)) { *(.sysctl_vsyscall) } - sysctl_vsyscall = VVIRT(.sysctl_vsyscall); - - .xtime : AT(VLOAD(.xtime)) { *(.xtime) } - xtime = VVIRT(.xtime); - - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); - .jiffies : AT(VLOAD(.jiffies)) { *(.jiffies) } - jiffies = VVIRT(.jiffies); - - .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) { *(.vsyscall_1) } - .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) { *(.vsyscall_2) } - .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) { *(.vsyscall_3) } - - . = VSYSCALL_VIRT_ADDR + 4096; - -#undef VSYSCALL_ADDR -#undef VSYSCALL_PHYS_ADDR -#undef VSYSCALL_VIRT_ADDR -#undef VLOAD_OFFSET -#undef VLOAD -#undef VVIRT_OFFSET -#undef VVIRT - - . = ALIGN(8192); /* init_task */ - .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { - *(.data.init_task) - } - - . = ALIGN(4096); - .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { - *(.data.page_aligned) - } - - . = ALIGN(4096); /* Init code and data */ - __init_begin = .; - .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { - _sinittext = .; - *(.init.text) - _einittext = .; - } - __initdata_begin = .; - .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { *(.init.data) } - __initdata_end = .; - . = ALIGN(16); - __setup_start = .; - .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { *(.init.setup) } - __setup_end = .; - __initcall_start = .; - .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { - *(.initcall1.init) - *(.initcall2.init) - *(.initcall3.init) - *(.initcall4.init) - *(.initcall5.init) - *(.initcall6.init) - *(.initcall7.init) - } - __initcall_end = .; - __con_initcall_start = .; - .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { - *(.con_initcall.init) - } - __con_initcall_end = .; - SECURITY_INIT - . = ALIGN(8); - __alt_instructions = .; - .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { - *(.altinstructions) - } - __alt_instructions_end = .; - .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { - *(.altinstr_replacement) - } - /* .exit.text is discard at runtime, not link time, to deal with references - from .altinstructions and .eh_frame */ - .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) } - .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { *(.exit.data) } - . = ALIGN(4096); - __initramfs_start = .; - .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { *(.init.ramfs) } - __initramfs_end = .; - . = ALIGN(32); - __per_cpu_start = .; - .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { *(.data.percpu) } - __per_cpu_end = .; - . = ALIGN(4096); - __init_end = .; - - . = ALIGN(4096); - __nosave_begin = .; - .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { *(.data.nosave) } - . = ALIGN(4096); - __nosave_end = .; - - _end = . ; - - /* Sections to be discarded */ - /DISCARD/ : { - *(.exitcall.exit) -#ifndef CONFIG_DEBUG_INFO - *(.eh_frame) -#endif - } - - /* DWARF 2 */ - .debug_info 0 : { *(.debug_info) } - .debug_abbrev 0 : { *(.debug_abbrev) } - .debug_line 0 : { *(.debug_line) } - .debug_frame 0 : { *(.debug_frame) } - .debug_str 0 : { *(.debug_str) } - .debug_loc 0 : { *(.debug_loc) } - .debug_macinfo 0 : { *(.debug_macinfo) } - /* SGI/MIPS DWARF 2 extensions */ - .debug_weaknames 0 : { *(.debug_weaknames) } - .debug_funcnames 0 : { *(.debug_funcnames) } - .debug_typenames 0 : { *(.debug_typenames) } - .debug_varnames 0 : { *(.debug_varnames) } - - - .comment 0 : { *(.comment) } -} diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c deleted file mode 100644 index 2e573442594..00000000000 --- a/arch/x86_64/kernel/vsyscall.c +++ /dev/null @@ -1,226 +0,0 @@ -/* - * linux/arch/x86_64/kernel/vsyscall.c - * - * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE - * Copyright 2003 Andi Kleen, SuSE Labs. - * - * Thanks to hpa@transmeta.com for some useful hint. - * Special thanks to Ingo Molnar for his early experience with - * a different vsyscall implementation for Linux/IA32 and for the name. - * - * vsyscall 1 is located at -10Mbyte, vsyscall 2 is located - * at virtual address -10Mbyte+1024bytes etc... There are at max 4 - * vsyscalls. One vsyscall can reserve more than 1 slot to avoid - * jumping out of line if necessary. We cannot add more with this - * mechanism because older kernels won't return -ENOSYS. - * If we want more than four we need a vDSO. - * - * Note: the concept clashes with user mode linux. If you use UML and - * want per guest time just set the kernel.vsyscall64 sysctl to 0. - */ - -#include <linux/time.h> -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/timer.h> -#include <linux/seqlock.h> -#include <linux/jiffies.h> -#include <linux/sysctl.h> - -#include <asm/vsyscall.h> -#include <asm/pgtable.h> -#include <asm/page.h> -#include <asm/fixmap.h> -#include <asm/errno.h> -#include <asm/io.h> - -#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) -#define force_inline __attribute__((always_inline)) inline - -int __sysctl_vsyscall __section_sysctl_vsyscall = 1; -seqlock_t __xtime_lock __section_xtime_lock = SEQLOCK_UNLOCKED; - -#include <asm/unistd.h> - -static force_inline void timeval_normalize(struct timeval * tv) -{ - time_t __sec; - - __sec = tv->tv_usec / 1000000; - if (__sec) { - tv->tv_usec %= 1000000; - tv->tv_sec += __sec; - } -} - -static force_inline void do_vgettimeofday(struct timeval * tv) -{ - long sequence, t; - unsigned long sec, usec; - - do { - sequence = read_seqbegin(&__xtime_lock); - - sec = __xtime.tv_sec; - usec = (__xtime.tv_nsec / 1000) + - (__jiffies - __wall_jiffies) * (1000000 / HZ); - - if (__vxtime.mode != VXTIME_HPET) { - sync_core(); - rdtscll(t); - if (t < __vxtime.last_tsc) - t = __vxtime.last_tsc; - usec += ((t - __vxtime.last_tsc) * - __vxtime.tsc_quot) >> 32; - /* See comment in x86_64 do_gettimeofday. */ - } else { - usec += ((readl((void *)fix_to_virt(VSYSCALL_HPET) + 0xf0) - - __vxtime.last) * __vxtime.quot) >> 32; - } - } while (read_seqretry(&__xtime_lock, sequence)); - - tv->tv_sec = sec + usec / 1000000; - tv->tv_usec = usec % 1000000; -} - -/* RED-PEN may want to readd seq locking, but then the variable should be write-once. */ -static force_inline void do_get_tz(struct timezone * tz) -{ - *tz = __sys_tz; -} - -static force_inline int gettimeofday(struct timeval *tv, struct timezone *tz) -{ - int ret; - asm volatile("vsysc2: syscall" - : "=a" (ret) - : "0" (__NR_gettimeofday),"D" (tv),"S" (tz) : __syscall_clobber ); - return ret; -} - -static force_inline long time_syscall(long *t) -{ - long secs; - asm volatile("vsysc1: syscall" - : "=a" (secs) - : "0" (__NR_time),"D" (t) : __syscall_clobber); - return secs; -} - -static int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz) -{ - if (unlikely(!__sysctl_vsyscall)) - return gettimeofday(tv,tz); - if (tv) - do_vgettimeofday(tv); - if (tz) - do_get_tz(tz); - return 0; -} - -/* This will break when the xtime seconds get inaccurate, but that is - * unlikely */ -static time_t __vsyscall(1) vtime(time_t *t) -{ - if (unlikely(!__sysctl_vsyscall)) - return time_syscall(t); - else if (t) - *t = __xtime.tv_sec; - return __xtime.tv_sec; -} - -static long __vsyscall(2) venosys_0(void) -{ - return -ENOSYS; -} - -static long __vsyscall(3) venosys_1(void) -{ - return -ENOSYS; -} - -#ifdef CONFIG_SYSCTL - -#define SYSCALL 0x050f -#define NOP2 0x9090 - -/* - * NOP out syscall in vsyscall page when not needed. - */ -static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - extern u16 vsysc1, vsysc2; - u16 *map1, *map2; - int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); - if (!write) - return ret; - /* gcc has some trouble with __va(__pa()), so just do it this - way. */ - map1 = ioremap(__pa_symbol(&vsysc1), 2); - if (!map1) - return -ENOMEM; - map2 = ioremap(__pa_symbol(&vsysc2), 2); - if (!map2) { - ret = -ENOMEM; - goto out; - } - if (!sysctl_vsyscall) { - *map1 = SYSCALL; - *map2 = SYSCALL; - } else { - *map1 = NOP2; - *map2 = NOP2; - } - iounmap(map2); -out: - iounmap(map1); - return ret; -} - -static int vsyscall_sysctl_nostrat(ctl_table *t, int __user *name, int nlen, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen, - void **context) -{ - return -ENOSYS; -} - -static ctl_table kernel_table2[] = { - { .ctl_name = 99, .procname = "vsyscall64", - .data = &sysctl_vsyscall, .maxlen = sizeof(int), .mode = 0644, - .strategy = vsyscall_sysctl_nostrat, - .proc_handler = vsyscall_sysctl_change }, - { 0, } -}; - -static ctl_table kernel_root_table2[] = { - { .ctl_name = CTL_KERN, .procname = "kernel", .mode = 0555, - .child = kernel_table2 }, - { 0 }, -}; - -#endif - -static void __init map_vsyscall(void) -{ - extern char __vsyscall_0; - unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0); - - __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL); -} - -static int __init vsyscall_init(void) -{ - BUG_ON(((unsigned long) &vgettimeofday != - VSYSCALL_ADDR(__NR_vgettimeofday))); - BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime)); - BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE))); - map_vsyscall(); -#ifdef CONFIG_SYSCTL - register_sysctl_table(kernel_root_table2, 0); -#endif - return 0; -} - -__initcall(vsyscall_init); diff --git a/arch/x86_64/kernel/x8664_ksyms.c b/arch/x86_64/kernel/x8664_ksyms.c deleted file mode 100644 index 68ec03070e5..00000000000 --- a/arch/x86_64/kernel/x8664_ksyms.c +++ /dev/null @@ -1,209 +0,0 @@ -#include <linux/config.h> -#include <linux/module.h> -#include <linux/smp.h> -#include <linux/user.h> -#include <linux/sched.h> -#include <linux/in6.h> -#include <linux/interrupt.h> -#include <linux/smp_lock.h> -#include <linux/pm.h> -#include <linux/pci.h> -#include <linux/apm_bios.h> -#include <linux/kernel.h> -#include <linux/string.h> -#include <linux/syscalls.h> -#include <linux/tty.h> -#include <linux/ioctl32.h> - -#include <asm/semaphore.h> -#include <asm/processor.h> -#include <asm/i387.h> -#include <asm/uaccess.h> -#include <asm/checksum.h> -#include <asm/io.h> -#include <asm/delay.h> -#include <asm/irq.h> -#include <asm/mmx.h> -#include <asm/desc.h> -#include <asm/pgtable.h> -#include <asm/pgalloc.h> -#include <asm/nmi.h> -#include <asm/kdebug.h> -#include <asm/unistd.h> -#include <asm/tlbflush.h> -#include <asm/kdebug.h> - -extern spinlock_t rtc_lock; - -#ifdef CONFIG_SMP -extern void __write_lock_failed(rwlock_t *rw); -extern void __read_lock_failed(rwlock_t *rw); -#endif - -#if defined(CONFIG_BLK_DEV_IDE) || defined(CONFIG_BLK_DEV_HD) || defined(CONFIG_BLK_DEV_IDE_MODULE) || defined(CONFIG_BLK_DEV_HD_MODULE) -extern struct drive_info_struct drive_info; -EXPORT_SYMBOL(drive_info); -#endif - -extern unsigned long get_cmos_time(void); - -/* platform dependent support */ -EXPORT_SYMBOL(boot_cpu_data); -//EXPORT_SYMBOL(dump_fpu); -EXPORT_SYMBOL(__ioremap); -EXPORT_SYMBOL(ioremap_nocache); -EXPORT_SYMBOL(iounmap); -EXPORT_SYMBOL(enable_irq); -EXPORT_SYMBOL(disable_irq); -EXPORT_SYMBOL(disable_irq_nosync); -EXPORT_SYMBOL(probe_irq_mask); -EXPORT_SYMBOL(kernel_thread); -EXPORT_SYMBOL(pm_idle); -EXPORT_SYMBOL(pm_power_off); -EXPORT_SYMBOL(get_cmos_time); - -EXPORT_SYMBOL(__down_failed); -EXPORT_SYMBOL(__down_failed_interruptible); -EXPORT_SYMBOL(__down_failed_trylock); -EXPORT_SYMBOL(__up_wakeup); -/* Networking helper routines. */ -EXPORT_SYMBOL(csum_partial_copy_nocheck); -EXPORT_SYMBOL(ip_compute_csum); -/* Delay loops */ -EXPORT_SYMBOL(__udelay); -EXPORT_SYMBOL(__ndelay); -EXPORT_SYMBOL(__delay); -EXPORT_SYMBOL(__const_udelay); - -EXPORT_SYMBOL(__get_user_1); -EXPORT_SYMBOL(__get_user_2); -EXPORT_SYMBOL(__get_user_4); -EXPORT_SYMBOL(__get_user_8); -EXPORT_SYMBOL(__put_user_1); -EXPORT_SYMBOL(__put_user_2); -EXPORT_SYMBOL(__put_user_4); -EXPORT_SYMBOL(__put_user_8); - -EXPORT_SYMBOL(strpbrk); -EXPORT_SYMBOL(strstr); - -EXPORT_SYMBOL(strncpy_from_user); -EXPORT_SYMBOL(__strncpy_from_user); -EXPORT_SYMBOL(clear_user); -EXPORT_SYMBOL(__clear_user); -EXPORT_SYMBOL(copy_user_generic); -EXPORT_SYMBOL(copy_from_user); -EXPORT_SYMBOL(copy_to_user); -EXPORT_SYMBOL(copy_in_user); -EXPORT_SYMBOL(strnlen_user); - -#ifdef CONFIG_PCI -EXPORT_SYMBOL(pci_alloc_consistent); -EXPORT_SYMBOL(pci_free_consistent); -#endif - -#ifdef CONFIG_PCI -EXPORT_SYMBOL(pci_mem_start); -#endif - -EXPORT_SYMBOL(copy_page); -EXPORT_SYMBOL(clear_page); - -EXPORT_SYMBOL(cpu_pda); -#ifdef CONFIG_SMP -EXPORT_SYMBOL(cpu_data); -EXPORT_SYMBOL(cpu_online_map); -EXPORT_SYMBOL(__write_lock_failed); -EXPORT_SYMBOL(__read_lock_failed); - -EXPORT_SYMBOL(synchronize_irq); -EXPORT_SYMBOL(smp_call_function); -EXPORT_SYMBOL(cpu_callout_map); -#endif - -#ifdef CONFIG_VT -EXPORT_SYMBOL(screen_info); -#endif - -EXPORT_SYMBOL(get_wchan); - -EXPORT_SYMBOL(rtc_lock); - -EXPORT_SYMBOL_GPL(set_nmi_callback); -EXPORT_SYMBOL_GPL(unset_nmi_callback); - -/* Export string functions. We normally rely on gcc builtin for most of these, - but gcc sometimes decides not to inline them. */ -#undef memcpy -#undef memset -#undef memmove -#undef memchr -#undef strlen -#undef strncmp -#undef strncpy -#undef strchr - -extern void * memset(void *,int,__kernel_size_t); -extern size_t strlen(const char *); -extern void * memmove(void * dest,const void *src,size_t count); -extern void *memchr(const void *s, int c, size_t n); -extern void * memcpy(void *,const void *,__kernel_size_t); -extern void * __memcpy(void *,const void *,__kernel_size_t); - -EXPORT_SYMBOL(memset); -EXPORT_SYMBOL(strlen); -EXPORT_SYMBOL(memmove); -EXPORT_SYMBOL(strncmp); -EXPORT_SYMBOL(strncpy); -EXPORT_SYMBOL(strchr); -EXPORT_SYMBOL(strncat); -EXPORT_SYMBOL(memchr); -EXPORT_SYMBOL(strrchr); -EXPORT_SYMBOL(strnlen); -EXPORT_SYMBOL(memscan); -EXPORT_SYMBOL(memcpy); -EXPORT_SYMBOL(__memcpy); - -#ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM -/* prototypes are wrong, these are assembly with custom calling functions */ -extern void rwsem_down_read_failed_thunk(void); -extern void rwsem_wake_thunk(void); -extern void rwsem_downgrade_thunk(void); -extern void rwsem_down_write_failed_thunk(void); -EXPORT_SYMBOL(rwsem_down_read_failed_thunk); -EXPORT_SYMBOL(rwsem_wake_thunk); -EXPORT_SYMBOL(rwsem_downgrade_thunk); -EXPORT_SYMBOL(rwsem_down_write_failed_thunk); -#endif - -EXPORT_SYMBOL(empty_zero_page); - -#ifdef CONFIG_HAVE_DEC_LOCK -EXPORT_SYMBOL(_atomic_dec_and_lock); -#endif - -EXPORT_SYMBOL(die_chain); -EXPORT_SYMBOL(register_die_notifier); - -#ifdef CONFIG_SMP -EXPORT_SYMBOL(cpu_sibling_map); -EXPORT_SYMBOL(smp_num_siblings); -#endif - -extern void do_softirq_thunk(void); -EXPORT_SYMBOL(do_softirq_thunk); - -#ifdef CONFIG_BUG -EXPORT_SYMBOL(out_of_line_bug); -#endif - -EXPORT_SYMBOL(init_level4_pgt); - -extern unsigned long __supported_pte_mask; -EXPORT_SYMBOL(__supported_pte_mask); - -#ifdef CONFIG_SMP -EXPORT_SYMBOL(flush_tlb_page); -#endif - -EXPORT_SYMBOL(cpu_khz); |
