Diffstat (limited to 'arch/i386')
34 files changed, 697 insertions, 319 deletions
diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index 5b1a7d46d1d..bfea1bedcbf 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -80,6 +80,7 @@ config X86_VOYAGER config X86_NUMAQ bool "NUMAQ (IBM/Sequent)" + select SMP select NUMA help This option is used for getting Linux to run on a (IBM/Sequent) NUMA @@ -400,6 +401,7 @@ choice config NOHIGHMEM bool "off" + depends on !X86_NUMAQ ---help--- Linux can use up to 64 Gigabytes of physical memory on x86 systems. However, the address space of 32-bit x86 processors is only 4 @@ -436,6 +438,7 @@ config NOHIGHMEM config HIGHMEM4G bool "4GB" + depends on !X86_NUMAQ help Select this if you have a 32-bit processor and between 1 and 4 gigabytes of physical RAM. @@ -503,10 +506,6 @@ config NUMA default n if X86_PC default y if (X86_NUMAQ || X86_SUMMIT) -# Need comments to help the hapless user trying to turn on NUMA support -comment "NUMA (NUMA-Q) requires SMP, 64GB highmem support" - depends on X86_NUMAQ && (!HIGHMEM64G || !SMP) - comment "NUMA (Summit) requires SMP, 64GB highmem support, ACPI" depends on X86_SUMMIT && (!HIGHMEM64G || !ACPI) @@ -660,13 +659,18 @@ config BOOT_IOREMAP default y config REGPARM - bool "Use register arguments (EXPERIMENTAL)" - depends on EXPERIMENTAL - default n + bool "Use register arguments" + default y help - Compile the kernel with -mregparm=3. This uses a different ABI - and passes the first three arguments of a function call in registers. - This will probably break binary only modules. + Compile the kernel with -mregparm=3. This instructs gcc to use + a more efficient function call ABI which passes the first three + arguments of a function call via registers, which results in denser + and faster code. + + If this option is disabled, then the default ABI of passing + arguments via the stack is used. + + If unsure, say Y. config SECCOMP bool "Enable seccomp to safely compute untrusted bytecode" diff --git a/arch/i386/Kconfig.debug b/arch/i386/Kconfig.debug index bf32ecc9ad0..00108ba9a78 100644 --- a/arch/i386/Kconfig.debug +++ b/arch/i386/Kconfig.debug @@ -31,6 +31,15 @@ config DEBUG_STACK_USAGE This option will slow down process creation somewhat. +config STACK_BACKTRACE_COLS + int "Stack backtraces per line" if DEBUG_KERNEL + range 1 3 + default 2 + help + Selects how many stack backtrace entries per line to display. + + This can save screen space when displaying traces. + comment "Page alloc debug is incompatible with Software Suspend on i386" depends on DEBUG_KERNEL && SOFTWARE_SUSPEND diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile index 65656c033d7..5b9ed21216c 100644 --- a/arch/i386/kernel/Makefile +++ b/arch/i386/kernel/Makefile @@ -7,7 +7,7 @@ extra-y := head.o init_task.o vmlinux.lds obj-y := process.o semaphore.o signal.o entry.o traps.o irq.o \ ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_i386.o \ pci-dma.o i386_ksyms.o i387.o dmi_scan.o bootflag.o \ - quirks.o i8237.o topology.o + quirks.o i8237.o topology.o alternative.o obj-y += cpu/ obj-y += timers/ diff --git a/arch/i386/kernel/alternative.c b/arch/i386/kernel/alternative.c new file mode 100644 index 00000000000..5cbd6f99fb2 --- /dev/null +++ b/arch/i386/kernel/alternative.c @@ -0,0 +1,321 @@ +#include <linux/module.h> +#include <linux/spinlock.h> +#include <linux/list.h> +#include <asm/alternative.h> +#include <asm/sections.h> + +#define DEBUG 0 +#if DEBUG +# define DPRINTK(fmt, args...) printk(fmt, args) +#else +# define DPRINTK(fmt, args...) 
+#endif + +/* Use inline assembly to define this because the nops are defined + as inline assembly strings in the include files and we cannot + get them easily into strings. */ +asm("\t.data\nintelnops: " + GENERIC_NOP1 GENERIC_NOP2 GENERIC_NOP3 GENERIC_NOP4 GENERIC_NOP5 GENERIC_NOP6 + GENERIC_NOP7 GENERIC_NOP8); +asm("\t.data\nk8nops: " + K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6 + K8_NOP7 K8_NOP8); +asm("\t.data\nk7nops: " + K7_NOP1 K7_NOP2 K7_NOP3 K7_NOP4 K7_NOP5 K7_NOP6 + K7_NOP7 K7_NOP8); + +extern unsigned char intelnops[], k8nops[], k7nops[]; +static unsigned char *intel_nops[ASM_NOP_MAX+1] = { + NULL, + intelnops, + intelnops + 1, + intelnops + 1 + 2, + intelnops + 1 + 2 + 3, + intelnops + 1 + 2 + 3 + 4, + intelnops + 1 + 2 + 3 + 4 + 5, + intelnops + 1 + 2 + 3 + 4 + 5 + 6, + intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7, +}; +static unsigned char *k8_nops[ASM_NOP_MAX+1] = { + NULL, + k8nops, + k8nops + 1, + k8nops + 1 + 2, + k8nops + 1 + 2 + 3, + k8nops + 1 + 2 + 3 + 4, + k8nops + 1 + 2 + 3 + 4 + 5, + k8nops + 1 + 2 + 3 + 4 + 5 + 6, + k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, +}; +static unsigned char *k7_nops[ASM_NOP_MAX+1] = { + NULL, + k7nops, + k7nops + 1, + k7nops + 1 + 2, + k7nops + 1 + 2 + 3, + k7nops + 1 + 2 + 3 + 4, + k7nops + 1 + 2 + 3 + 4 + 5, + k7nops + 1 + 2 + 3 + 4 + 5 + 6, + k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, +}; +static struct nop { + int cpuid; + unsigned char **noptable; +} noptypes[] = { + { X86_FEATURE_K8, k8_nops }, + { X86_FEATURE_K7, k7_nops }, + { -1, NULL } +}; + + +extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; +extern struct alt_instr __smp_alt_instructions[], __smp_alt_instructions_end[]; +extern u8 *__smp_locks[], *__smp_locks_end[]; + +extern u8 __smp_alt_begin[], __smp_alt_end[]; + + +static unsigned char** find_nop_table(void) +{ + unsigned char **noptable = intel_nops; + int i; + + for (i = 0; noptypes[i].cpuid >= 0; i++) { + if (boot_cpu_has(noptypes[i].cpuid)) { + noptable = noptypes[i].noptable; + break; + } + } + return noptable; +} + +/* Replace instructions with better alternatives for this CPU type. + This runs before SMP is initialized to avoid SMP problems with + self modifying code. This implies that assymetric systems where + APs have less capabilities than the boot processor are not handled. + Tough. Make sure you disable such features by hand. 
*/ + +void apply_alternatives(struct alt_instr *start, struct alt_instr *end) +{ + unsigned char **noptable = find_nop_table(); + struct alt_instr *a; + int diff, i, k; + + DPRINTK("%s: alt table %p -> %p\n", __FUNCTION__, start, end); + for (a = start; a < end; a++) { + BUG_ON(a->replacementlen > a->instrlen); + if (!boot_cpu_has(a->cpuid)) + continue; + memcpy(a->instr, a->replacement, a->replacementlen); + diff = a->instrlen - a->replacementlen; + /* Pad the rest with nops */ + for (i = a->replacementlen; diff > 0; diff -= k, i += k) { + k = diff; + if (k > ASM_NOP_MAX) + k = ASM_NOP_MAX; + memcpy(a->instr + i, noptable[k], k); + } + } +} + +static void alternatives_smp_save(struct alt_instr *start, struct alt_instr *end) +{ + struct alt_instr *a; + + DPRINTK("%s: alt table %p-%p\n", __FUNCTION__, start, end); + for (a = start; a < end; a++) { + memcpy(a->replacement + a->replacementlen, + a->instr, + a->instrlen); + } +} + +static void alternatives_smp_apply(struct alt_instr *start, struct alt_instr *end) +{ + struct alt_instr *a; + + for (a = start; a < end; a++) { + memcpy(a->instr, + a->replacement + a->replacementlen, + a->instrlen); + } +} + +static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end) +{ + u8 **ptr; + + for (ptr = start; ptr < end; ptr++) { + if (*ptr < text) + continue; + if (*ptr > text_end) + continue; + **ptr = 0xf0; /* lock prefix */ + }; +} + +static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end) +{ + unsigned char **noptable = find_nop_table(); + u8 **ptr; + + for (ptr = start; ptr < end; ptr++) { + if (*ptr < text) + continue; + if (*ptr > text_end) + continue; + **ptr = noptable[1][0]; + }; +} + +struct smp_alt_module { + /* what is this ??? */ + struct module *mod; + char *name; + + /* ptrs to lock prefixes */ + u8 **locks; + u8 **locks_end; + + /* .text segment, needed to avoid patching init code ;) */ + u8 *text; + u8 *text_end; + + struct list_head next; +}; +static LIST_HEAD(smp_alt_modules); +static DEFINE_SPINLOCK(smp_alt); + +static int smp_alt_once = 0; +static int __init bootonly(char *str) +{ + smp_alt_once = 1; + return 1; +} +__setup("smp-alt-boot", bootonly); + +void alternatives_smp_module_add(struct module *mod, char *name, + void *locks, void *locks_end, + void *text, void *text_end) +{ + struct smp_alt_module *smp; + unsigned long flags; + + if (smp_alt_once) { + if (boot_cpu_has(X86_FEATURE_UP)) + alternatives_smp_unlock(locks, locks_end, + text, text_end); + return; + } + + smp = kzalloc(sizeof(*smp), GFP_KERNEL); + if (NULL == smp) + return; /* we'll run the (safe but slow) SMP code then ... 
*/ + + smp->mod = mod; + smp->name = name; + smp->locks = locks; + smp->locks_end = locks_end; + smp->text = text; + smp->text_end = text_end; + DPRINTK("%s: locks %p -> %p, text %p -> %p, name %s\n", + __FUNCTION__, smp->locks, smp->locks_end, + smp->text, smp->text_end, smp->name); + + spin_lock_irqsave(&smp_alt, flags); + list_add_tail(&smp->next, &smp_alt_modules); + if (boot_cpu_has(X86_FEATURE_UP)) + alternatives_smp_unlock(smp->locks, smp->locks_end, + smp->text, smp->text_end); + spin_unlock_irqrestore(&smp_alt, flags); +} + +void alternatives_smp_module_del(struct module *mod) +{ + struct smp_alt_module *item; + unsigned long flags; + + if (smp_alt_once) + return; + + spin_lock_irqsave(&smp_alt, flags); + list_for_each_entry(item, &smp_alt_modules, next) { + if (mod != item->mod) + continue; + list_del(&item->next); + spin_unlock_irqrestore(&smp_alt, flags); + DPRINTK("%s: %s\n", __FUNCTION__, item->name); + kfree(item); + return; + } + spin_unlock_irqrestore(&smp_alt, flags); +} + +void alternatives_smp_switch(int smp) +{ + struct smp_alt_module *mod; + unsigned long flags; + + if (smp_alt_once) + return; + BUG_ON(!smp && (num_online_cpus() > 1)); + + spin_lock_irqsave(&smp_alt, flags); + if (smp) { + printk(KERN_INFO "SMP alternatives: switching to SMP code\n"); + clear_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability); + clear_bit(X86_FEATURE_UP, cpu_data[0].x86_capability); + alternatives_smp_apply(__smp_alt_instructions, + __smp_alt_instructions_end); + list_for_each_entry(mod, &smp_alt_modules, next) + alternatives_smp_lock(mod->locks, mod->locks_end, + mod->text, mod->text_end); + } else { + printk(KERN_INFO "SMP alternatives: switching to UP code\n"); + set_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability); + set_bit(X86_FEATURE_UP, cpu_data[0].x86_capability); + apply_alternatives(__smp_alt_instructions, + __smp_alt_instructions_end); + list_for_each_entry(mod, &smp_alt_modules, next) + alternatives_smp_unlock(mod->locks, mod->locks_end, + mod->text, mod->text_end); + } + spin_unlock_irqrestore(&smp_alt, flags); +} + +void __init alternative_instructions(void) +{ + apply_alternatives(__alt_instructions, __alt_instructions_end); + + /* switch to patch-once-at-boottime-only mode and free the + * tables in case we know the number of CPUs will never ever + * change */ +#ifdef CONFIG_HOTPLUG_CPU + if (num_possible_cpus() < 2) + smp_alt_once = 1; +#else + smp_alt_once = 1; +#endif + + if (smp_alt_once) { + if (1 == num_possible_cpus()) { + printk(KERN_INFO "SMP alternatives: switching to UP code\n"); + set_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability); + set_bit(X86_FEATURE_UP, cpu_data[0].x86_capability); + apply_alternatives(__smp_alt_instructions, + __smp_alt_instructions_end); + alternatives_smp_unlock(__smp_locks, __smp_locks_end, + _text, _etext); + } + free_init_pages("SMP alternatives", + (unsigned long)__smp_alt_begin, + (unsigned long)__smp_alt_end); + } else { + alternatives_smp_save(__smp_alt_instructions, + __smp_alt_instructions_end); + alternatives_smp_module_add(NULL, "core kernel", + __smp_locks, __smp_locks_end, + _text, _etext); + alternatives_smp_switch(0); + } +} diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c index 776c90989e0..eb5279d23b7 100644 --- a/arch/i386/kernel/apic.c +++ b/arch/i386/kernel/apic.c @@ -38,6 +38,7 @@ #include <asm/i8253.h> #include <mach_apic.h> +#include <mach_apicdef.h> #include <mach_ipi.h> #include "io_ports.h" diff --git a/arch/i386/kernel/cpu/centaur.c b/arch/i386/kernel/cpu/centaur.c index 
f52669ecb93..bd75629dd26 100644 --- a/arch/i386/kernel/cpu/centaur.c +++ b/arch/i386/kernel/cpu/centaur.c @@ -4,6 +4,7 @@ #include <asm/processor.h> #include <asm/msr.h> #include <asm/e820.h> +#include <asm/mtrr.h> #include "cpu.h" #ifdef CONFIG_X86_OOSTORE diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c index e6bd095ae10..7e3d6b6a4e9 100644 --- a/arch/i386/kernel/cpu/common.c +++ b/arch/i386/kernel/cpu/common.c @@ -25,9 +25,10 @@ EXPORT_PER_CPU_SYMBOL(cpu_gdt_descr); DEFINE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]); EXPORT_PER_CPU_SYMBOL(cpu_16bit_stack); -static int cachesize_override __devinitdata = -1; -static int disable_x86_fxsr __devinitdata = 0; -static int disable_x86_serial_nr __devinitdata = 1; +static int cachesize_override __cpuinitdata = -1; +static int disable_x86_fxsr __cpuinitdata; +static int disable_x86_serial_nr __cpuinitdata = 1; +static int disable_x86_sep __cpuinitdata; struct cpu_dev * cpu_devs[X86_VENDOR_NUM] = {}; @@ -59,7 +60,7 @@ static int __init cachesize_setup(char *str) } __setup("cachesize=", cachesize_setup); -int __devinit get_model_name(struct cpuinfo_x86 *c) +int __cpuinit get_model_name(struct cpuinfo_x86 *c) { unsigned int *v; char *p, *q; @@ -89,7 +90,7 @@ int __devinit get_model_name(struct cpuinfo_x86 *c) } -void __devinit display_cacheinfo(struct cpuinfo_x86 *c) +void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) { unsigned int n, dummy, ecx, edx, l2size; @@ -130,7 +131,7 @@ void __devinit display_cacheinfo(struct cpuinfo_x86 *c) /* in particular, if CPUID levels 0x80000002..4 are supported, this isn't used */ /* Look up CPU names by table lookup. */ -static char __devinit *table_lookup_model(struct cpuinfo_x86 *c) +static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c) { struct cpu_model_info *info; @@ -151,7 +152,7 @@ static char __devinit *table_lookup_model(struct cpuinfo_x86 *c) } -static void __devinit get_cpu_vendor(struct cpuinfo_x86 *c, int early) +static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c, int early) { char *v = c->x86_vendor_id; int i; @@ -187,6 +188,14 @@ static int __init x86_fxsr_setup(char * s) __setup("nofxsr", x86_fxsr_setup); +static int __init x86_sep_setup(char * s) +{ + disable_x86_sep = 1; + return 1; +} +__setup("nosep", x86_sep_setup); + + /* Standard macro to see if a specific flag is changeable */ static inline int flag_is_changeable_p(u32 flag) { @@ -210,7 +219,7 @@ static inline int flag_is_changeable_p(u32 flag) /* Probe for the CPUID instruction */ -static int __devinit have_cpuid_p(void) +static int __cpuinit have_cpuid_p(void) { return flag_is_changeable_p(X86_EFLAGS_ID); } @@ -254,7 +263,7 @@ static void __init early_cpu_detect(void) } } -void __devinit generic_identify(struct cpuinfo_x86 * c) +void __cpuinit generic_identify(struct cpuinfo_x86 * c) { u32 tfms, xlvl; int junk; @@ -307,7 +316,7 @@ void __devinit generic_identify(struct cpuinfo_x86 * c) #endif } -static void __devinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c) +static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c) { if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr ) { /* Disable processor serial number */ @@ -335,7 +344,7 @@ __setup("serialnumber", x86_serial_nr_setup); /* * This does the hard work of actually picking apart the CPU stuff... 
*/ -void __devinit identify_cpu(struct cpuinfo_x86 *c) +void __cpuinit identify_cpu(struct cpuinfo_x86 *c) { int i; @@ -405,6 +414,10 @@ void __devinit identify_cpu(struct cpuinfo_x86 *c) clear_bit(X86_FEATURE_XMM, c->x86_capability); } + /* SEP disabled? */ + if (disable_x86_sep) + clear_bit(X86_FEATURE_SEP, c->x86_capability); + if (disable_pse) clear_bit(X86_FEATURE_PSE, c->x86_capability); @@ -417,7 +430,7 @@ void __devinit identify_cpu(struct cpuinfo_x86 *c) else /* Last resort... */ sprintf(c->x86_model_id, "%02x/%02x", - c->x86_vendor, c->x86_model); + c->x86, c->x86_model); } /* Now the feature flags better reflect actual CPU features! */ @@ -453,7 +466,7 @@ void __devinit identify_cpu(struct cpuinfo_x86 *c) } #ifdef CONFIG_X86_HT -void __devinit detect_ht(struct cpuinfo_x86 *c) +void __cpuinit detect_ht(struct cpuinfo_x86 *c) { u32 eax, ebx, ecx, edx; int index_msb, core_bits; @@ -500,7 +513,7 @@ void __devinit detect_ht(struct cpuinfo_x86 *c) } #endif -void __devinit print_cpu_info(struct cpuinfo_x86 *c) +void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) { char *vendor = NULL; @@ -523,7 +536,7 @@ void __devinit print_cpu_info(struct cpuinfo_x86 *c) printk("\n"); } -cpumask_t cpu_initialized __devinitdata = CPU_MASK_NONE; +cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; /* This is hacky. :) * We're emulating future behavior. @@ -570,7 +583,7 @@ void __init early_cpu_init(void) * and IDT. We reload them nevertheless, this function acts as a * 'CPU state barrier', nothing should get across. */ -void __devinit cpu_init(void) +void __cpuinit cpu_init(void) { int cpu = smp_processor_id(); struct tss_struct * t = &per_cpu(init_tss, cpu); @@ -670,7 +683,7 @@ void __devinit cpu_init(void) } #ifdef CONFIG_HOTPLUG_CPU -void __devinit cpu_uninit(void) +void __cpuinit cpu_uninit(void) { int cpu = raw_smp_processor_id(); cpu_clear(cpu, cpu_initialized); diff --git a/arch/i386/kernel/cpu/cpufreq/powernow-k8.c b/arch/i386/kernel/cpu/cpufreq/powernow-k8.c index e11a09207ec..3d5110b65cc 100644 --- a/arch/i386/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/i386/kernel/cpu/cpufreq/powernow-k8.c @@ -1145,9 +1145,7 @@ static int __cpuinit powernowk8_init(void) { unsigned int i, supported_cpus = 0; - for (i=0; i<NR_CPUS; i++) { - if (!cpu_online(i)) - continue; + for_each_cpu(i) { if (check_supported_cpu(i)) supported_cpus++; } diff --git a/arch/i386/kernel/cpu/intel.c b/arch/i386/kernel/cpu/intel.c index 8c0120186b9..5386b29bb5a 100644 --- a/arch/i386/kernel/cpu/intel.c +++ b/arch/i386/kernel/cpu/intel.c @@ -29,7 +29,7 @@ extern int trap_init_f00f_bug(void); struct movsl_mask movsl_mask __read_mostly; #endif -void __devinit early_intel_workaround(struct cpuinfo_x86 *c) +void __cpuinit early_intel_workaround(struct cpuinfo_x86 *c) { if (c->x86_vendor != X86_VENDOR_INTEL) return; @@ -44,7 +44,7 @@ void __devinit early_intel_workaround(struct cpuinfo_x86 *c) * This is called before we do cpu ident work */ -int __devinit ppro_with_ram_bug(void) +int __cpuinit ppro_with_ram_bug(void) { /* Uses data from early_cpu_detect now */ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && @@ -62,7 +62,7 @@ int __devinit ppro_with_ram_bug(void) * P4 Xeon errata 037 workaround. * Hardware prefetcher may cause stale data to be loaded into the cache. 
*/ -static void __devinit Intel_errata_workarounds(struct cpuinfo_x86 *c) +static void __cpuinit Intel_errata_workarounds(struct cpuinfo_x86 *c) { unsigned long lo, hi; @@ -81,7 +81,7 @@ static void __devinit Intel_errata_workarounds(struct cpuinfo_x86 *c) /* * find out the number of processor cores on the die */ -static int __devinit num_cpu_cores(struct cpuinfo_x86 *c) +static int __cpuinit num_cpu_cores(struct cpuinfo_x86 *c) { unsigned int eax, ebx, ecx, edx; @@ -96,7 +96,7 @@ static int __devinit num_cpu_cores(struct cpuinfo_x86 *c) return 1; } -static void __devinit init_intel(struct cpuinfo_x86 *c) +static void __cpuinit init_intel(struct cpuinfo_x86 *c) { unsigned int l2 = 0; char *p = NULL; @@ -205,7 +205,7 @@ static unsigned int intel_size_cache(struct cpuinfo_x86 * c, unsigned int size) return size; } -static struct cpu_dev intel_cpu_dev __devinitdata = { +static struct cpu_dev intel_cpu_dev __cpuinitdata = { .c_vendor = "Intel", .c_ident = { "GenuineIntel" }, .c_models = { diff --git a/arch/i386/kernel/cpu/intel_cacheinfo.c b/arch/i386/kernel/cpu/intel_cacheinfo.c index ffe58cee0c4..ce61921369e 100644 --- a/arch/i386/kernel/cpu/intel_cacheinfo.c +++ b/arch/i386/kernel/cpu/intel_cacheinfo.c @@ -174,7 +174,7 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */ unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */ - if (c->cpuid_level > 4) { + if (c->cpuid_level > 3) { static int is_initialized; if (is_initialized == 0) { @@ -330,7 +330,7 @@ static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) } } } -static void __devinit cache_remove_shared_cpu_map(unsigned int cpu, int index) +static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index) { struct _cpuid4_info *this_leaf, *sibling_leaf; int sibling; diff --git a/arch/i386/kernel/cpu/proc.c b/arch/i386/kernel/cpu/proc.c index 89a85af33d2..5cfbd801169 100644 --- a/arch/i386/kernel/cpu/proc.c +++ b/arch/i386/kernel/cpu/proc.c @@ -40,7 +40,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) /* Other (Linux-defined) */ "cxmmx", "k6_mtrr", "cyrix_arr", "centaur_mcr", NULL, NULL, NULL, NULL, - "constant_tsc", NULL, NULL, NULL, NULL, NULL, NULL, NULL, + "constant_tsc", "up", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, diff --git a/arch/i386/kernel/crash.c b/arch/i386/kernel/crash.c index d49dbe8dc96..e3c5fca0aa8 100644 --- a/arch/i386/kernel/crash.c +++ b/arch/i386/kernel/crash.c @@ -105,7 +105,7 @@ static int crash_nmi_callback(struct pt_regs *regs, int cpu) return 1; local_irq_disable(); - if (!user_mode(regs)) { + if (!user_mode_vm(regs)) { crash_fixup_ss_esp(&fixed_regs, regs); regs = &fixed_regs; } diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index 4d704724b2f..cfc683f153b 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -226,6 +226,10 @@ ENTRY(system_call) pushl %eax # save orig_eax SAVE_ALL GET_THREAD_INFO(%ebp) + testl $TF_MASK,EFLAGS(%esp) + jz no_singlestep + orl $_TIF_SINGLESTEP,TI_flags(%ebp) +no_singlestep: # system call tracing in operation / emulation /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S index e0b7c632efb..3debc2e2654 100644 --- 
a/arch/i386/kernel/head.S +++ b/arch/i386/kernel/head.S @@ -450,7 +450,6 @@ int_msg: .globl boot_gdt_descr .globl idt_descr -.globl cpu_gdt_descr ALIGN # early boot GDT descriptor (must use 1:1 address mapping) @@ -470,8 +469,6 @@ cpu_gdt_descr: .word GDT_ENTRIES*8-1 .long cpu_gdt_table - .fill NR_CPUS-1,8,0 # space for the other GDT descriptors - /* * The boot_gdt_table must mirror the equivalent in setup.S and is * used only for booting. @@ -485,7 +482,7 @@ ENTRY(boot_gdt_table) /* * The Global Descriptor Table contains 28 quadwords, per-CPU. */ - .align PAGE_SIZE_asm + .align L1_CACHE_BYTES ENTRY(cpu_gdt_table) .quad 0x0000000000000000 /* NULL descriptor */ .quad 0x0000000000000000 /* 0x0b reserved */ diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c index 39d9a5fa907..311b4e7266f 100644 --- a/arch/i386/kernel/io_apic.c +++ b/arch/i386/kernel/io_apic.c @@ -351,8 +351,8 @@ static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold) { int i, j; Dprintk("Rotating IRQs among CPUs.\n"); - for (i = 0; i < NR_CPUS; i++) { - for (j = 0; cpu_online(i) && (j < NR_IRQS); j++) { + for_each_online_cpu(i) { + for (j = 0; j < NR_IRQS; j++) { if (!irq_desc[j].action) continue; /* Is it a significant load ? */ @@ -381,7 +381,7 @@ static void do_irq_balance(void) unsigned long imbalance = 0; cpumask_t allowed_mask, target_cpu_mask, tmp; - for (i = 0; i < NR_CPUS; i++) { + for_each_cpu(i) { int package_index; CPU_IRQ(i) = 0; if (!cpu_online(i)) @@ -422,9 +422,7 @@ static void do_irq_balance(void) } } /* Find the least loaded processor package */ - for (i = 0; i < NR_CPUS; i++) { - if (!cpu_online(i)) - continue; + for_each_online_cpu(i) { if (i != CPU_TO_PACKAGEINDEX(i)) continue; if (min_cpu_irq > CPU_IRQ(i)) { @@ -441,9 +439,7 @@ tryanothercpu: */ tmp_cpu_irq = 0; tmp_loaded = -1; - for (i = 0; i < NR_CPUS; i++) { - if (!cpu_online(i)) - continue; + for_each_online_cpu(i) { if (i != CPU_TO_PACKAGEINDEX(i)) continue; if (max_cpu_irq <= CPU_IRQ(i)) @@ -619,9 +615,7 @@ static int __init balanced_irq_init(void) if (smp_num_siblings > 1 && !cpus_empty(tmp)) physical_balance = 1; - for (i = 0; i < NR_CPUS; i++) { - if (!cpu_online(i)) - continue; + for_each_online_cpu(i) { irq_cpu_data[i].irq_delta = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL); irq_cpu_data[i].last_irq = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL); if (irq_cpu_data[i].irq_delta == NULL || irq_cpu_data[i].last_irq == NULL) { @@ -638,9 +632,11 @@ static int __init balanced_irq_init(void) else printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq"); failed: - for (i = 0; i < NR_CPUS; i++) { + for_each_cpu(i) { kfree(irq_cpu_data[i].irq_delta); + irq_cpu_data[i].irq_delta = NULL; kfree(irq_cpu_data[i].last_irq); + irq_cpu_data[i].last_irq = NULL; } return 0; } @@ -1761,7 +1757,8 @@ static void __init setup_ioapic_ids_from_mpc(void) * Don't check I/O APIC IDs for xAPIC systems. They have * no meaning without the serial APIC bus. 
*/ - if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && boot_cpu_data.x86 < 15)) + if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + || APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) return; /* * This is broken; anything with a real cpu count has to diff --git a/arch/i386/kernel/kprobes.c b/arch/i386/kernel/kprobes.c index 694a1399763..7a59050242a 100644 --- a/arch/i386/kernel/kprobes.c +++ b/arch/i386/kernel/kprobes.c @@ -84,9 +84,9 @@ void __kprobes arch_disarm_kprobe(struct kprobe *p) void __kprobes arch_remove_kprobe(struct kprobe *p) { - down(&kprobe_mutex); + mutex_lock(&kprobe_mutex); free_insn_slot(p->ainsn.insn); - up(&kprobe_mutex); + mutex_unlock(&kprobe_mutex); } static inline void save_previous_kprobe(struct kprobe_ctlblk *kcb) diff --git a/arch/i386/kernel/module.c b/arch/i386/kernel/module.c index 5149c8a621f..470cf97e7cd 100644 --- a/arch/i386/kernel/module.c +++ b/arch/i386/kernel/module.c @@ -104,26 +104,38 @@ int apply_relocate_add(Elf32_Shdr *sechdrs, return -ENOEXEC; } -extern void apply_alternatives(void *start, void *end); - int module_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, struct module *me) { - const Elf_Shdr *s; + const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL; char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; - /* look for .altinstructions to patch */ for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { - void *seg; - if (strcmp(".altinstructions", secstrings + s->sh_name)) - continue; - seg = (void *)s->sh_addr; - apply_alternatives(seg, seg + s->sh_size); - } + if (!strcmp(".text", secstrings + s->sh_name)) + text = s; + if (!strcmp(".altinstructions", secstrings + s->sh_name)) + alt = s; + if (!strcmp(".smp_locks", secstrings + s->sh_name)) + locks= s; + } + + if (alt) { + /* patch .altinstructions */ + void *aseg = (void *)alt->sh_addr; + apply_alternatives(aseg, aseg + alt->sh_size); + } + if (locks && text) { + void *lseg = (void *)locks->sh_addr; + void *tseg = (void *)text->sh_addr; + alternatives_smp_module_add(me, me->name, + lseg, lseg + locks->sh_size, + tseg, tseg + text->sh_size); + } return 0; } void module_arch_cleanup(struct module *mod) { + alternatives_smp_module_del(mod); } diff --git a/arch/i386/kernel/mpparse.c b/arch/i386/kernel/mpparse.c index e6e2f43db85..8d8aa9d1796 100644 --- a/arch/i386/kernel/mpparse.c +++ b/arch/i386/kernel/mpparse.c @@ -828,6 +828,8 @@ void __init find_smp_config (void) smp_scan_config(address, 0x400); } +int es7000_plat; + /* -------------------------------------------------------------------------- ACPI-based MP Configuration -------------------------------------------------------------------------- */ @@ -935,7 +937,8 @@ void __init mp_register_ioapic ( mp_ioapics[idx].mpc_apicaddr = address; set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); - if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 < 15)) + if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + && !APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) tmpid = io_apic_get_unique_id(idx, id); else tmpid = id; @@ -1011,8 +1014,6 @@ void __init mp_override_legacy_irq ( return; } -int es7000_plat; - void __init mp_config_acpi_legacy_irqs (void) { struct mpc_config_intsrc intsrc; diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c index be87c5e2ee9..9074818b947 100644 --- a/arch/i386/kernel/nmi.c +++ b/arch/i386/kernel/nmi.c @@ -143,7 +143,7 @@ static int __init check_nmi_watchdog(void) local_irq_enable(); mdelay((10*1000)/nmi_hz); // wait 10 ticks - for (cpu = 0; 
cpu < NR_CPUS; cpu++) { + for_each_cpu(cpu) { #ifdef CONFIG_SMP /* Check cpu_callin_map here because that is set after the timer is started. */ @@ -510,7 +510,7 @@ void touch_nmi_watchdog (void) * Just reset the alert counters, (other CPUs might be * spinning on locks we hold): */ - for (i = 0; i < NR_CPUS; i++) + for_each_cpu(i) alert_counter[i] = 0; /* @@ -543,7 +543,7 @@ void nmi_watchdog_tick (struct pt_regs * regs) /* * die_nmi will return ONLY if NOTIFY_STOP happens.. */ - die_nmi(regs, "NMI Watchdog detected LOCKUP"); + die_nmi(regs, "BUG: NMI Watchdog detected LOCKUP"); } else { last_irq_sums[cpu] = sum; alert_counter[cpu] = 0; diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c index 0480454ebff..299e6167408 100644 --- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -295,7 +295,7 @@ void show_regs(struct pt_regs * regs) printk("EIP: %04x:[<%08lx>] CPU: %d\n",0xffff & regs->xcs,regs->eip, smp_processor_id()); print_symbol("EIP is at %s\n", regs->eip); - if (user_mode(regs)) + if (user_mode_vm(regs)) printk(" ESP: %04x:%08lx",0xffff & regs->xss,regs->esp); printk(" EFLAGS: %08lx %s (%s %.*s)\n", regs->eflags, print_tainted(), system_utsname.release, diff --git a/arch/i386/kernel/ptrace.c b/arch/i386/kernel/ptrace.c index 5c1fb6aada5..506462ef36a 100644 --- a/arch/i386/kernel/ptrace.c +++ b/arch/i386/kernel/ptrace.c @@ -34,10 +34,10 @@ /* * Determines which flags the user has access to [1 = access, 0 = no access]. - * Prohibits changing ID(21), VIP(20), VIF(19), VM(17), IOPL(12-13), IF(9). + * Prohibits changing ID(21), VIP(20), VIF(19), VM(17), NT(14), IOPL(12-13), IF(9). * Also masks reserved bits (31-22, 15, 5, 3, 1). */ -#define FLAG_MASK 0x00054dd5 +#define FLAG_MASK 0x00050dd5 /* set's the trap flag. */ #define TRAP_FLAG 0x100 diff --git a/arch/i386/kernel/semaphore.c b/arch/i386/kernel/semaphore.c index 7455ab64394..967dc74df9e 100644 --- a/arch/i386/kernel/semaphore.c +++ b/arch/i386/kernel/semaphore.c @@ -110,11 +110,11 @@ asm( ".align 4\n" ".globl __write_lock_failed\n" "__write_lock_failed:\n\t" - LOCK "addl $" RW_LOCK_BIAS_STR ",(%eax)\n" + LOCK_PREFIX "addl $" RW_LOCK_BIAS_STR ",(%eax)\n" "1: rep; nop\n\t" "cmpl $" RW_LOCK_BIAS_STR ",(%eax)\n\t" "jne 1b\n\t" - LOCK "subl $" RW_LOCK_BIAS_STR ",(%eax)\n\t" + LOCK_PREFIX "subl $" RW_LOCK_BIAS_STR ",(%eax)\n\t" "jnz __write_lock_failed\n\t" "ret" ); @@ -124,11 +124,11 @@ asm( ".align 4\n" ".globl __read_lock_failed\n" "__read_lock_failed:\n\t" - LOCK "incl (%eax)\n" + LOCK_PREFIX "incl (%eax)\n" "1: rep; nop\n\t" "cmpl $1,(%eax)\n\t" "js 1b\n\t" - LOCK "decl (%eax)\n\t" + LOCK_PREFIX "decl (%eax)\n\t" "js __read_lock_failed\n\t" "ret" ); diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c index ab62a9f4701..2d8782960f4 100644 --- a/arch/i386/kernel/setup.c +++ b/arch/i386/kernel/setup.c @@ -1377,101 +1377,6 @@ static void __init register_memory(void) pci_mem_start, gapstart, gapsize); } -/* Use inline assembly to define this because the nops are defined - as inline assembly strings in the include files and we cannot - get them easily into strings. 
*/ -asm("\t.data\nintelnops: " - GENERIC_NOP1 GENERIC_NOP2 GENERIC_NOP3 GENERIC_NOP4 GENERIC_NOP5 GENERIC_NOP6 - GENERIC_NOP7 GENERIC_NOP8); -asm("\t.data\nk8nops: " - K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6 - K8_NOP7 K8_NOP8); -asm("\t.data\nk7nops: " - K7_NOP1 K7_NOP2 K7_NOP3 K7_NOP4 K7_NOP5 K7_NOP6 - K7_NOP7 K7_NOP8); - -extern unsigned char intelnops[], k8nops[], k7nops[]; -static unsigned char *intel_nops[ASM_NOP_MAX+1] = { - NULL, - intelnops, - intelnops + 1, - intelnops + 1 + 2, - intelnops + 1 + 2 + 3, - intelnops + 1 + 2 + 3 + 4, - intelnops + 1 + 2 + 3 + 4 + 5, - intelnops + 1 + 2 + 3 + 4 + 5 + 6, - intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7, -}; -static unsigned char *k8_nops[ASM_NOP_MAX+1] = { - NULL, - k8nops, - k8nops + 1, - k8nops + 1 + 2, - k8nops + 1 + 2 + 3, - k8nops + 1 + 2 + 3 + 4, - k8nops + 1 + 2 + 3 + 4 + 5, - k8nops + 1 + 2 + 3 + 4 + 5 + 6, - k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, -}; -static unsigned char *k7_nops[ASM_NOP_MAX+1] = { - NULL, - k7nops, - k7nops + 1, - k7nops + 1 + 2, - k7nops + 1 + 2 + 3, - k7nops + 1 + 2 + 3 + 4, - k7nops + 1 + 2 + 3 + 4 + 5, - k7nops + 1 + 2 + 3 + 4 + 5 + 6, - k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, -}; -static struct nop { - int cpuid; - unsigned char **noptable; -} noptypes[] = { - { X86_FEATURE_K8, k8_nops }, - { X86_FEATURE_K7, k7_nops }, - { -1, NULL } -}; - -/* Replace instructions with better alternatives for this CPU type. - - This runs before SMP is initialized to avoid SMP problems with - self modifying code. This implies that assymetric systems where - APs have less capabilities than the boot processor are not handled. - Tough. Make sure you disable such features by hand. */ -void apply_alternatives(void *start, void *end) -{ - struct alt_instr *a; - int diff, i, k; - unsigned char **noptable = intel_nops; - for (i = 0; noptypes[i].cpuid >= 0; i++) { - if (boot_cpu_has(noptypes[i].cpuid)) { - noptable = noptypes[i].noptable; - break; - } - } - for (a = start; (void *)a < end; a++) { - if (!boot_cpu_has(a->cpuid)) - continue; - BUG_ON(a->replacementlen > a->instrlen); - memcpy(a->instr, a->replacement, a->replacementlen); - diff = a->instrlen - a->replacementlen; - /* Pad the rest with nops */ - for (i = a->replacementlen; diff > 0; diff -= k, i += k) { - k = diff; - if (k > ASM_NOP_MAX) - k = ASM_NOP_MAX; - memcpy(a->instr + i, noptable[k], k); - } - } -} - -void __init alternative_instructions(void) -{ - extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; - apply_alternatives(__alt_instructions, __alt_instructions_end); -} - static char * __init machine_specific_memory_setup(void); #ifdef CONFIG_MCA @@ -1554,6 +1459,16 @@ void __init setup_arch(char **cmdline_p) parse_cmdline_early(cmdline_p); +#ifdef CONFIG_EARLY_PRINTK + { + char *s = strstr(*cmdline_p, "earlyprintk="); + if (s) { + setup_early_printk(strchr(s, '=') + 1); + printk("early console enabled\n"); + } + } +#endif + max_low_pfn = setup_memory(); /* @@ -1578,19 +1493,6 @@ void __init setup_arch(char **cmdline_p) * NOTE: at this point the bootmem allocator is fully available. 
*/ -#ifdef CONFIG_EARLY_PRINTK - { - char *s = strstr(*cmdline_p, "earlyprintk="); - if (s) { - extern void setup_early_printk(char *); - - setup_early_printk(strchr(s, '=') + 1); - printk("early console enabled\n"); - } - } -#endif - - dmi_scan_machine(); #ifdef CONFIG_X86_GENERICARCH diff --git a/arch/i386/kernel/signal.c b/arch/i386/kernel/signal.c index 963616d364e..5c352c3a9e7 100644 --- a/arch/i386/kernel/signal.c +++ b/arch/i386/kernel/signal.c @@ -123,7 +123,8 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, int *peax err |= __get_user(tmp, &sc->seg); \ loadsegment(seg,tmp); } -#define FIX_EFLAGS (X86_EFLAGS_AC | X86_EFLAGS_OF | X86_EFLAGS_DF | \ +#define FIX_EFLAGS (X86_EFLAGS_AC | X86_EFLAGS_RF | \ + X86_EFLAGS_OF | X86_EFLAGS_DF | \ X86_EFLAGS_TF | X86_EFLAGS_SF | X86_EFLAGS_ZF | \ X86_EFLAGS_AF | X86_EFLAGS_PF | X86_EFLAGS_CF) @@ -582,9 +583,6 @@ static void fastcall do_signal(struct pt_regs *regs) if (!user_mode(regs)) return; - if (try_to_freeze()) - goto no_signal; - if (test_thread_flag(TIF_RESTORE_SIGMASK)) oldset = ¤t->saved_sigmask; else @@ -613,7 +611,6 @@ static void fastcall do_signal(struct pt_regs *regs) return; } -no_signal: /* Did we come from a system call? */ if (regs->orig_eax >= 0) { /* Restart the system call - no handlers present */ diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index 7007e178379..4c470e99a74 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -899,6 +899,7 @@ static int __devinit do_boot_cpu(int apicid, int cpu) unsigned short nmi_high = 0, nmi_low = 0; ++cpucount; + alternatives_smp_switch(1); /* * We can't use kernel_thread since we must avoid to @@ -1368,6 +1369,8 @@ void __cpu_die(unsigned int cpu) /* They ack this in play_dead by setting CPU_DEAD */ if (per_cpu(cpu_state, cpu) == CPU_DEAD) { printk ("CPU %d is now offline\n", cpu); + if (1 == num_online_cpus()) + alternatives_smp_switch(0); return; } msleep(100); diff --git a/arch/i386/kernel/topology.c b/arch/i386/kernel/topology.c index 67a0e1baa28..296355292c7 100644 --- a/arch/i386/kernel/topology.c +++ b/arch/i386/kernel/topology.c @@ -41,6 +41,15 @@ int arch_register_cpu(int num){ parent = &node_devices[node].node; #endif /* CONFIG_NUMA */ + /* + * CPU0 cannot be offlined due to several + * restrictions and assumptions in kernel. This basically + * doesnt add a control file, one cannot attempt to offline + * BSP. + */ + if (!num) + cpu_devices[num].cpu.no_control = 1; + return register_cpu(&cpu_devices[num].cpu, num, parent); } diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index b814dbdcc91..de5386b01d3 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -99,6 +99,8 @@ int register_die_notifier(struct notifier_block *nb) { int err = 0; unsigned long flags; + + vmalloc_sync_all(); spin_lock_irqsave(&die_notifier_lock, flags); err = notifier_chain_register(&i386die_chain, nb); spin_unlock_irqrestore(&die_notifier_lock, flags); @@ -112,12 +114,30 @@ static inline int valid_stack_ptr(struct thread_info *tinfo, void *p) p < (void *)tinfo + THREAD_SIZE - 3; } -static void print_addr_and_symbol(unsigned long addr, char *log_lvl) +/* + * Print CONFIG_STACK_BACKTRACE_COLS address/symbol entries per line. 
+ */ +static inline int print_addr_and_symbol(unsigned long addr, char *log_lvl, + int printed) { - printk(log_lvl); + if (!printed) + printk(log_lvl); + +#if CONFIG_STACK_BACKTRACE_COLS == 1 printk(" [<%08lx>] ", addr); +#else + printk(" <%08lx> ", addr); +#endif print_symbol("%s", addr); - printk("\n"); + + printed = (printed + 1) % CONFIG_STACK_BACKTRACE_COLS; + + if (printed) + printk(" "); + else + printk("\n"); + + return printed; } static inline unsigned long print_context_stack(struct thread_info *tinfo, @@ -125,20 +145,24 @@ static inline unsigned long print_context_stack(struct thread_info *tinfo, char *log_lvl) { unsigned long addr; + int printed = 0; /* nr of entries already printed on current line */ #ifdef CONFIG_FRAME_POINTER while (valid_stack_ptr(tinfo, (void *)ebp)) { addr = *(unsigned long *)(ebp + 4); - print_addr_and_symbol(addr, log_lvl); + printed = print_addr_and_symbol(addr, log_lvl, printed); ebp = *(unsigned long *)ebp; } #else while (valid_stack_ptr(tinfo, stack)) { addr = *stack++; if (__kernel_text_address(addr)) - print_addr_and_symbol(addr, log_lvl); + printed = print_addr_and_symbol(addr, log_lvl, printed); } #endif + if (printed) + printk("\n"); + return ebp; } @@ -166,8 +190,7 @@ static void show_trace_log_lvl(struct task_struct *task, stack = (unsigned long*)context->previous_esp; if (!stack) break; - printk(log_lvl); - printk(" =======================\n"); + printk("%s =======================\n", log_lvl); } } @@ -194,21 +217,17 @@ static void show_stack_log_lvl(struct task_struct *task, unsigned long *esp, for(i = 0; i < kstack_depth_to_print; i++) { if (kstack_end(stack)) break; - if (i && ((i % 8) == 0)) { - printk("\n"); - printk(log_lvl); - printk(" "); - } + if (i && ((i % 8) == 0)) + printk("\n%s ", log_lvl); printk("%08lx ", *stack++); } - printk("\n"); - printk(log_lvl); - printk("Call Trace:\n"); + printk("\n%sCall Trace:\n", log_lvl); show_trace_log_lvl(task, esp, log_lvl); } void show_stack(struct task_struct *task, unsigned long *esp) { + printk(" "); show_stack_log_lvl(task, esp, ""); } @@ -233,7 +252,7 @@ void show_registers(struct pt_regs *regs) esp = (unsigned long) (®s->esp); savesegment(ss, ss); - if (user_mode(regs)) { + if (user_mode_vm(regs)) { in_kernel = 0; esp = regs->esp; ss = regs->xss & 0xffff; @@ -333,6 +352,8 @@ void die(const char * str, struct pt_regs * regs, long err) static int die_counter; unsigned long flags; + oops_enter(); + if (die.lock_owner != raw_smp_processor_id()) { console_verbose(); spin_lock_irqsave(&die.lock, flags); @@ -385,6 +406,7 @@ void die(const char * str, struct pt_regs * regs, long err) ssleep(5); panic("Fatal exception"); } + oops_exit(); do_exit(SIGSEGV); } @@ -623,7 +645,7 @@ void die_nmi (struct pt_regs *regs, const char *msg) /* If we are in kernel we are probably nested up pretty bad * and might aswell get out now while we still can. */ - if (!user_mode(regs)) { + if (!user_mode_vm(regs)) { current->thread.trap_no = 2; crash_kexec(regs); } @@ -694,6 +716,7 @@ fastcall void do_nmi(struct pt_regs * regs, long error_code) void set_nmi_callback(nmi_callback_t callback) { + vmalloc_sync_all(); rcu_assign_pointer(nmi_callback, callback); } EXPORT_SYMBOL_GPL(set_nmi_callback); diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S index 4710195b6b7..3f21c6f6466 100644 --- a/arch/i386/kernel/vmlinux.lds.S +++ b/arch/i386/kernel/vmlinux.lds.S @@ -68,6 +68,26 @@ SECTIONS *(.data.init_task) } + /* might get freed after init */ + . 
= ALIGN(4096); + __smp_alt_begin = .; + __smp_alt_instructions = .; + .smp_altinstructions : AT(ADDR(.smp_altinstructions) - LOAD_OFFSET) { + *(.smp_altinstructions) + } + __smp_alt_instructions_end = .; + . = ALIGN(4); + __smp_locks = .; + .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { + *(.smp_locks) + } + __smp_locks_end = .; + .smp_altinstr_replacement : AT(ADDR(.smp_altinstr_replacement) - LOAD_OFFSET) { + *(.smp_altinstr_replacement) + } + . = ALIGN(4096); + __smp_alt_end = .; + /* will be freed after init */ . = ALIGN(4096); /* Init code and data */ __init_begin = .; diff --git a/arch/i386/kernel/vsyscall-sysenter.S b/arch/i386/kernel/vsyscall-sysenter.S index 76b72815940..3b62baa6a37 100644 --- a/arch/i386/kernel/vsyscall-sysenter.S +++ b/arch/i386/kernel/vsyscall-sysenter.S @@ -21,6 +21,9 @@ * instruction clobbers %esp, the user's %esp won't even survive entry * into the kernel. We store %esp in %ebp. Code in entry.S must fetch * arg6 from the stack. + * + * You can not use this vsyscall for the clone() syscall because the + * three dwords on the parent stack do not get copied to the child. */ .text .globl __kernel_vsyscall diff --git a/arch/i386/mach-es7000/es7000.h b/arch/i386/mach-es7000/es7000.h index f1e3204f5de..80566ca4a80 100644 --- a/arch/i386/mach-es7000/es7000.h +++ b/arch/i386/mach-es7000/es7000.h @@ -83,6 +83,7 @@ struct es7000_oem_table { struct psai psai; }; +#ifdef CONFIG_ACPI struct acpi_table_sdt { unsigned long pa; unsigned long count; @@ -99,6 +100,9 @@ struct oem_table { u32 OEMTableSize; }; +extern int find_unisys_acpi_oem_table(unsigned long *oem_addr); +#endif + struct mip_reg { unsigned long long off_0; unsigned long long off_8; @@ -114,7 +118,6 @@ struct mip_reg { #define MIP_FUNC(VALUE) (VALUE & 0xff) extern int parse_unisys_oem (char *oemptr); -extern int find_unisys_acpi_oem_table(unsigned long *oem_addr); extern void setup_unisys(void); extern int es7000_start_cpu(int cpu, unsigned long eip); extern void es7000_sw_apic(void); diff --git a/arch/i386/mach-es7000/es7000plat.c b/arch/i386/mach-es7000/es7000plat.c index a9ab0644f40..3d0fc853516 100644 --- a/arch/i386/mach-es7000/es7000plat.c +++ b/arch/i386/mach-es7000/es7000plat.c @@ -51,8 +51,6 @@ struct mip_reg *host_reg; int mip_port; unsigned long mip_addr, host_addr; -#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_ACPI) - /* * GSI override for ES7000 platforms. */ @@ -76,8 +74,6 @@ es7000_rename_gsi(int ioapic, int gsi) return gsi; } -#endif /* (CONFIG_X86_IO_APIC) && (CONFIG_ACPI) */ - void __init setup_unisys(void) { @@ -160,6 +156,7 @@ parse_unisys_oem (char *oemptr) return es7000_plat; } +#ifdef CONFIG_ACPI int __init find_unisys_acpi_oem_table(unsigned long *oem_addr) { @@ -212,6 +209,7 @@ find_unisys_acpi_oem_table(unsigned long *oem_addr) } return -1; } +#endif static void es7000_spin(int n) diff --git a/arch/i386/mm/fault.c b/arch/i386/mm/fault.c index cf572d9a3b6..7f0fcf219a2 100644 --- a/arch/i386/mm/fault.c +++ b/arch/i386/mm/fault.c @@ -214,6 +214,68 @@ static noinline void force_sig_info_fault(int si_signo, int si_code, fastcall void do_invalid_op(struct pt_regs *, unsigned long); +static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) +{ + unsigned index = pgd_index(address); + pgd_t *pgd_k; + pud_t *pud, *pud_k; + pmd_t *pmd, *pmd_k; + + pgd += index; + pgd_k = init_mm.pgd + index; + + if (!pgd_present(*pgd_k)) + return NULL; + + /* + * set_pgd(pgd, *pgd_k); here would be useless on PAE + * and redundant with the set_pmd() on non-PAE. As would + * set_pud. 
+ */ + + pud = pud_offset(pgd, address); + pud_k = pud_offset(pgd_k, address); + if (!pud_present(*pud_k)) + return NULL; + + pmd = pmd_offset(pud, address); + pmd_k = pmd_offset(pud_k, address); + if (!pmd_present(*pmd_k)) + return NULL; + if (!pmd_present(*pmd)) + set_pmd(pmd, *pmd_k); + else + BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k)); + return pmd_k; +} + +/* + * Handle a fault on the vmalloc or module mapping area + * + * This assumes no large pages in there. + */ +static inline int vmalloc_fault(unsigned long address) +{ + unsigned long pgd_paddr; + pmd_t *pmd_k; + pte_t *pte_k; + /* + * Synchronize this task's top level page-table + * with the 'reference' page table. + * + * Do _not_ use "current" here. We might be inside + * an interrupt in the middle of a task switch.. + */ + pgd_paddr = read_cr3(); + pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); + if (!pmd_k) + return -1; + pte_k = pte_offset_kernel(pmd_k, address); + if (!pte_present(*pte_k)) + return -1; + return 0; +} + /* * This routine handles page faults. It determines the address, * and the problem, and then passes it off to one of the appropriate @@ -223,6 +285,8 @@ fastcall void do_invalid_op(struct pt_regs *, unsigned long); * bit 0 == 0 means no page found, 1 means protection fault * bit 1 == 0 means read, 1 means write * bit 2 == 0 means kernel, 1 means user-mode + * bit 3 == 1 means use of reserved bit detected + * bit 4 == 1 means fault was an instruction fetch */ fastcall void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) @@ -237,13 +301,6 @@ fastcall void __kprobes do_page_fault(struct pt_regs *regs, /* get the address */ address = read_cr2(); - if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, - SIGSEGV) == NOTIFY_STOP) - return; - /* It's safe to allow irq's after cr2 has been saved */ - if (regs->eflags & (X86_EFLAGS_IF|VM_MASK)) - local_irq_enable(); - tsk = current; si_code = SEGV_MAPERR; @@ -259,17 +316,29 @@ fastcall void __kprobes do_page_fault(struct pt_regs *regs, * * This verifies that the fault happens in kernel space * (error_code & 4) == 0, and that the fault was not a - * protection error (error_code & 1) == 0. + * protection error (error_code & 9) == 0. */ - if (unlikely(address >= TASK_SIZE)) { - if (!(error_code & 5)) - goto vmalloc_fault; - /* + if (unlikely(address >= TASK_SIZE)) { + if (!(error_code & 0x0000000d) && vmalloc_fault(address) >= 0) + return; + if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, + SIGSEGV) == NOTIFY_STOP) + return; + /* * Don't take the mm semaphore here. If we fixup a prefetch * fault we could otherwise deadlock. */ goto bad_area_nosemaphore; - } + } + + if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, + SIGSEGV) == NOTIFY_STOP) + return; + + /* It's safe to allow irq's after cr2 has been saved and the vmalloc + fault has been handled. */ + if (regs->eflags & (X86_EFLAGS_IF|VM_MASK)) + local_irq_enable(); mm = tsk->mm; @@ -440,24 +509,31 @@ no_context: bust_spinlocks(1); -#ifdef CONFIG_X86_PAE - if (error_code & 16) { - pte_t *pte = lookup_address(address); + if (oops_may_print()) { + #ifdef CONFIG_X86_PAE + if (error_code & 16) { + pte_t *pte = lookup_address(address); - if (pte && pte_present(*pte) && !pte_exec_kernel(*pte)) - printk(KERN_CRIT "kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n", current->uid); + if (pte && pte_present(*pte) && !pte_exec_kernel(*pte)) + printk(KERN_CRIT "kernel tried to execute " + "NX-protected page - exploit attempt? 
" + "(uid: %d)\n", current->uid); + } + #endif + if (address < PAGE_SIZE) + printk(KERN_ALERT "BUG: unable to handle kernel NULL " + "pointer dereference"); + else + printk(KERN_ALERT "BUG: unable to handle kernel paging" + " request"); + printk(" at virtual address %08lx\n",address); + printk(KERN_ALERT " printing eip:\n"); + printk("%08lx\n", regs->eip); } -#endif - if (address < PAGE_SIZE) - printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference"); - else - printk(KERN_ALERT "Unable to handle kernel paging request"); - printk(" at virtual address %08lx\n",address); - printk(KERN_ALERT " printing eip:\n"); - printk("%08lx\n", regs->eip); page = read_cr3(); page = ((unsigned long *) __va(page))[address >> 22]; - printk(KERN_ALERT "*pde = %08lx\n", page); + if (oops_may_print()) + printk(KERN_ALERT "*pde = %08lx\n", page); /* * We must not directly access the pte in the highpte * case, the page table might be allocated in highmem. @@ -465,7 +541,7 @@ no_context: * it's allocated already. */ #ifndef CONFIG_HIGHPTE - if (page & 1) { + if ((page & 1) && oops_may_print()) { page &= PAGE_MASK; address &= 0x003ff000; page = ((unsigned long *) __va(page))[address >> PAGE_SHIFT]; @@ -510,51 +586,41 @@ do_sigbus: tsk->thread.error_code = error_code; tsk->thread.trap_no = 14; force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk); - return; - -vmalloc_fault: - { - /* - * Synchronize this task's top level page-table - * with the 'reference' page table. - * - * Do _not_ use "tsk" here. We might be inside - * an interrupt in the middle of a task switch.. - */ - int index = pgd_index(address); - unsigned long pgd_paddr; - pgd_t *pgd, *pgd_k; - pud_t *pud, *pud_k; - pmd_t *pmd, *pmd_k; - pte_t *pte_k; - - pgd_paddr = read_cr3(); - pgd = index + (pgd_t *)__va(pgd_paddr); - pgd_k = init_mm.pgd + index; - - if (!pgd_present(*pgd_k)) - goto no_context; - - /* - * set_pgd(pgd, *pgd_k); here would be useless on PAE - * and redundant with the set_pmd() on non-PAE. As would - * set_pud. - */ +} - pud = pud_offset(pgd, address); - pud_k = pud_offset(pgd_k, address); - if (!pud_present(*pud_k)) - goto no_context; - - pmd = pmd_offset(pud, address); - pmd_k = pmd_offset(pud_k, address); - if (!pmd_present(*pmd_k)) - goto no_context; - set_pmd(pmd, *pmd_k); +#ifndef CONFIG_X86_PAE +void vmalloc_sync_all(void) +{ + /* + * Note that races in the updates of insync and start aren't + * problematic: insync can only get set bits added, and updates to + * start are only improving performance (without affecting correctness + * if undone). 
+ */ + static DECLARE_BITMAP(insync, PTRS_PER_PGD); + static unsigned long start = TASK_SIZE; + unsigned long address; - pte_k = pte_offset_kernel(pmd_k, address); - if (!pte_present(*pte_k)) - goto no_context; - return; + BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK); + for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) { + if (!test_bit(pgd_index(address), insync)) { + unsigned long flags; + struct page *page; + + spin_lock_irqsave(&pgd_lock, flags); + for (page = pgd_list; page; page = + (struct page *)page->index) + if (!vmalloc_sync_one(page_address(page), + address)) { + BUG_ON(page != pgd_list); + break; + } + spin_unlock_irqrestore(&pgd_lock, flags); + if (!page) + set_bit(pgd_index(address), insync); + } + if (address == start && test_bit(pgd_index(address), insync)) + start = address + PGDIR_SIZE; } } +#endif diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c index 7ba55a6e2db..9f66ac582a8 100644 --- a/arch/i386/mm/init.c +++ b/arch/i386/mm/init.c @@ -720,21 +720,6 @@ static int noinline do_test_wp_bit(void) return flag; } -void free_initmem(void) -{ - unsigned long addr; - - addr = (unsigned long)(&__init_begin); - for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { - ClearPageReserved(virt_to_page(addr)); - init_page_count(virt_to_page(addr)); - memset((void *)addr, 0xcc, PAGE_SIZE); - free_page(addr); - totalram_pages++; - } - printk (KERN_INFO "Freeing unused kernel memory: %dk freed\n", (__init_end - __init_begin) >> 10); -} - #ifdef CONFIG_DEBUG_RODATA extern char __start_rodata, __end_rodata; @@ -758,17 +743,31 @@ void mark_rodata_ro(void) } #endif +void free_init_pages(char *what, unsigned long begin, unsigned long end) +{ + unsigned long addr; + + for (addr = begin; addr < end; addr += PAGE_SIZE) { + ClearPageReserved(virt_to_page(addr)); + init_page_count(virt_to_page(addr)); + memset((void *)addr, 0xcc, PAGE_SIZE); + free_page(addr); + totalram_pages++; + } + printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10); +} + +void free_initmem(void) +{ + free_init_pages("unused kernel memory", + (unsigned long)(&__init_begin), + (unsigned long)(&__init_end)); +} #ifdef CONFIG_BLK_DEV_INITRD void free_initrd_mem(unsigned long start, unsigned long end) { - if (start < end) - printk (KERN_INFO "Freeing initrd memory: %ldk freed\n", (end - start) >> 10); - for (; start < end; start += PAGE_SIZE) { - ClearPageReserved(virt_to_page(start)); - init_page_count(virt_to_page(start)); - free_page(start); - totalram_pages++; - } + free_init_pages("initrd memory", start, end); } #endif + diff --git a/arch/i386/oprofile/nmi_int.c b/arch/i386/oprofile/nmi_int.c index 0493e8b8ec4..1accce50c2c 100644 --- a/arch/i386/oprofile/nmi_int.c +++ b/arch/i386/oprofile/nmi_int.c @@ -122,7 +122,7 @@ static void nmi_save_registers(void * dummy) static void free_msrs(void) { int i; - for (i = 0; i < NR_CPUS; ++i) { + for_each_cpu(i) { kfree(cpu_msrs[i].counters); cpu_msrs[i].counters = NULL; kfree(cpu_msrs[i].controls); @@ -138,10 +138,7 @@ static int allocate_msrs(void) size_t counters_size = sizeof(struct op_msr) * model->num_counters; int i; - for (i = 0; i < NR_CPUS; ++i) { - if (!cpu_online(i)) - continue; - + for_each_online_cpu(i) { cpu_msrs[i].counters = kmalloc(counters_size, GFP_KERNEL); if (!cpu_msrs[i].counters) { success = 0; |
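The REGPARM help text earlier in this diff describes the -mregparm=3 ABI in prose. As a point of reference only (not part of the patch), the same calling convention can be requested per function in ordinary C with GCC's regparm attribute, so the effect of the option can be inspected on a small example:

/* add3() receives its three arguments in %eax, %edx and %ecx on
 * 32-bit x86 instead of on the stack; on other targets the attribute
 * is simply omitted so the example still compiles. */
#include <stdio.h>

#ifdef __i386__
#define REGPARM3 __attribute__((regparm(3)))
#else
#define REGPARM3
#endif

static int REGPARM3 add3(int a, int b, int c)
{
	return a + b + c;
}

int main(void)
{
	printf("%d\n", add3(1, 2, 3));
	return 0;
}

The new arch/i386/kernel/alternative.c pads the tail of each patched instruction with CPU-appropriate NOPs, at most ASM_NOP_MAX bytes at a time. The following standalone user-space sketch is illustrative only: the buffer contents and the single-byte 0x90 NOP table are stand-ins for the kernel's per-vendor NOP sequences, and it reproduces just the padding loop on a plain byte buffer:

#include <stdio.h>
#include <string.h>

#define ASM_NOP_MAX 8

static unsigned char nop_bytes[ASM_NOP_MAX + 1][ASM_NOP_MAX];
static unsigned char *noptable[ASM_NOP_MAX + 1];

static void init_nop_table(void)
{
	int k;

	/* entry k is k one-byte NOPs (0x90); the kernel instead uses the
	 * multi-byte GENERIC/K7/K8 NOP sequences selected at boot */
	for (k = 1; k <= ASM_NOP_MAX; k++) {
		memset(nop_bytes[k], 0x90, k);
		noptable[k] = nop_bytes[k];
	}
}

/* same structure as the inner loop of apply_alternatives(): copy the
 * replacement, then fill the remaining bytes with NOP sequences */
static void patch(unsigned char *instr, int instrlen,
		  const unsigned char *replacement, int replacementlen)
{
	int diff, i, k;

	memcpy(instr, replacement, replacementlen);
	diff = instrlen - replacementlen;
	for (i = replacementlen; diff > 0; diff -= k, i += k) {
		k = diff;
		if (k > ASM_NOP_MAX)
			k = ASM_NOP_MAX;
		memcpy(instr + i, noptable[k], k);
	}
}

int main(void)
{
	unsigned char buf[16];
	const unsigned char repl[] = { 0xde, 0xad, 0xbe, 0xef };
	int i;

	init_nop_table();
	memset(buf, 0xcc, sizeof(buf));		/* fake original code */
	patch(buf, sizeof(buf), repl, sizeof(repl));

	for (i = 0; i < (int)sizeof(buf); i++)
		printf("%02x ", buf[i]);	/* 4 replacement bytes, then 12 NOPs */
	printf("\n");
	return 0;
}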