Diffstat (limited to 'arch/x86/kernel/process.c')
| Mode | File | Lines changed |
| --- | --- | --- |
| -rw-r--r-- | arch/x86/kernel/process.c | 478 |
1 file changed, 136 insertions, 342 deletions
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 57d1868a86a..4505e2a950d 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -1,3 +1,5 @@
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/errno.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
@@ -12,24 +14,54 @@
 #include <linux/user-return-notifier.h>
 #include <linux/dmi.h>
 #include <linux/utsname.h>
+#include <linux/stackprotector.h>
+#include <linux/tick.h>
+#include <linux/cpuidle.h>
 #include <trace/events/power.h>
 #include <linux/hw_breakpoint.h>
-#include <asm/system.h>
+#include <asm/cpu.h>
 #include <asm/apic.h>
 #include <asm/syscalls.h>
 #include <asm/idle.h>
 #include <asm/uaccess.h>
 #include <asm/i387.h>
+#include <asm/fpu-internal.h>
 #include <asm/debugreg.h>
+#include <asm/nmi.h>
+
+/*
+ * per-CPU TSS segments. Threads are completely 'soft' on Linux,
+ * no more per-task TSS's. The TSS size is kept cacheline-aligned
+ * so they are allowed to end up in the .data..cacheline_aligned
+ * section. Since TSS's are completely CPU-local, we want them
+ * on exact cacheline boundaries, to eliminate cacheline ping-pong.
+ */
+__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS;
 
-unsigned long idle_halt;
-EXPORT_SYMBOL(idle_halt);
-unsigned long idle_nomwait;
-EXPORT_SYMBOL(idle_nomwait);
+#ifdef CONFIG_X86_64
+static DEFINE_PER_CPU(unsigned char, is_idle);
+static ATOMIC_NOTIFIER_HEAD(idle_notifier);
+
+void idle_notifier_register(struct notifier_block *n)
+{
+	atomic_notifier_chain_register(&idle_notifier, n);
+}
+EXPORT_SYMBOL_GPL(idle_notifier_register);
+
+void idle_notifier_unregister(struct notifier_block *n)
+{
+	atomic_notifier_chain_unregister(&idle_notifier, n);
+}
+EXPORT_SYMBOL_GPL(idle_notifier_unregister);
+#endif
 
 struct kmem_cache *task_xstate_cachep;
 EXPORT_SYMBOL_GPL(task_xstate_cachep);
 
+/*
+ * this gets called so that we can store lazy state into memory and copy the
+ * current task into the new thread.
+ */
 int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
 {
 	int ret;
@@ -40,7 +72,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
 		ret = fpu_alloc(&dst->thread.fpu);
 		if (ret)
 			return ret;
-		fpu_copy(&dst->thread.fpu, &src->thread.fpu);
+		fpu_copy(dst, src);
 	}
 	return 0;
 }
@@ -50,10 +82,9 @@ void free_thread_xstate(struct task_struct *tsk)
 	fpu_free(&tsk->thread.fpu);
 }
 
-void free_thread_info(struct thread_info *ti)
+void arch_release_task_struct(struct task_struct *tsk)
 {
-	free_thread_xstate(ti->task);
-	free_pages((unsigned long)ti, get_order(THREAD_SIZE));
+	free_thread_xstate(tsk);
 }
 
 void arch_task_cache_init(void)
@@ -86,32 +117,8 @@ void exit_thread(void)
 		put_cpu();
 		kfree(bp);
 	}
-}
 
-void show_regs(struct pt_regs *regs)
-{
-	show_registers(regs);
-	show_trace(NULL, regs, (unsigned long *)kernel_stack_pointer(regs),
-		   regs->bp);
-}
-
-void show_regs_common(void)
-{
-	const char *board, *product;
-
-	board = dmi_get_system_info(DMI_BOARD_NAME);
-	if (!board)
-		board = "";
-	product = dmi_get_system_info(DMI_PRODUCT_NAME);
-	if (!product)
-		product = "";
-
-	printk(KERN_CONT "\n");
-	printk(KERN_DEFAULT "Pid: %d, comm: %.20s %s %s %.*s %s/%s\n",
-		current->pid, current->comm, print_tainted(),
-		init_utsname()->release,
-		(int)strcspn(init_utsname()->version, " "),
-		init_utsname()->version, board, product);
+	drop_fpu(me);
 }
 
 void flush_thread(void)
@@ -120,12 +127,13 @@ void flush_thread(void)
 
 	flush_ptrace_hw_breakpoint(tsk);
 	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
+	drop_init_fpu(tsk);
 	/*
-	 * Forget coprocessor state..
+	 * Free the FPU state for non xsave platforms. They get reallocated
+	 * lazily at the first use.
 	 */
-	tsk->fpu_counter = 0;
-	clear_fpu(tsk);
-	clear_used_math();
+	if (!use_eager_fpu())
+		free_thread_xstate(tsk);
 }
 
 static void hard_disable_TSC(void)
@@ -230,349 +238,147 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
 	propagate_user_return_notify(prev_p, next_p);
 }
 
-int sys_fork(struct pt_regs *regs)
-{
-	return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
-}
-
-/*
- * This is trivial, and on the face of it looks like it
- * could equally well be done in user mode.
- *
- * Not so, for quite unobvious reasons - register pressure.
- * In user mode vfork() cannot have a stack frame, and if
- * done by calling the "clone()" system call directly, you
- * do not have enough call-clobbered registers to hold all
- * the information you need.
- */
-int sys_vfork(struct pt_regs *regs)
-{
-	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0,
-		       NULL, NULL);
-}
-
-long
-sys_clone(unsigned long clone_flags, unsigned long newsp,
-	  void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
-{
-	if (!newsp)
-		newsp = regs->sp;
-	return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
-}
-
-/*
- * This gets run with %si containing the
- * function to call, and %di containing
- * the "args".
- */
-extern void kernel_thread_helper(void);
-
-/*
- * Create a kernel thread
- */
-int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
-{
-	struct pt_regs regs;
-
-	memset(&regs, 0, sizeof(regs));
-
-	regs.si = (unsigned long) fn;
-	regs.di = (unsigned long) arg;
-
-#ifdef CONFIG_X86_32
-	regs.ds = __USER_DS;
-	regs.es = __USER_DS;
-	regs.fs = __KERNEL_PERCPU;
-	regs.gs = __KERNEL_STACK_CANARY;
-#else
-	regs.ss = __KERNEL_DS;
-#endif
-
-	regs.orig_ax = -1;
-	regs.ip = (unsigned long) kernel_thread_helper;
-	regs.cs = __KERNEL_CS | get_kernel_rpl();
-	regs.flags = X86_EFLAGS_IF | 0x2;
-
-	/* Ok, create the new process.. */
-	return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
-}
-EXPORT_SYMBOL(kernel_thread);
-
-/*
- * sys_execve() executes a new program.
- */
-long sys_execve(const char __user *name,
-		const char __user *const __user *argv,
-		const char __user *const __user *envp, struct pt_regs *regs)
-{
-	long error;
-	char *filename;
-
-	filename = getname(name);
-	error = PTR_ERR(filename);
-	if (IS_ERR(filename))
-		return error;
-	error = do_execve(filename, argv, envp, regs);
-
-#ifdef CONFIG_X86_32
-	if (error == 0) {
-		/* Make sure we don't return using sysenter.. */
-		set_thread_flag(TIF_IRET);
-	}
-#endif
-
-	putname(filename);
-	return error;
-}
-
 /*
  * Idle related variables and functions
  */
-unsigned long boot_option_idle_override = 0;
+unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE;
 EXPORT_SYMBOL(boot_option_idle_override);
 
-/*
- * Powermanagement idle function, if any..
- */
-void (*pm_idle)(void);
-EXPORT_SYMBOL(pm_idle);
+static void (*x86_idle)(void);
 
-#ifdef CONFIG_X86_32
-/*
- * This halt magic was a workaround for ancient floppy DMA
- * wreckage. It should be safe to remove.
- */
-static int hlt_counter;
-void disable_hlt(void)
+#ifndef CONFIG_SMP
+static inline void play_dead(void)
 {
-	hlt_counter++;
+	BUG();
 }
-EXPORT_SYMBOL(disable_hlt);
+#endif
 
-void enable_hlt(void)
+#ifdef CONFIG_X86_64
+void enter_idle(void)
 {
-	hlt_counter--;
+	this_cpu_write(is_idle, 1);
+	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
 }
-EXPORT_SYMBOL(enable_hlt);
 
-static inline int hlt_use_halt(void)
+static void __exit_idle(void)
 {
-	return (!hlt_counter && boot_cpu_data.hlt_works_ok);
+	if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
+		return;
+	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
 }
-#else
-static inline int hlt_use_halt(void)
+
+/* Called from interrupts to signify idle end */
+void exit_idle(void)
 {
-	return 1;
+	/* idle loop has pid 0 */
+	if (current->pid)
+		return;
+	__exit_idle();
 }
 #endif
 
-/*
- * We use this if we don't have any better
- * idle routine..
- */
-void default_idle(void)
+void arch_cpu_idle_enter(void)
 {
-	if (hlt_use_halt()) {
-		trace_power_start(POWER_CSTATE, 1, smp_processor_id());
-		current_thread_info()->status &= ~TS_POLLING;
-		/*
-		 * TS_POLLING-cleared state must be visible before we
-		 * test NEED_RESCHED:
-		 */
-		smp_mb();
-
-		if (!need_resched())
-			safe_halt();	/* enables interrupts racelessly */
-		else
-			local_irq_enable();
-		current_thread_info()->status |= TS_POLLING;
-	} else {
-		local_irq_enable();
-		/* loop is done by the caller */
-		cpu_relax();
-	}
+	local_touch_nmi();
+	enter_idle();
 }
-#ifdef CONFIG_APM_MODULE
-EXPORT_SYMBOL(default_idle);
-#endif
 
-void stop_this_cpu(void *dummy)
+void arch_cpu_idle_exit(void)
 {
-	local_irq_disable();
-	/*
-	 * Remove this CPU:
-	 */
-	set_cpu_online(smp_processor_id(), false);
-	disable_local_APIC();
-
-	for (;;) {
-		if (hlt_works(smp_processor_id()))
-			halt();
-	}
+	__exit_idle();
 }
 
-static void do_nothing(void *unused)
+void arch_cpu_idle_dead(void)
 {
+	play_dead();
 }
 
 /*
- * cpu_idle_wait - Used to ensure that all the CPUs discard old value of
- * pm_idle and update to new pm_idle value. Required while changing pm_idle
- * handler on SMP systems.
- *
- * Caller must have changed pm_idle to the new value before the call. Old
- * pm_idle value will not be used by any CPU after the return of this function.
+ * Called from the generic idle code.
  */
-void cpu_idle_wait(void)
+void arch_cpu_idle(void)
 {
-	smp_mb();
-	/* kick all the CPUs so that they exit out of pm_idle */
-	smp_call_function(do_nothing, NULL, 1);
+	x86_idle();
 }
-EXPORT_SYMBOL_GPL(cpu_idle_wait);
 
 /*
- * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
- * which can obviate IPI to trigger checking of need_resched.
- * We execute MONITOR against need_resched and enter optimized wait state
- * through MWAIT. Whenever someone changes need_resched, we would be woken
- * up from MWAIT (without an IPI).
- *
- * New with Core Duo processors, MWAIT can take some hints based on CPU
- * capability.
+ * We use this if we don't have any better idle routine..
  */
-void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
+void default_idle(void)
 {
-	trace_power_start(POWER_CSTATE, (ax>>4)+1, smp_processor_id());
-	if (!need_resched()) {
-		if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR))
-			clflush((void *)&current_thread_info()->flags);
-
-		__monitor((void *)&current_thread_info()->flags, 0, 0);
-		smp_mb();
-		if (!need_resched())
-			__mwait(ax, cx);
-	}
+	trace_cpu_idle_rcuidle(1, smp_processor_id());
+	safe_halt();
+	trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
 }
+#ifdef CONFIG_APM_MODULE
+EXPORT_SYMBOL(default_idle);
+#endif
 
-/* Default MONITOR/MWAIT with no hints, used for default C1 state */
-static void mwait_idle(void)
+#ifdef CONFIG_XEN
+bool xen_set_default_idle(void)
 {
-	if (!need_resched()) {
-		trace_power_start(POWER_CSTATE, 1, smp_processor_id());
-		if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR))
-			clflush((void *)&current_thread_info()->flags);
+	bool ret = !!x86_idle;
 
-		__monitor((void *)&current_thread_info()->flags, 0, 0);
-		smp_mb();
-		if (!need_resched())
-			__sti_mwait(0, 0);
-		else
-			local_irq_enable();
-	} else
-		local_irq_enable();
-}
+	x86_idle = default_idle;
 
-/*
- * On SMP it's slightly faster (but much more power-consuming!)
- * to poll the ->work.need_resched flag instead of waiting for the
- * cross-CPU IPI to arrive. Use this option with caution.
- */
-static void poll_idle(void)
-{
-	trace_power_start(POWER_CSTATE, 0, smp_processor_id());
-	local_irq_enable();
-	while (!need_resched())
-		cpu_relax();
-	trace_power_end(0);
+	return ret;
 }
-
-/*
- * mwait selection logic:
- *
- * It depends on the CPU. For AMD CPUs that support MWAIT this is
- * wrong. Family 0x10 and 0x11 CPUs will enter C1 on HLT. Powersavings
- * then depend on a clock divisor and current Pstate of the core. If
- * all cores of a processor are in halt state (C1) the processor can
- * enter the C1E (C1 enhanced) state. If mwait is used this will never
- * happen.
- *
- * idle=mwait overrides this decision and forces the usage of mwait.
- */
-static int __cpuinitdata force_mwait;
-
-#define MWAIT_INFO			0x05
-#define MWAIT_ECX_EXTENDED_INFO		0x01
-#define MWAIT_EDX_C1			0xf0
-
-static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
+#endif
+void stop_this_cpu(void *dummy)
 {
-	u32 eax, ebx, ecx, edx;
-
-	if (force_mwait)
-		return 1;
-
-	if (c->cpuid_level < MWAIT_INFO)
-		return 0;
-
-	cpuid(MWAIT_INFO, &eax, &ebx, &ecx, &edx);
-	/* Check, whether EDX has extended info about MWAIT */
-	if (!(ecx & MWAIT_ECX_EXTENDED_INFO))
-		return 1;
-
+	local_irq_disable();
 	/*
-	 * edx enumeratios MONITOR/MWAIT extensions. Check, whether
-	 * C1 supports MWAIT
+	 * Remove this CPU:
 	 */
-	return (edx & MWAIT_EDX_C1);
+	set_cpu_online(smp_processor_id(), false);
+	disable_local_APIC();
+
+	for (;;)
+		halt();
 }
 
-bool c1e_detected;
-EXPORT_SYMBOL(c1e_detected);
+bool amd_e400_c1e_detected;
+EXPORT_SYMBOL(amd_e400_c1e_detected);
 
-static cpumask_var_t c1e_mask;
+static cpumask_var_t amd_e400_c1e_mask;
 
-void c1e_remove_cpu(int cpu)
+void amd_e400_remove_cpu(int cpu)
 {
-	if (c1e_mask != NULL)
-		cpumask_clear_cpu(cpu, c1e_mask);
+	if (amd_e400_c1e_mask != NULL)
+		cpumask_clear_cpu(cpu, amd_e400_c1e_mask);
 }
 
 /*
- * C1E aware idle routine. We check for C1E active in the interrupt
+ * AMD Erratum 400 aware idle routine. We check for C1E active in the interrupt
  * pending message MSR. If we detect C1E, then we handle it the same
  * way as C3 power states (local apic timer and TSC stop)
  */
-static void c1e_idle(void)
+static void amd_e400_idle(void)
 {
-	if (need_resched())
-		return;
-
-	if (!c1e_detected) {
+	if (!amd_e400_c1e_detected) {
 		u32 lo, hi;
 
 		rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
 
 		if (lo & K8_INTP_C1E_ACTIVE_MASK) {
-			c1e_detected = true;
+			amd_e400_c1e_detected = true;
 			if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
 				mark_tsc_unstable("TSC halt in AMD C1E");
-			printk(KERN_INFO "System has AMD C1E enabled\n");
+			pr_info("System has AMD C1E enabled\n");
 		}
 	}
 
-	if (c1e_detected) {
+	if (amd_e400_c1e_detected) {
 		int cpu = smp_processor_id();
 
-		if (!cpumask_test_cpu(cpu, c1e_mask)) {
-			cpumask_set_cpu(cpu, c1e_mask);
+		if (!cpumask_test_cpu(cpu, amd_e400_c1e_mask)) {
+			cpumask_set_cpu(cpu, amd_e400_c1e_mask);
 			/*
 			 * Force broadcast so ACPI can not interfere.
 			 */
 			clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE, &cpu);
-			printk(KERN_INFO "Switch to broadcast mode on CPU%d\n",
-			       cpu);
+			pr_info("Switch to broadcast mode on CPU%d\n", cpu);
 		}
 		clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);
 
@@ -582,43 +388,35 @@ static void c1e_idle(void)
 		 * The switch back from broadcast mode needs to be
 		 * called with interrupts disabled.
 		 */
-		 local_irq_disable();
-		 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
-		 local_irq_enable();
+		local_irq_disable();
+		clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
+		local_irq_enable();
 	} else
 		default_idle();
 }
 
-void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
+void select_idle_routine(const struct cpuinfo_x86 *c)
 {
 #ifdef CONFIG_SMP
-	if (pm_idle == poll_idle && smp_num_siblings > 1) {
-		printk_once(KERN_WARNING "WARNING: polling idle and HT enabled,"
-			" performance may degrade.\n");
-	}
+	if (boot_option_idle_override == IDLE_POLL && smp_num_siblings > 1)
+		pr_warn_once("WARNING: polling idle and HT enabled, performance may degrade\n");
 #endif
-	if (pm_idle)
+	if (x86_idle || boot_option_idle_override == IDLE_POLL)
 		return;
 
-	if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) {
-		/*
-		 * One CPU supports mwait => All CPUs supports mwait
-		 */
-		printk(KERN_INFO "using mwait in idle threads.\n");
-		pm_idle = mwait_idle;
-	} else if (cpu_has_amd_erratum(amd_erratum_400)) {
+	if (cpu_has_bug(c, X86_BUG_AMD_APIC_C1E)) {
 		/* E400: APIC timer interrupt does not wake up CPU from C1e */
-		printk(KERN_INFO "using C1E aware idle routine\n");
-		pm_idle = c1e_idle;
+		pr_info("using AMD E400 aware idle routine\n");
+		x86_idle = amd_e400_idle;
 	} else
-		pm_idle = default_idle;
+		x86_idle = default_idle;
 }
 
-void __init init_c1e_mask(void)
+void __init init_amd_e400_c1e_mask(void)
 {
-	/* If we're using c1e_idle, we need to allocate c1e_mask. */
-	if (pm_idle == c1e_idle)
-		zalloc_cpumask_var(&c1e_mask, GFP_KERNEL);
+	/* If we're using amd_e400_idle, we need to allocate amd_e400_c1e_mask. */
+	if (x86_idle == amd_e400_idle)
+		zalloc_cpumask_var(&amd_e400_c1e_mask, GFP_KERNEL);
 }
 
 static int __init idle_setup(char *str)
@@ -627,11 +425,10 @@ static int __init idle_setup(char *str)
 		return -EINVAL;
 
 	if (!strcmp(str, "poll")) {
-		printk("using polling idle threads.\n");
-		pm_idle = poll_idle;
-	} else if (!strcmp(str, "mwait"))
-		force_mwait = 1;
-	else if (!strcmp(str, "halt")) {
+		pr_info("using polling idle threads\n");
+		boot_option_idle_override = IDLE_POLL;
+		cpu_idle_poll_ctrl(true);
+	} else if (!strcmp(str, "halt")) {
 		/*
 		 * When the boot option of idle=halt is added, halt is
 		 * forced to be used for CPU idle. In such case CPU C2/C3
@@ -639,9 +436,8 @@ static int __init idle_setup(char *str)
 		 * To continue to load the CPU idle driver, don't touch
 		 * the boot_option_idle_override.
 		 */
-		pm_idle = default_idle;
-		idle_halt = 1;
-		return 0;
+		x86_idle = default_idle;
+		boot_option_idle_override = IDLE_HALT;
 	} else if (!strcmp(str, "nomwait")) {
 		/*
 		 * If the boot option of "idle=nomwait" is added,
 		 * it means that mwait will be disabled for CPU C2/C3
 		 * states. In such case it won't touch the variable
 		 * of boot_option_idle_override.
 		 */
-		idle_nomwait = 1;
-		return 0;
+		boot_option_idle_override = IDLE_NOMWAIT;
 	} else
 		return -1;
 
-	boot_option_idle_override = 1;
 	return 0;
 }
 early_param("idle", idle_setup);
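A note on the structure of the change: the arch_cpu_idle*() functions introduced above are hooks invoked by the kernel's generic idle loop (kernel/cpu/idle.c when these hooks first appeared), which is why the need_resched()/TS_POLLING handling disappears from default_idle(). The sketch below only illustrates that call order as I understand it; it is not the real generic loop, it omits tick/RCU/broadcast bookkeeping, and it references internal names (cpu_idle_force_poll, cpu_idle_poll) that are not exported, so treat it as an illustration rather than buildable code.

```c
#include <linux/cpu.h>
#include <linux/sched.h>
#include <linux/irqflags.h>

/* Rough sketch of the generic idle loop driving the x86 hooks from the diff. */
static void idle_loop_sketch(void)
{
	while (1) {
		while (!need_resched()) {
			if (cpu_is_offline(smp_processor_id()))
				arch_cpu_idle_dead();	/* x86: play_dead(), does not return */

			local_irq_disable();
			arch_cpu_idle_enter();		/* x86: local_touch_nmi(); enter_idle() */

			if (cpu_idle_force_poll)	/* set via cpu_idle_poll_ctrl(true), i.e. idle=poll */
				cpu_idle_poll();	/* spin on need_resched() with IRQs enabled */
			else
				arch_cpu_idle();	/* x86: x86_idle(), e.g. default_idle() -> safe_halt() */

			arch_cpu_idle_exit();		/* x86: __exit_idle(), fires IDLE_END notifiers */
		}
		schedule_preempt_disabled();		/* a task became runnable */
	}
}
```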
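The x86-64 idle notifier API retained by this patch (idle_notifier_register()/idle_notifier_unregister(), firing IDLE_START/IDLE_END through an atomic notifier chain) would be consumed roughly as in the hypothetical module below. This is an illustrative sketch, not code from the patch: it assumes the IDLE_START/IDLE_END constants and the register/unregister prototypes come from <asm/idle.h>, as they did in kernels of this era, and it is only meaningful on CONFIG_X86_64 builds. The callback may run from idle or interrupt context, so it only does atomic work.

```c
#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/atomic.h>
#include <asm/idle.h>

static atomic_t idle_transitions = ATOMIC_INIT(0);

/* Standard notifier callback: 'action' is IDLE_START or IDLE_END here. */
static int demo_idle_notify(struct notifier_block *nb, unsigned long action,
			    void *data)
{
	if (action == IDLE_START || action == IDLE_END)
		atomic_inc(&idle_transitions);
	return NOTIFY_OK;
}

static struct notifier_block demo_idle_nb = {
	.notifier_call = demo_idle_notify,
};

static int __init demo_idle_init(void)
{
	idle_notifier_register(&demo_idle_nb);	/* EXPORT_SYMBOL_GPL in the patch */
	return 0;
}

static void __exit demo_idle_exit(void)
{
	idle_notifier_unregister(&demo_idle_nb);
}

module_init(demo_idle_init);
module_exit(demo_idle_exit);
MODULE_LICENSE("GPL");
```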
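Finally, the new pr_fmt() definition plus the printk(KERN_INFO ...) to pr_info()/pr_warn_once() conversions follow the usual pattern, sketched in the hypothetical module below: because pr_fmt() is defined before the first header is included, every pr_*() call in the translation unit picks up a "KBUILD_MODNAME: " prefix automatically (e.g. a "process: " prefix for this file).

```c
/* Must come before the first #include so it affects every pr_*() call. */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kernel.h>
#include <linux/module.h>

static int __init prfmt_demo_init(void)
{
	/*
	 * pr_info() expands to printk(KERN_INFO pr_fmt(...)), so this log
	 * line is emitted as "prfmt_demo: hello from the idle rework".
	 */
	pr_info("hello from the idle rework\n");
	return 0;
}

static void __exit prfmt_demo_exit(void)
{
	pr_info("goodbye\n");
}

module_init(prfmt_demo_init);
module_exit(prfmt_demo_exit);
MODULE_LICENSE("GPL");
```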