From 10a0a8d4e3f6bf2d077f94344441909abe670f5a Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 17 Jul 2007 18:37:02 -0700 Subject: Add common orderly_poweroff() Various pieces of code around the kernel want to be able to trigger an orderly poweroff. This pulls them together into a single implementation. By default the poweroff command is /sbin/poweroff, but it can be set via sysctl: kernel/poweroff_cmd. This is split at whitespace, so it can include command-line arguments. This patch replaces four other instances of invoking either "poweroff" or "shutdown -h now": two sbus drivers, and acpi thermal management. sparc64 has its own "powerd"; still need to determine whether it should be replaced by orderly_poweroff(). Signed-off-by: Jeremy Fitzhardinge Acked-by: Len Brown Signed-off-by: Chris Wright Cc: Andrew Morton Cc: Randy Dunlap Cc: Andi Kleen Cc: Al Viro Cc: Arnd Bergmann Cc: David S. Miller --- kernel/sysctl.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel/sysctl.c') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 7063ebc6db0..44a1d699aad 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include @@ -705,6 +706,15 @@ static ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif + { + .ctl_name = CTL_UNNUMBERED, + .procname = "poweroff_cmd", + .data = &poweroff_cmd, + .maxlen = POWEROFF_CMD_PATH_LEN, + .mode = 0644, + .proc_handler = &proc_dostring, + .strategy = &sysctl_string, + }, { .ctl_name = 0 } }; -- cgit v1.2.3-70-g09d2 From 77afcf78a2ded9a91838734234949c0ead5feb12 Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Thu, 19 Jul 2007 01:47:41 -0700 Subject: PM: Integrate beeping flag with existing acpi_sleep flags Move "debug during resume from s2ram" into the variable we already use for real-mode flags to simplify code. It also closes nasty trap for the user in acpi_sleep_setup; order of parameters actually mattered there, acpi_sleep=s3_bios,s3_mode doing something different from acpi_sleep=s3_mode,s3_bios. Signed-off-by: Pavel Machek Signed-off-by: Rafael J. Wysocki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/acpi/sleep.c | 12 ++++++++---- arch/i386/kernel/acpi/wakeup.S | 18 ++++++++---------- arch/x86_64/kernel/acpi/sleep.c | 8 +++++--- arch/x86_64/kernel/acpi/wakeup.S | 15 ++++++--------- include/linux/acpi.h | 3 +-- kernel/power/main.c | 23 ----------------------- kernel/sysctl.c | 2 +- 7 files changed, 29 insertions(+), 52 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/arch/i386/kernel/acpi/sleep.c b/arch/i386/kernel/acpi/sleep.c index 4ee83577bf6..c42b5ab49de 100644 --- a/arch/i386/kernel/acpi/sleep.c +++ b/arch/i386/kernel/acpi/sleep.c @@ -14,7 +14,7 @@ /* address in low memory of the wakeup routine. */ unsigned long acpi_wakeup_address = 0; -unsigned long acpi_video_flags; +unsigned long acpi_realmode_flags; extern char wakeup_start, wakeup_end; extern unsigned long FASTCALL(acpi_copy_wakeup_routine(unsigned long)); @@ -68,9 +68,11 @@ static int __init acpi_sleep_setup(char *str) { while ((str != NULL) && (*str != '\0')) { if (strncmp(str, "s3_bios", 7) == 0) - acpi_video_flags = 1; + acpi_realmode_flags |= 1; if (strncmp(str, "s3_mode", 7) == 0) - acpi_video_flags |= 2; + acpi_realmode_flags |= 2; + if (strncmp(str, "s3_beep", 7) == 0) + acpi_realmode_flags |= 4; str = strchr(str, ','); if (str != NULL) str += strspn(str, ", \t"); @@ -80,9 +82,11 @@ static int __init acpi_sleep_setup(char *str) __setup("acpi_sleep=", acpi_sleep_setup); +/* Ouch, we want to delete this. We already have better version in userspace, in + s2ram from suspend.sf.net project */ static __init int reset_videomode_after_s3(struct dmi_system_id *d) { - acpi_video_flags |= 2; + acpi_realmode_flags |= 2; return 0; } diff --git a/arch/i386/kernel/acpi/wakeup.S b/arch/i386/kernel/acpi/wakeup.S index b4e2ec3c392..ed0a0f2c159 100644 --- a/arch/i386/kernel/acpi/wakeup.S +++ b/arch/i386/kernel/acpi/wakeup.S @@ -47,7 +47,7 @@ wakeup_code: movw %ax, %ds # Make ds:0 point to wakeup_start movw %ax, %ss - testl $1, beep_flags - wakeup_code + testl $4, realmode_flags - wakeup_code jz 1f BEEP 1: @@ -61,7 +61,7 @@ wakeup_code: cmpl $0x12345678, %eax jne bogus_real_magic - testl $1, video_flags - wakeup_code + testl $1, realmode_flags - wakeup_code jz 1f lcall $0xc000,$3 movw %cs, %ax @@ -69,7 +69,7 @@ wakeup_code: movw %ax, %ss 1: - testl $2, video_flags - wakeup_code + testl $2, realmode_flags - wakeup_code jz 1f mov video_mode - wakeup_code, %ax call mode_set @@ -108,11 +108,11 @@ wakeup_code: cmpl $0x12345678, %eax jne bogus_real_magic - testl $2, beep_flags - wakeup_code + testl $8, realmode_flags - wakeup_code jz 1f BEEP 1: - ljmpl $__KERNEL_CS,$wakeup_pmode_return + ljmpl $__KERNEL_CS, $wakeup_pmode_return real_save_gdt: .word 0 .long 0 @@ -121,7 +121,7 @@ real_save_cr3: .long 0 real_save_cr4: .long 0 real_magic: .long 0 video_mode: .long 0 -video_flags: .long 0 +realmode_flags: .long 0 beep_flags: .long 0 real_efer_save_restore: .long 0 real_save_efer_edx: .long 0 @@ -285,10 +285,8 @@ ENTRY(acpi_copy_wakeup_routine) movl saved_videomode, %edx movl %edx, video_mode - wakeup_start (%eax) - movl acpi_video_flags, %edx - movl %edx, video_flags - wakeup_start (%eax) - movl s2ram_beep, %edx - movl %edx, beep_flags - wakeup_start (%eax) + movl acpi_realmode_flags, %edx + movl %edx, realmode_flags - wakeup_start (%eax) movl $0x12345678, real_magic - wakeup_start (%eax) movl $0x12345678, saved_magic popl %ebx diff --git a/arch/x86_64/kernel/acpi/sleep.c b/arch/x86_64/kernel/acpi/sleep.c index 195b7034a14..4277f2b27e6 100644 --- a/arch/x86_64/kernel/acpi/sleep.c +++ b/arch/x86_64/kernel/acpi/sleep.c @@ -55,7 +55,7 @@ /* address in low memory of the wakeup routine. */ unsigned long acpi_wakeup_address = 0; -unsigned long acpi_video_flags; +unsigned long acpi_realmode_flags; extern char wakeup_start, wakeup_end; extern unsigned long acpi_copy_wakeup_routine(unsigned long); @@ -103,9 +103,11 @@ static int __init acpi_sleep_setup(char *str) { while ((str != NULL) && (*str != '\0')) { if (strncmp(str, "s3_bios", 7) == 0) - acpi_video_flags = 1; + acpi_realmode_flags |= 1; if (strncmp(str, "s3_mode", 7) == 0) - acpi_video_flags |= 2; + acpi_realmode_flags |= 2; + if (strncmp(str, "s3_beep", 7) == 0) + acpi_realmode_flags |= 4; str = strchr(str, ','); if (str != NULL) str += strspn(str, ", \t"); diff --git a/arch/x86_64/kernel/acpi/wakeup.S b/arch/x86_64/kernel/acpi/wakeup.S index ed63d184579..13f1480cbec 100644 --- a/arch/x86_64/kernel/acpi/wakeup.S +++ b/arch/x86_64/kernel/acpi/wakeup.S @@ -50,7 +50,7 @@ wakeup_code: movw %ax, %ss # Data segment must be set up before we can see whether to beep. - testl $1, beep_flags - wakeup_code + testl $4, realmode_flags - wakeup_code jz 1f BEEP 1: @@ -70,7 +70,7 @@ wakeup_code: testl %eax, %eax jnz no_longmode - testl $1, video_flags - wakeup_code + testl $1, realmode_flags - wakeup_code jz 1f lcall $0xc000,$3 movw %cs, %ax @@ -78,7 +78,7 @@ wakeup_code: movw %ax, %ss 1: - testl $2, video_flags - wakeup_code + testl $2, realmode_flags - wakeup_code jz 1f mov video_mode - wakeup_code, %ax call mode_seta @@ -251,9 +251,8 @@ gdt_48a: .long gdta - wakeup_code # gdt base (relocated in later) real_magic: .quad 0 -beep_flags: .quad 0 video_mode: .quad 0 -video_flags: .quad 0 +realmode_flags: .quad 0 .code16 bogus_real_magic: @@ -367,12 +366,10 @@ ENTRY(acpi_copy_wakeup_routine) pushq %rax pushq %rdx - movl s2ram_beep, %edx - movl %edx, beep_flags - wakeup_start (,%rdi) movl saved_video_mode, %edx movl %edx, video_mode - wakeup_start (,%rdi) - movl acpi_video_flags, %edx - movl %edx, video_flags - wakeup_start (,%rdi) + movl acpi_realmode_flags, %edx + movl %edx, realmode_flags - wakeup_start (,%rdi) movq $0x12345678, real_magic - wakeup_start (,%rdi) movq $0x123456789abcdef0, %rdx movq %rdx, saved_magic diff --git a/include/linux/acpi.h b/include/linux/acpi.h index c0ccdd72036..dc234c508a6 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -122,8 +122,7 @@ extern struct acpi_mcfg_allocation *pci_mmcfg_config; extern int pci_mmcfg_config_num; extern int sbf_port; -extern unsigned long acpi_video_flags; -extern unsigned long s2ram_beep; +extern unsigned long acpi_realmode_flags; #else /* !CONFIG_ACPI */ diff --git a/kernel/power/main.c b/kernel/power/main.c index c74a56436d8..32147b57c3b 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -332,27 +332,6 @@ static ssize_t state_store(struct kset *kset, const char *buf, size_t n) power_attr(state); -unsigned long s2ram_beep = 0; - -static ssize_t s2ram_beep_show(struct kset *kset, char *buf) -{ - return sprintf(buf, "%d\n", s2ram_beep); -} - -static ssize_t -s2ram_beep_store(struct kset *kset, const char *buf, size_t n) -{ - int val; - - if (sscanf(buf, "%d", &val) > 0) { - s2ram_beep = val; - return n; - } - return -EINVAL; -} - -power_attr(s2ram_beep); - #ifdef CONFIG_PM_TRACE int pm_trace_enabled; @@ -378,13 +357,11 @@ power_attr(pm_trace); static struct attribute * g[] = { &state_attr.attr, &pm_trace_attr.attr, - &s2ram_beep_attr.attr, NULL, }; #else static struct attribute * g[] = { &state_attr.attr, - &s2ram_beep_attr.attr, NULL, }; #endif /* CONFIG_PM_TRACE */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 44a1d699aad..3ed4912bf18 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -660,7 +660,7 @@ static ctl_table kern_table[] = { { .ctl_name = KERN_ACPI_VIDEO_FLAGS, .procname = "acpi_video_flags", - .data = &acpi_video_flags, + .data = &acpi_realmode_flags, .maxlen = sizeof (unsigned long), .mode = 0644, .proc_handler = &proc_doulongvec_minmax, -- cgit v1.2.3-70-g09d2 From bdf4c48af20a3b0f01671799ace345e3d49576da Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Jul 2007 01:48:15 -0700 Subject: audit: rework execve audit The purpose of audit_bprm() is to log the argv array to a userspace daemon at the end of the execve system call. Since user-space hasn't had time to run, this array is still in pristine state on the process' stack; so no need to copy it, we can just grab it from there. In order to minimize the damage to audit_log_*() copy each string into a temporary kernel buffer first. Currently the audit code requires that the full argument vector fits in a single packet. So currently it does clip the argv size to a (sysctl) limit, but only when execve auditing is enabled. If the audit protocol gets extended to allow for multiple packets this check can be removed. Signed-off-by: Peter Zijlstra Signed-off-by: Ollie Wild Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 7 ++++ fs/exec.c | 3 ++ include/linux/binfmts.h | 1 + kernel/auditsc.c | 84 ++++++++++++++++++++++++++++---------- kernel/sysctl.c | 11 +++++ 5 files changed, 85 insertions(+), 21 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index ebffdffb3d9..72e247ef6fa 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -1065,6 +1065,13 @@ check the amount of free space (value is in seconds). Default settings are: 4, resume it if we have a value of 3 or more percent; consider information about the amount of free space valid for 30 seconds +audit_argv_kb +------------- + +The file contains a single value denoting the limit on the argv array size +for execve (in KiB). This limit is only applied when system call auditing for +execve is enabled, otherwise the value is ignored. + ctrl-alt-del ------------ diff --git a/fs/exec.c b/fs/exec.c index f20561ff452..2e3f7950c18 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1154,6 +1154,7 @@ int do_execve(char * filename, { struct linux_binprm *bprm; struct file *file; + unsigned long env_p; int retval; int i; @@ -1208,9 +1209,11 @@ int do_execve(char * filename, if (retval < 0) goto out; + env_p = bprm->p; retval = copy_strings(bprm->argc, argv, bprm); if (retval < 0) goto out; + bprm->argv_len = env_p - bprm->p; retval = search_binary_handler(bprm,regs); if (retval >= 0) { diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index e1a708337be..a0b209cd576 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -40,6 +40,7 @@ struct linux_binprm{ unsigned interp_flags; unsigned interp_data; unsigned long loader, exec; + unsigned long argv_len; }; #define BINPRM_FLAGS_ENFORCE_NONDUMP_BIT 0 diff --git a/kernel/auditsc.c b/kernel/auditsc.c index b7640a5f382..535586fc498 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -153,7 +153,7 @@ struct audit_aux_data_execve { struct audit_aux_data d; int argc; int envc; - char mem[0]; + struct mm_struct *mm; }; struct audit_aux_data_socketcall { @@ -831,6 +831,55 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid, return rc; } +static void audit_log_execve_info(struct audit_buffer *ab, + struct audit_aux_data_execve *axi) +{ + int i; + long len, ret; + const char __user *p = (const char __user *)axi->mm->arg_start; + char *buf; + + if (axi->mm != current->mm) + return; /* execve failed, no additional info */ + + for (i = 0; i < axi->argc; i++, p += len) { + len = strnlen_user(p, MAX_ARG_PAGES*PAGE_SIZE); + /* + * We just created this mm, if we can't find the strings + * we just copied into it something is _very_ wrong. Similar + * for strings that are too long, we should not have created + * any. + */ + if (!len || len > MAX_ARG_STRLEN) { + WARN_ON(1); + send_sig(SIGKILL, current, 0); + } + + buf = kmalloc(len, GFP_KERNEL); + if (!buf) { + audit_panic("out of memory for argv string\n"); + break; + } + + ret = copy_from_user(buf, p, len); + /* + * There is no reason for this copy to be short. We just + * copied them here, and the mm hasn't been exposed to user- + * space yet. + */ + if (!ret) { + WARN_ON(1); + send_sig(SIGKILL, current, 0); + } + + audit_log_format(ab, "a%d=", i); + audit_log_untrustedstring(ab, buf); + audit_log_format(ab, "\n"); + + kfree(buf); + } +} + static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) { int i, call_panic = 0; @@ -971,13 +1020,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts case AUDIT_EXECVE: { struct audit_aux_data_execve *axi = (void *)aux; - int i; - const char *p; - for (i = 0, p = axi->mem; i < axi->argc; i++) { - audit_log_format(ab, "a%d=", i); - p = audit_log_untrustedstring(ab, p); - audit_log_format(ab, "\n"); - } + audit_log_execve_info(ab, axi); break; } case AUDIT_SOCKETCALL: { @@ -1821,32 +1864,31 @@ int __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode return 0; } +int audit_argv_kb = 32; + int audit_bprm(struct linux_binprm *bprm) { struct audit_aux_data_execve *ax; struct audit_context *context = current->audit_context; - unsigned long p, next; - void *to; if (likely(!audit_enabled || !context || context->dummy)) return 0; - ax = kmalloc(sizeof(*ax) + PAGE_SIZE * MAX_ARG_PAGES - bprm->p, - GFP_KERNEL); + /* + * Even though the stack code doesn't limit the arg+env size any more, + * the audit code requires that _all_ arguments be logged in a single + * netlink skb. Hence cap it :-( + */ + if (bprm->argv_len > (audit_argv_kb << 10)) + return -E2BIG; + + ax = kmalloc(sizeof(*ax), GFP_KERNEL); if (!ax) return -ENOMEM; ax->argc = bprm->argc; ax->envc = bprm->envc; - for (p = bprm->p, to = ax->mem; p < MAX_ARG_PAGES*PAGE_SIZE; p = next) { - struct page *page = bprm->page[p / PAGE_SIZE]; - void *kaddr = kmap(page); - next = (p + PAGE_SIZE) & ~(PAGE_SIZE - 1); - memcpy(to, kaddr + (p & (PAGE_SIZE - 1)), next - p); - to += next - p; - kunmap(page); - } - + ax->mm = bprm->mm; ax->d.type = AUDIT_EXECVE; ax->d.next = context->aux; context->aux = (void *)ax; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 3ed4912bf18..8db41764e2a 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -78,6 +78,7 @@ extern int percpu_pagelist_fraction; extern int compat_log; extern int maps_protect; extern int sysctl_stat_interval; +extern int audit_argv_kb; /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ static int maxolduid = 65535; @@ -306,6 +307,16 @@ static ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, +#ifdef CONFIG_AUDITSYSCALL + { + .ctl_name = CTL_UNNUMBERED, + .procname = "audit_argv_kb", + .data = &audit_argv_kb, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif { .ctl_name = KERN_CORE_PATTERN, .procname = "core_pattern", -- cgit v1.2.3-70-g09d2 From 76fdbb25f963de5dc1e308325f0578a2f92b1c2d Mon Sep 17 00:00:00 2001 From: "Kawai, Hidehiro" Date: Thu, 19 Jul 2007 01:48:26 -0700 Subject: coredump masking: bound suid_dumpable sysctl This patch series is version 5 of the core dump masking feature, which controls which VMAs should be dumped based on their memory types and per-process flags. I adopted most of Andrew's suggestion at the previous version. He also suggested using system call instead of /proc// interface, I decided to use the latter continuously because adding new system call with pid argument will give a big impact on the kernel. You can access the per-process flags via /proc//coredump_filter interface. coredump_filter represents a bitmask of memory types, and if a bit is set, VMAs of corresponding memory type are written into a core file when the process is dumped. The bitmask is inherited from the parent process when a process is created. The original purpose is to avoid longtime system slowdown when a number of processes which share a huge shared memory are dumped at the same time. To achieve this purpose, this patch series adds an ability to suppress dumping anonymous shared memory for specified processes. In this version, three other memory types are also supported. Here are the coredump_filter bits: bit 0: anonymous private memory bit 1: anonymous shared memory bit 2: file-backed private memory bit 3: file-backed shared memory The default value of coredump_filter is 0x3. This means the new core dump routine has the same behavior as conventional behavior by default. In this version, coredump_filter bits and mm.dumpable are merged into mm.flags, and it is accessed by atomic bitops. The supported core file formats are ELF and ELF-FDPIC. ELF has been tested, but ELF-FDPIC has not been built and tested because I don't have the test environment. This patch limits a value of suid_dumpable sysctl to the range of 0 to 2. Signed-off-by: Hidehiro Kawai Cc: Alan Cox Cc: David Howells Cc: Hugh Dickins Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel/sysctl.c') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8db41764e2a..2aaa3f98185 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -733,6 +733,7 @@ static ctl_table kern_table[] = { /* Constants for minimum and maximum testing in vm_table. We use these as one-element integer vectors. */ static int zero; +static int two = 2; static int one_hundred = 100; @@ -1123,7 +1124,10 @@ static ctl_table fs_table[] = { .data = &lease_break_time, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero, + .extra2 = &two, }, { .ctl_name = FS_AIO_NR, -- cgit v1.2.3-70-g09d2 From f20786ff4da51e56b1956acf30be2552be266746 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Jul 2007 01:48:56 -0700 Subject: lockstat: core infrastructure Introduce the core lock statistics code. Lock statistics provides lock wait-time and hold-time (as well as the count of corresponding contention and acquisitions events). Also, the first few call-sites that encounter contention are tracked. Lock wait-time is the time spent waiting on the lock. This provides insight into the locking scheme, that is, a heavily contended lock is indicative of a too coarse locking scheme. Lock hold-time is the duration the lock was held, this provides a reference for the wait-time numbers, so they can be put into perspective. 1) lock 2) ... do stuff .. unlock 3) The time between 1 and 2 is the wait-time. The time between 2 and 3 is the hold-time. The lockdep held-lock tracking code is reused, because it already collects locks into meaningful groups (classes), and because it is an existing infrastructure for lock instrumentation. Currently lockdep tracks lock acquisition with two hooks: lock() lock_acquire() _lock() ... code protected by lock ... unlock() lock_release() _unlock() We need to extend this with two more hooks, in order to measure contention. lock_contended() - used to measure contention events lock_acquired() - completion of the contention These are then placed the following way: lock() lock_acquire() if (!_try_lock()) lock_contended() _lock() lock_acquired() ... do locked stuff ... unlock() lock_release() _unlock() (Note: the try_lock() 'trick' is used to avoid instrumenting all platform dependent lock primitive implementations.) It is also possible to toggle the two lockdep features at runtime using: /proc/sys/kernel/prove_locking /proc/sys/kernel/lock_stat (esp. turning off the O(n^2) prove_locking functionaliy can help) [akpm@linux-foundation.org: build fixes] [akpm@linux-foundation.org: nuke unneeded ifdefs] Signed-off-by: Peter Zijlstra Acked-by: Ingo Molnar Acked-by: Jason Baron Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/lockdep.h | 53 +++++++++++ kernel/lockdep.c | 247 ++++++++++++++++++++++++++++++++++++++++++++++++ kernel/sysctl.c | 22 +++++ lib/Kconfig.debug | 11 +++ 4 files changed, 333 insertions(+) (limited to 'kernel/sysctl.c') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 14c937d345c..8f946f614f8 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -9,6 +9,7 @@ #define __LINUX_LOCKDEP_H struct task_struct; +struct lockdep_map; #ifdef CONFIG_LOCKDEP @@ -114,8 +115,32 @@ struct lock_class { const char *name; int name_version; + +#ifdef CONFIG_LOCK_STAT + unsigned long contention_point[4]; +#endif +}; + +#ifdef CONFIG_LOCK_STAT +struct lock_time { + s64 min; + s64 max; + s64 total; + unsigned long nr; }; +struct lock_class_stats { + unsigned long contention_point[4]; + struct lock_time read_waittime; + struct lock_time write_waittime; + struct lock_time read_holdtime; + struct lock_time write_holdtime; +}; + +struct lock_class_stats lock_stats(struct lock_class *class); +void clear_lock_stats(struct lock_class *class); +#endif + /* * Map the lock object (the lock instance) to the lock-class object. * This is embedded into specific lock instances: @@ -165,6 +190,10 @@ struct held_lock { unsigned long acquire_ip; struct lockdep_map *instance; +#ifdef CONFIG_LOCK_STAT + u64 waittime_stamp; + u64 holdtime_stamp; +#endif /* * The lock-stack is unified in that the lock chains of interrupt * contexts nest ontop of process context chains, but we 'separate' @@ -281,6 +310,30 @@ struct lock_class_key { }; #endif /* !LOCKDEP */ +#ifdef CONFIG_LOCK_STAT + +extern void lock_contended(struct lockdep_map *lock, unsigned long ip); +extern void lock_acquired(struct lockdep_map *lock); + +#define LOCK_CONTENDED(_lock, try, lock) \ +do { \ + if (!try(_lock)) { \ + lock_contended(&(_lock)->dep_map, _RET_IP_); \ + lock(_lock); \ + lock_acquired(&(_lock)->dep_map); \ + } \ +} while (0) + +#else /* CONFIG_LOCK_STAT */ + +#define lock_contended(lockdep_map, ip) do {} while (0) +#define lock_acquired(lockdep_map) do {} while (0) + +#define LOCK_CONTENDED(_lock, try, lock) \ + lock(_lock) + +#endif /* CONFIG_LOCK_STAT */ + #if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_GENERIC_HARDIRQS) extern void early_init_irq_lock_class(void); #else diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 87ac3642507..70ca4db28af 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -42,6 +42,20 @@ #include "lockdep_internals.h" +#ifdef CONFIG_PROVE_LOCKING +int prove_locking = 1; +module_param(prove_locking, int, 0644); +#else +#define prove_locking 0 +#endif + +#ifdef CONFIG_LOCK_STAT +int lock_stat = 1; +module_param(lock_stat, int, 0644); +#else +#define lock_stat 0 +#endif + /* * lockdep_lock: protects the lockdep graph, the hashes and the * class/list/hash allocators. @@ -104,6 +118,70 @@ static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES]; unsigned long nr_lock_classes; static struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; +#ifdef CONFIG_LOCK_STAT +static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats); + +static int lock_contention_point(struct lock_class *class, unsigned long ip) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(class->contention_point); i++) { + if (class->contention_point[i] == 0) { + class->contention_point[i] = ip; + break; + } + if (class->contention_point[i] == ip) + break; + } + + return i; +} + +static void lock_time_inc(struct lock_time *lt, s64 time) +{ + if (time > lt->max) + lt->max = time; + + if (time < lt->min || !lt->min) + lt->min = time; + + lt->total += time; + lt->nr++; +} + +static struct lock_class_stats *get_lock_stats(struct lock_class *class) +{ + return &get_cpu_var(lock_stats)[class - lock_classes]; +} + +static void put_lock_stats(struct lock_class_stats *stats) +{ + put_cpu_var(lock_stats); +} + +static void lock_release_holdtime(struct held_lock *hlock) +{ + struct lock_class_stats *stats; + s64 holdtime; + + if (!lock_stat) + return; + + holdtime = sched_clock() - hlock->holdtime_stamp; + + stats = get_lock_stats(hlock->class); + if (hlock->read) + lock_time_inc(&stats->read_holdtime, holdtime); + else + lock_time_inc(&stats->write_holdtime, holdtime); + put_lock_stats(stats); +} +#else +static inline void lock_release_holdtime(struct held_lock *hlock) +{ +} +#endif + /* * We keep a global list of all lock classes. The list only grows, * never shrinks. The list is only accessed with the lockdep @@ -2221,6 +2299,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, int chain_head = 0; u64 chain_key; + if (!prove_locking) + check = 1; + if (unlikely(!debug_locks)) return 0; @@ -2271,6 +2352,10 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, hlock->read = read; hlock->check = check; hlock->hardirqs_off = hardirqs_off; +#ifdef CONFIG_LOCK_STAT + hlock->waittime_stamp = 0; + hlock->holdtime_stamp = sched_clock(); +#endif if (check == 2 && !mark_irqflags(curr, hlock)) return 0; @@ -2411,6 +2496,8 @@ lock_release_non_nested(struct task_struct *curr, return print_unlock_inbalance_bug(curr, lock, ip); found_it: + lock_release_holdtime(hlock); + /* * We have the right lock to unlock, 'hlock' points to it. * Now we remove it from the stack, and add back the other @@ -2463,6 +2550,8 @@ static int lock_release_nested(struct task_struct *curr, curr->curr_chain_key = hlock->prev_chain_key; + lock_release_holdtime(hlock); + #ifdef CONFIG_DEBUG_LOCKDEP hlock->prev_chain_key = 0; hlock->class = NULL; @@ -2537,6 +2626,9 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, { unsigned long flags; + if (unlikely(!lock_stat && !prove_locking)) + return; + if (unlikely(current->lockdep_recursion)) return; @@ -2556,6 +2648,9 @@ void lock_release(struct lockdep_map *lock, int nested, unsigned long ip) { unsigned long flags; + if (unlikely(!lock_stat && !prove_locking)) + return; + if (unlikely(current->lockdep_recursion)) return; @@ -2569,6 +2664,158 @@ void lock_release(struct lockdep_map *lock, int nested, unsigned long ip) EXPORT_SYMBOL_GPL(lock_release); +#ifdef CONFIG_LOCK_STAT +static int +print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock, + unsigned long ip) +{ + if (!debug_locks_off()) + return 0; + if (debug_locks_silent) + return 0; + + printk("\n=================================\n"); + printk( "[ BUG: bad contention detected! ]\n"); + printk( "---------------------------------\n"); + printk("%s/%d is trying to contend lock (", + curr->comm, curr->pid); + print_lockdep_cache(lock); + printk(") at:\n"); + print_ip_sym(ip); + printk("but there are no locks held!\n"); + printk("\nother info that might help us debug this:\n"); + lockdep_print_held_locks(curr); + + printk("\nstack backtrace:\n"); + dump_stack(); + + return 0; +} + +static void +__lock_contended(struct lockdep_map *lock, unsigned long ip) +{ + struct task_struct *curr = current; + struct held_lock *hlock, *prev_hlock; + struct lock_class_stats *stats; + unsigned int depth; + int i, point; + + depth = curr->lockdep_depth; + if (DEBUG_LOCKS_WARN_ON(!depth)) + return; + + prev_hlock = NULL; + for (i = depth-1; i >= 0; i--) { + hlock = curr->held_locks + i; + /* + * We must not cross into another context: + */ + if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) + break; + if (hlock->instance == lock) + goto found_it; + prev_hlock = hlock; + } + print_lock_contention_bug(curr, lock, ip); + return; + +found_it: + hlock->waittime_stamp = sched_clock(); + + point = lock_contention_point(hlock->class, ip); + + stats = get_lock_stats(hlock->class); + if (point < ARRAY_SIZE(stats->contention_point)) + stats->contention_point[i]++; + put_lock_stats(stats); +} + +static void +__lock_acquired(struct lockdep_map *lock) +{ + struct task_struct *curr = current; + struct held_lock *hlock, *prev_hlock; + struct lock_class_stats *stats; + unsigned int depth; + u64 now; + s64 waittime; + int i; + + depth = curr->lockdep_depth; + if (DEBUG_LOCKS_WARN_ON(!depth)) + return; + + prev_hlock = NULL; + for (i = depth-1; i >= 0; i--) { + hlock = curr->held_locks + i; + /* + * We must not cross into another context: + */ + if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) + break; + if (hlock->instance == lock) + goto found_it; + prev_hlock = hlock; + } + print_lock_contention_bug(curr, lock, _RET_IP_); + return; + +found_it: + if (!hlock->waittime_stamp) + return; + + now = sched_clock(); + waittime = now - hlock->waittime_stamp; + hlock->holdtime_stamp = now; + + stats = get_lock_stats(hlock->class); + if (hlock->read) + lock_time_inc(&stats->read_waittime, waittime); + else + lock_time_inc(&stats->write_waittime, waittime); + put_lock_stats(stats); +} + +void lock_contended(struct lockdep_map *lock, unsigned long ip) +{ + unsigned long flags; + + if (unlikely(!lock_stat)) + return; + + if (unlikely(current->lockdep_recursion)) + return; + + raw_local_irq_save(flags); + check_flags(flags); + current->lockdep_recursion = 1; + __lock_contended(lock, ip); + current->lockdep_recursion = 0; + raw_local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(lock_contended); + +void lock_acquired(struct lockdep_map *lock) +{ + unsigned long flags; + + if (unlikely(!lock_stat)) + return; + + if (unlikely(current->lockdep_recursion)) + return; + + raw_local_irq_save(flags); + check_flags(flags); + current->lockdep_recursion = 1; + __lock_acquired(lock); + current->lockdep_recursion = 0; + raw_local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(lock_acquired); +#endif + /* * Used by the testsuite, sanitize the validator state * after a simulated failure: diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 2aaa3f98185..e69179b1809 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -161,6 +161,8 @@ extern ctl_table inotify_table[]; int sysctl_legacy_va_layout; #endif +extern int prove_locking; +extern int lock_stat; /* The default sysctl tables: */ @@ -282,6 +284,26 @@ static ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, +#ifdef CONFIG_PROVE_LOCKING + { + .ctl_name = CTL_UNNUMBERED, + .procname = "prove_locking", + .data = &prove_locking, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif +#ifdef CONFIG_LOCK_STAT + { + .ctl_name = CTL_UNNUMBERED, + .procname = "lock_stat", + .data = &lock_stat, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif { .ctl_name = CTL_UNNUMBERED, .procname = "sched_features", diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 640844024ff..f3e0c2abcbd 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -283,6 +283,17 @@ config LOCKDEP select KALLSYMS select KALLSYMS_ALL +config LOCK_STAT + bool "Lock usage statisitics" + depends on DEBUG_KERNEL && TRACE_IRQFLAGS_SUPPORT && STACKTRACE_SUPPORT && LOCKDEP_SUPPORT + select LOCKDEP + select DEBUG_SPINLOCK + select DEBUG_MUTEXES + select DEBUG_LOCK_ALLOC + default n + help + This feature enables tracking lock contention points + config DEBUG_LOCKDEP bool "Lock dependency engine debugging" depends on DEBUG_KERNEL && LOCKDEP -- cgit v1.2.3-70-g09d2 From ed2c12f323e8fafbc94f9bcfb924f9df36e64dc7 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 19 Jul 2007 01:50:35 -0700 Subject: kernel/sysctl.c: finish off the warning comments I've been chasing these comments around this file all week. Hopefully we're straight now. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel/sysctl.c') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index e69179b1809..222299844ad 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -748,7 +748,10 @@ static ctl_table kern_table[] = { .proc_handler = &proc_dostring, .strategy = &sysctl_string, }, - +/* + * NOTE: do not add new entries to this table unless you have read + * Documentation/sysctl/ctl_unnumbered.txt + */ { .ctl_name = 0 } }; -- cgit v1.2.3-70-g09d2 From abd4f7505bafdd6c5319fe3cb5caf9af6104e17a Mon Sep 17 00:00:00 2001 From: Masoud Asgharifard Sharbiani Date: Sun, 22 Jul 2007 11:12:28 +0200 Subject: x86: i386-show-unhandled-signals-v3 This patch makes the i386 behave the same way that x86_64 does when a segfault happens. A line gets printed to the kernel log so that tools that need to check for failures can behave more uniformly between debug.show_unhandled_signals sysctl variable to 0 (or by doing echo 0 > /proc/sys/debug/exception-trace) Also, all of the lines being printed are now using printk_ratelimit() to deny the ability of DoS from a local user with a program like the following: main() { while (1) if (!fork()) *(int *)0 = 0; } This new revision also includes the fix that Andrew did which got rid of new sysctl that was added to the system in earlier versions of this. Also, 'show-unhandled-signals' sysctl has been renamed back to the old 'exception-trace' to avoid breakage of people's scripts. AK: Enabling by default for i386 will be likely controversal, but let's see what happens AK: Really folks, before complaining just fix your segfaults AK: I bet this will find a lot of silent issues Signed-off-by: Masoud Sharbiani Signed-off-by: Andi Kleen [ Personally, I've found the complaints useful on x86-64, so I'm all for this. That said, I wonder if we could do it more prettily.. -Linus ] Signed-off-by: Linus Torvalds --- arch/i386/kernel/signal.c | 7 +++++++ arch/i386/kernel/traps.c | 7 +++++++ arch/i386/mm/fault.c | 10 ++++++++++ arch/x86_64/kernel/signal.c | 2 +- arch/x86_64/kernel/traps.c | 6 ++++-- arch/x86_64/mm/fault.c | 15 +++------------ arch/x86_64/mm/init.c | 33 --------------------------------- include/asm-x86_64/proto.h | 2 -- include/linux/signal.h | 3 +++ kernel/signal.c | 10 ++++++++++ kernel/sysctl.c | 10 ++++++++++ 11 files changed, 55 insertions(+), 50 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/arch/i386/kernel/signal.c b/arch/i386/kernel/signal.c index d574e38f0f7..f5dd85656c1 100644 --- a/arch/i386/kernel/signal.c +++ b/arch/i386/kernel/signal.c @@ -199,6 +199,13 @@ asmlinkage int sys_sigreturn(unsigned long __unused) return eax; badframe: + if (show_unhandled_signals && printk_ratelimit()) + printk("%s%s[%d] bad frame in sigreturn frame:%p eip:%lx" + " esp:%lx oeax:%lx\n", + current->pid > 1 ? KERN_INFO : KERN_EMERG, + current->comm, current->pid, frame, regs->eip, + regs->esp, regs->orig_eax); + force_sig(SIGSEGV, current); return 0; } diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index 57772a18c39..438949da3b6 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -618,6 +618,13 @@ fastcall void __kprobes do_general_protection(struct pt_regs * regs, current->thread.error_code = error_code; current->thread.trap_no = 13; + if (show_unhandled_signals && unhandled_signal(current, SIGSEGV) && + printk_ratelimit()) + printk(KERN_INFO + "%s[%d] general protection eip:%lx esp:%lx error:%lx\n", + current->comm, current->pid, + regs->eip, regs->esp, error_code); + force_sig(SIGSEGV, current); return; diff --git a/arch/i386/mm/fault.c b/arch/i386/mm/fault.c index e92a1012493..01ffdd4964f 100644 --- a/arch/i386/mm/fault.c +++ b/arch/i386/mm/fault.c @@ -283,6 +283,8 @@ static inline int vmalloc_fault(unsigned long address) return 0; } +int show_unhandled_signals = 1; + /* * This routine handles page faults. It determines the address, * and the problem, and then passes it off to one of the appropriate @@ -469,6 +471,14 @@ bad_area_nosemaphore: if (is_prefetch(regs, address, error_code)) return; + if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && + printk_ratelimit()) { + printk("%s%s[%d]: segfault at %08lx eip %08lx " + "esp %08lx error %lx\n", + tsk->pid > 1 ? KERN_INFO : KERN_EMERG, + tsk->comm, tsk->pid, address, regs->eip, + regs->esp, error_code); + } tsk->thread.cr2 = address; /* Kernel addresses are always protection faults */ tsk->thread.error_code = error_code | (address >= TASK_SIZE); diff --git a/arch/x86_64/kernel/signal.c b/arch/x86_64/kernel/signal.c index 4886afcd628..739175b01e0 100644 --- a/arch/x86_64/kernel/signal.c +++ b/arch/x86_64/kernel/signal.c @@ -487,7 +487,7 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) void signal_fault(struct pt_regs *regs, void __user *frame, char *where) { struct task_struct *me = current; - if (exception_trace) + if (show_unhandled_signals && printk_ratelimit()) printk("%s[%d] bad frame in %s frame:%p rip:%lx rsp:%lx orax:%lx\n", me->comm,me->pid,where,frame,regs->rip,regs->rsp,regs->orig_rax); diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index 8713ad4a4db..03888420775 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -584,7 +584,8 @@ static void __kprobes do_trap(int trapnr, int signr, char *str, tsk->thread.error_code = error_code; tsk->thread.trap_no = trapnr; - if (exception_trace && unhandled_signal(tsk, signr)) + if (show_unhandled_signals && unhandled_signal(tsk, signr) && + printk_ratelimit()) printk(KERN_INFO "%s[%d] trap %s rip:%lx rsp:%lx error:%lx\n", tsk->comm, tsk->pid, str, @@ -688,7 +689,8 @@ asmlinkage void __kprobes do_general_protection(struct pt_regs * regs, tsk->thread.error_code = error_code; tsk->thread.trap_no = 13; - if (exception_trace && unhandled_signal(tsk, SIGSEGV)) + if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && + printk_ratelimit()) printk(KERN_INFO "%s[%d] general protection rip:%lx rsp:%lx error:%lx\n", tsk->comm, tsk->pid, diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c index 2074bddd4f0..5e9ac70c135 100644 --- a/arch/x86_64/mm/fault.c +++ b/arch/x86_64/mm/fault.c @@ -221,16 +221,6 @@ static int is_errata93(struct pt_regs *regs, unsigned long address) return 0; } -int unhandled_signal(struct task_struct *tsk, int sig) -{ - if (is_init(tsk)) - return 1; - if (tsk->ptrace & PT_PTRACED) - return 0; - return (tsk->sighand->action[sig-1].sa.sa_handler == SIG_IGN) || - (tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL); -} - static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs, unsigned long error_code) { @@ -302,7 +292,7 @@ static int vmalloc_fault(unsigned long address) } static int page_fault_trace; -int exception_trace = 1; +int show_unhandled_signals = 1; /* * This routine handles page faults. It determines the address, @@ -494,7 +484,8 @@ bad_area_nosemaphore: (address >> 32)) return; - if (exception_trace && unhandled_signal(tsk, SIGSEGV)) { + if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && + printk_ratelimit()) { printk( "%s%s[%d]: segfault at %016lx rip %016lx rsp %016lx error %lx\n", tsk->pid > 1 ? KERN_INFO : KERN_EMERG, diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c index 381c2ecd407..88678e82e23 100644 --- a/arch/x86_64/mm/init.c +++ b/arch/x86_64/mm/init.c @@ -697,39 +697,6 @@ int kern_addr_valid(unsigned long addr) return pfn_valid(pte_pfn(*pte)); } -#ifdef CONFIG_SYSCTL -#include - -static ctl_table debug_table2[] = { - { - .ctl_name = 99, - .procname = "exception-trace", - .data = &exception_trace, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - {} -}; - -static ctl_table debug_root_table2[] = { - { - .ctl_name = CTL_DEBUG, - .procname = "debug", - .mode = 0555, - .child = debug_table2 - }, - {} -}; - -static __init int x8664_sysctl_init(void) -{ - register_sysctl_table(debug_root_table2); - return 0; -} -__initcall(x8664_sysctl_init); -#endif - /* A pseudo VMA to allow ptrace access for the vsyscall page. This only covers the 64bit vsyscall page now. 32bit has a real VMA now and does not need special handling anymore. */ diff --git a/include/asm-x86_64/proto.h b/include/asm-x86_64/proto.h index d6e3225549c..31f20ad6587 100644 --- a/include/asm-x86_64/proto.h +++ b/include/asm-x86_64/proto.h @@ -75,8 +75,6 @@ extern void setup_node_bootmem(int nodeid, unsigned long start, unsigned long en extern void early_quirks(void); extern void check_efer(void); -extern int unhandled_signal(struct task_struct *tsk, int sig); - extern void select_idle_routine(const struct cpuinfo_x86 *c); extern unsigned long table_start, table_end; diff --git a/include/linux/signal.h b/include/linux/signal.h index ea91abe740d..0ae33886624 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -237,12 +237,15 @@ extern int group_send_sig_info(int sig, struct siginfo *info, struct task_struct extern int __group_send_sig_info(int, struct siginfo *, struct task_struct *); extern long do_sigpending(void __user *, unsigned long); extern int sigprocmask(int, sigset_t *, sigset_t *); +extern int show_unhandled_signals; struct pt_regs; extern int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, struct pt_regs *regs, void *cookie); extern struct kmem_cache *sighand_cachep; +int unhandled_signal(struct task_struct *tsk, int sig); + /* * In POSIX a signal is sent either to a specific thread (Linux task) * or to the process as a whole (Linux thread group). How the signal diff --git a/kernel/signal.c b/kernel/signal.c index 39d122753ba..ef8156a6aad 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -255,6 +255,16 @@ flush_signal_handlers(struct task_struct *t, int force_default) } } +int unhandled_signal(struct task_struct *tsk, int sig) +{ + if (is_init(tsk)) + return 1; + if (tsk->ptrace & PT_PTRACED) + return 0; + return (tsk->sighand->action[sig-1].sa.sa_handler == SIG_IGN) || + (tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL); +} + /* Notify the system that a driver wants to block all signals for this * process, and wants to be notified if any signals at all were to be diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 222299844ad..ddebf3f2aff 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1203,6 +1203,16 @@ static ctl_table fs_table[] = { }; static ctl_table debug_table[] = { +#ifdef CONFIG_X86 + { + .ctl_name = CTL_UNNUMBERED, + .procname = "exception-trace", + .data = &show_unhandled_signals, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, +#endif { .ctl_name = 0 } }; -- cgit v1.2.3-70-g09d2 From e8b2fd01228f690c3e0cb3f14facfa8d93d4adae Mon Sep 17 00:00:00 2001 From: Len Brown Date: Tue, 24 Jul 2007 22:26:33 -0400 Subject: ACPI: Kconfig: remove CONFIG_ACPI_SLEEP from source As it was a synonym for (CONFIG_ACPI && CONFIG_X86), the ifdefs for it were more clutter than they were worth. For ia64, just add a few stubs in anticipation of future S3 or S4 support. Signed-off-by: Len Brown --- arch/i386/kernel/acpi/Makefile | 2 +- arch/i386/kernel/setup.c | 2 +- arch/i386/mm/init.c | 2 +- arch/ia64/kernel/acpi.c | 19 +++++++++++++++++++ arch/x86_64/kernel/acpi/Makefile | 2 +- arch/x86_64/kernel/acpi/sleep.c | 4 ---- arch/x86_64/kernel/head.S | 2 +- arch/x86_64/kernel/setup.c | 2 +- drivers/acpi/Kconfig | 10 +++------- drivers/acpi/sleep/Makefile | 4 ++-- drivers/acpi/sleep/main.c | 2 ++ drivers/acpi/sleep/poweroff.c | 2 -- drivers/acpi/sleep/wakeup.c | 2 -- include/acpi/acpi_drivers.h | 4 ---- include/asm-i386/acpi.h | 23 +++++++++-------------- include/asm-i386/suspend.h | 2 +- include/asm-ia64/acpi.h | 5 +++++ include/asm-x86_64/acpi.h | 22 +++++++++------------- include/asm-x86_64/suspend.h | 2 -- kernel/sysctl.c | 2 +- 20 files changed, 57 insertions(+), 58 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/arch/i386/kernel/acpi/Makefile b/arch/i386/kernel/acpi/Makefile index 7f7be01f44e..223f58fc9f4 100644 --- a/arch/i386/kernel/acpi/Makefile +++ b/arch/i386/kernel/acpi/Makefile @@ -2,7 +2,7 @@ obj-$(CONFIG_ACPI) += boot.o ifneq ($(CONFIG_PCI),) obj-$(CONFIG_X86_IO_APIC) += earlyquirk.o endif -obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup.o +obj-$(CONFIG_ACPI) += sleep.o wakeup.o ifneq ($(CONFIG_ACPI_PROCESSOR),) obj-y += cstate.o processor.o diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c index d474cd639bc..7fe5da3c932 100644 --- a/arch/i386/kernel/setup.c +++ b/arch/i386/kernel/setup.c @@ -422,7 +422,7 @@ void __init setup_bootmem_allocator(void) */ reserve_bootmem(PAGE_SIZE, PAGE_SIZE); #endif -#ifdef CONFIG_ACPI_SLEEP +#ifdef CONFIG_ACPI /* * Reserve low memory region for sleep support. */ diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c index c3b9905af2d..1b1a1e66d09 100644 --- a/arch/i386/mm/init.c +++ b/arch/i386/mm/init.c @@ -432,7 +432,7 @@ static void __init pagetable_init (void) paravirt_pagetable_setup_done(pgd_base); } -#if defined(CONFIG_SOFTWARE_SUSPEND) || defined(CONFIG_ACPI_SLEEP) +#if defined(CONFIG_SOFTWARE_SUSPEND) || defined(CONFIG_ACPI) /* * Swap suspend & friends need this for resume because things like the intel-agp * driver might have split up a kernel 4MB mapping. diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c index 103dd8edda7..c6ede8780de 100644 --- a/arch/ia64/kernel/acpi.c +++ b/arch/ia64/kernel/acpi.c @@ -67,6 +67,8 @@ EXPORT_SYMBOL(pm_power_off); unsigned int acpi_cpei_override; unsigned int acpi_cpei_phys_cpuid; +unsigned long acpi_wakeup_address = 0; + const char __init * acpi_get_sysname(void) { @@ -986,4 +988,21 @@ int acpi_unregister_ioapic(acpi_handle handle, u32 gsi_base) EXPORT_SYMBOL(acpi_unregister_ioapic); +/* + * acpi_save_state_mem() - save kernel state + * + * TBD when when IA64 starts to support suspend... + */ +int acpi_save_state_mem(void) { return 0; } + +/* + * acpi_restore_state() + */ +void acpi_restore_state_mem(void) {} + +/* + * do_suspend_lowlevel() + */ +void do_suspend_lowlevel(void) {} + #endif /* CONFIG_ACPI */ diff --git a/arch/x86_64/kernel/acpi/Makefile b/arch/x86_64/kernel/acpi/Makefile index 080b9963f1b..17595d23fee 100644 --- a/arch/x86_64/kernel/acpi/Makefile +++ b/arch/x86_64/kernel/acpi/Makefile @@ -1,6 +1,6 @@ obj-y := boot.o boot-y := ../../../i386/kernel/acpi/boot.o -obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup.o +obj-y += sleep.o wakeup.o ifneq ($(CONFIG_ACPI_PROCESSOR),) obj-y += processor.o diff --git a/arch/x86_64/kernel/acpi/sleep.c b/arch/x86_64/kernel/acpi/sleep.c index 4277f2b27e6..79475d23707 100644 --- a/arch/x86_64/kernel/acpi/sleep.c +++ b/arch/x86_64/kernel/acpi/sleep.c @@ -51,8 +51,6 @@ Low-Level Sleep Support -------------------------------------------------------------------------- */ -#ifdef CONFIG_ACPI_SLEEP - /* address in low memory of the wakeup routine. */ unsigned long acpi_wakeup_address = 0; unsigned long acpi_realmode_flags; @@ -117,8 +115,6 @@ static int __init acpi_sleep_setup(char *str) __setup("acpi_sleep=", acpi_sleep_setup); -#endif /*CONFIG_ACPI_SLEEP */ - void acpi_pci_link_exit(void) { } diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S index e89abcdbdde..3a16e417dd8 100644 --- a/arch/x86_64/kernel/head.S +++ b/arch/x86_64/kernel/head.S @@ -120,7 +120,7 @@ ident_complete: addq %rbp, trampoline_level4_pgt + 0(%rip) addq %rbp, trampoline_level4_pgt + (511*8)(%rip) #endif -#ifdef CONFIG_ACPI_SLEEP +#ifdef CONFIG_ACPI addq %rbp, wakeup_level4_pgt + 0(%rip) addq %rbp, wakeup_level4_pgt + (511*8)(%rip) #endif diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index af838f6b0b7..0f400f3c469 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -333,7 +333,7 @@ void __init setup_arch(char **cmdline_p) reserve_bootmem_generic(SMP_TRAMPOLINE_BASE, 2*PAGE_SIZE); #endif -#ifdef CONFIG_ACPI_SLEEP +#ifdef CONFIG_ACPI /* * Reserve low memory region for sleep support. */ diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig index 524cbf151fc..251344cb29a 100644 --- a/drivers/acpi/Kconfig +++ b/drivers/acpi/Kconfig @@ -11,6 +11,9 @@ menuconfig ACPI depends on PCI depends on PM select PNP + # for sleep + select HOTPLUG_CPU if X86 && SMP + select SUSPEND_SMP if X86 && SMP default y ---help--- Advanced Configuration and Power Interface (ACPI) support for @@ -42,13 +45,6 @@ menuconfig ACPI if ACPI -config ACPI_SLEEP - bool - depends on X86 - select HOTPLUG_CPU if SMP - select SUSPEND_SMP if SMP - default y - config ACPI_PROCFS bool "Deprecated /proc/acpi files" depends on PROC_FS diff --git a/drivers/acpi/sleep/Makefile b/drivers/acpi/sleep/Makefile index 195a4f69c0f..01a993a1d08 100644 --- a/drivers/acpi/sleep/Makefile +++ b/drivers/acpi/sleep/Makefile @@ -1,5 +1,5 @@ obj-y := poweroff.o wakeup.o -obj-$(CONFIG_ACPI_SLEEP) += main.o -obj-$(CONFIG_ACPI_SLEEP) += proc.o +obj-y += main.o +obj-$(CONFIG_X86) += proc.o EXTRA_CFLAGS += $(ACPI_CFLAGS) diff --git a/drivers/acpi/sleep/main.c b/drivers/acpi/sleep/main.c index 3279e72a94f..54c2dfcf865 100644 --- a/drivers/acpi/sleep/main.c +++ b/drivers/acpi/sleep/main.c @@ -136,10 +136,12 @@ static int acpi_pm_finish(suspend_state_t pm_state) /* reset firmware waking vector */ acpi_set_firmware_waking_vector((acpi_physical_address) 0); +#ifdef CONFIG_X86 if (init_8259A_after_S1) { printk("Broken toshiba laptop -> kicking interrupts\n"); init_8259A(0); } +#endif return 0; } diff --git a/drivers/acpi/sleep/poweroff.c b/drivers/acpi/sleep/poweroff.c index 39e40d56b03..b3f68ef0669 100644 --- a/drivers/acpi/sleep/poweroff.c +++ b/drivers/acpi/sleep/poweroff.c @@ -18,7 +18,6 @@ int acpi_sleep_prepare(u32 acpi_state) { -#ifdef CONFIG_ACPI_SLEEP /* do we have a wakeup address for S2 and S3? */ if (acpi_state == ACPI_STATE_S3) { if (!acpi_wakeup_address) { @@ -31,7 +30,6 @@ int acpi_sleep_prepare(u32 acpi_state) } ACPI_FLUSH_CPU_CACHE(); acpi_enable_wakeup_device_prep(acpi_state); -#endif acpi_gpe_sleep_prepare(acpi_state); acpi_enter_sleep_state_prep(acpi_state); return 0; diff --git a/drivers/acpi/sleep/wakeup.c b/drivers/acpi/sleep/wakeup.c index fab8f2694f0..97c27ddb144 100644 --- a/drivers/acpi/sleep/wakeup.c +++ b/drivers/acpi/sleep/wakeup.c @@ -17,7 +17,6 @@ ACPI_MODULE_NAME("wakeup_devices") extern struct list_head acpi_wakeup_device_list; extern spinlock_t acpi_device_lock; -#ifdef CONFIG_ACPI_SLEEP /** * acpi_enable_wakeup_device_prep - prepare wakeup devices * @sleep_state: ACPI state @@ -180,7 +179,6 @@ static int __init acpi_wakeup_device_init(void) } late_initcall(acpi_wakeup_device_init); -#endif /* * Disable all wakeup GPEs before entering requested sleep state. diff --git a/include/acpi/acpi_drivers.h b/include/acpi/acpi_drivers.h index 553515912c0..07b5d76b92c 100644 --- a/include/acpi/acpi_drivers.h +++ b/include/acpi/acpi_drivers.h @@ -142,10 +142,6 @@ static inline void unregister_hotplug_dock_device(acpi_handle handle) /*-------------------------------------------------------------------------- Suspend/Resume -------------------------------------------------------------------------- */ -#ifdef CONFIG_ACPI_SLEEP extern int acpi_sleep_init(void); -#else -#define acpi_sleep_init() do {} while (0) -#endif #endif /*__ACPI_DRIVERS_H__*/ diff --git a/include/asm-i386/acpi.h b/include/asm-i386/acpi.h index 449f3f272e0..125179adf04 100644 --- a/include/asm-i386/acpi.h +++ b/include/asm-i386/acpi.h @@ -121,19 +121,6 @@ static inline void acpi_disable_pci(void) } extern int acpi_irq_balance_set(char *str); -#else /* !CONFIG_ACPI */ - -#define acpi_lapic 0 -#define acpi_ioapic 0 -static inline void acpi_noirq_set(void) { } -static inline void acpi_disable_pci(void) { } -static inline void disable_acpi(void) { } - -#endif /* !CONFIG_ACPI */ - - -#ifdef CONFIG_ACPI_SLEEP - /* routines for saving/restoring kernel state */ extern int acpi_save_state_mem(void); extern void acpi_restore_state_mem(void); @@ -143,7 +130,15 @@ extern unsigned long acpi_wakeup_address; /* early initialization routine */ extern void acpi_reserve_bootmem(void); -#endif /*CONFIG_ACPI_SLEEP*/ +#else /* !CONFIG_ACPI */ + +#define acpi_lapic 0 +#define acpi_ioapic 0 +static inline void acpi_noirq_set(void) { } +static inline void acpi_disable_pci(void) { } +static inline void disable_acpi(void) { } + +#endif /* !CONFIG_ACPI */ #define ARCH_HAS_POWER_INIT 1 diff --git a/include/asm-i386/suspend.h b/include/asm-i386/suspend.h index 8dbaafe611f..a2520732ffd 100644 --- a/include/asm-i386/suspend.h +++ b/include/asm-i386/suspend.h @@ -21,7 +21,7 @@ struct saved_context { unsigned long return_address; } __attribute__((packed)); -#ifdef CONFIG_ACPI_SLEEP +#ifdef CONFIG_ACPI extern unsigned long saved_eip; extern unsigned long saved_esp; extern unsigned long saved_ebp; diff --git a/include/asm-ia64/acpi.h b/include/asm-ia64/acpi.h index 5b526357d17..49730ffbbae 100644 --- a/include/asm-ia64/acpi.h +++ b/include/asm-ia64/acpi.h @@ -100,6 +100,11 @@ const char *acpi_get_sysname (void); int acpi_request_vector (u32 int_type); int acpi_gsi_to_irq (u32 gsi, unsigned int *irq); +/* routines for saving/restoring kernel state */ +extern int acpi_save_state_mem(void); +extern void acpi_restore_state_mem(void); +extern unsigned long acpi_wakeup_address; + /* * Record the cpei override flag and current logical cpu. This is * useful for CPU removal. diff --git a/include/asm-x86_64/acpi.h b/include/asm-x86_64/acpi.h index 1da8f49c0fe..98173357dd8 100644 --- a/include/asm-x86_64/acpi.h +++ b/include/asm-x86_64/acpi.h @@ -108,6 +108,15 @@ static inline void acpi_disable_pci(void) } extern int acpi_irq_balance_set(char *str); +/* routines for saving/restoring kernel state */ +extern int acpi_save_state_mem(void); +extern void acpi_restore_state_mem(void); + +extern unsigned long acpi_wakeup_address; + +/* early initialization routine */ +extern void acpi_reserve_bootmem(void); + #else /* !CONFIG_ACPI */ #define acpi_lapic 0 @@ -121,19 +130,6 @@ extern int acpi_numa; extern int acpi_scan_nodes(unsigned long start, unsigned long end); #define NR_NODE_MEMBLKS (MAX_NUMNODES*2) -#ifdef CONFIG_ACPI_SLEEP - -/* routines for saving/restoring kernel state */ -extern int acpi_save_state_mem(void); -extern void acpi_restore_state_mem(void); - -extern unsigned long acpi_wakeup_address; - -/* early initialization routine */ -extern void acpi_reserve_bootmem(void); - -#endif /*CONFIG_ACPI_SLEEP*/ - extern int acpi_disabled; extern int acpi_pci_disabled; diff --git a/include/asm-x86_64/suspend.h b/include/asm-x86_64/suspend.h index 9c3f8de90d2..b897e8cb55f 100644 --- a/include/asm-x86_64/suspend.h +++ b/include/asm-x86_64/suspend.h @@ -44,7 +44,6 @@ extern unsigned long saved_context_eflags; extern void fix_processor_context(void); -#ifdef CONFIG_ACPI_SLEEP extern unsigned long saved_rip; extern unsigned long saved_rsp; extern unsigned long saved_rbp; @@ -54,4 +53,3 @@ extern unsigned long saved_rdi; /* routines for saving/restoring kernel state */ extern int acpi_save_state_mem(void); -#endif diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ddebf3f2aff..eb26f2ba51e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -689,7 +689,7 @@ static ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif -#ifdef CONFIG_ACPI_SLEEP +#if defined(CONFIG_ACPI) && defined(CONFIG_X86) { .ctl_name = KERN_ACPI_VIDEO_FLAGS, .procname = "acpi_video_flags", -- cgit v1.2.3-70-g09d2 From 673d5b43daa00b42759cecc6b0760b8bf6be80d2 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Sat, 28 Jul 2007 03:33:16 -0400 Subject: ACPI: restore CONFIG_ACPI_SLEEP Restore the 2.6.22 CONFIG_ACPI_SLEEP build option, but now shadowing the new CONFIG_PM_SLEEP option. Signed-off-by: Len Brown [ Modified to work with the PM config setup changes. ] Signed-off-by: Linus Torvalds --- arch/i386/kernel/acpi/Makefile | 2 +- arch/i386/kernel/setup.c | 2 +- arch/x86_64/kernel/acpi/Makefile | 2 +- arch/x86_64/kernel/head.S | 2 +- arch/x86_64/kernel/setup.c | 2 +- drivers/acpi/Kconfig | 5 +++++ drivers/acpi/sleep/Makefile | 4 ++-- drivers/acpi/sleep/poweroff.c | 2 ++ drivers/acpi/sleep/proc.c | 2 +- drivers/pci/pci-acpi.c | 4 ++++ drivers/pnp/pnpacpi/core.c | 4 ++++ include/acpi/acpi_drivers.h | 2 +- kernel/sysctl.c | 2 +- 13 files changed, 25 insertions(+), 10 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/arch/i386/kernel/acpi/Makefile b/arch/i386/kernel/acpi/Makefile index 223f58fc9f4..7f7be01f44e 100644 --- a/arch/i386/kernel/acpi/Makefile +++ b/arch/i386/kernel/acpi/Makefile @@ -2,7 +2,7 @@ obj-$(CONFIG_ACPI) += boot.o ifneq ($(CONFIG_PCI),) obj-$(CONFIG_X86_IO_APIC) += earlyquirk.o endif -obj-$(CONFIG_ACPI) += sleep.o wakeup.o +obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup.o ifneq ($(CONFIG_ACPI_PROCESSOR),) obj-y += cstate.o processor.o diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c index 7fe5da3c932..d474cd639bc 100644 --- a/arch/i386/kernel/setup.c +++ b/arch/i386/kernel/setup.c @@ -422,7 +422,7 @@ void __init setup_bootmem_allocator(void) */ reserve_bootmem(PAGE_SIZE, PAGE_SIZE); #endif -#ifdef CONFIG_ACPI +#ifdef CONFIG_ACPI_SLEEP /* * Reserve low memory region for sleep support. */ diff --git a/arch/x86_64/kernel/acpi/Makefile b/arch/x86_64/kernel/acpi/Makefile index 17595d23fee..080b9963f1b 100644 --- a/arch/x86_64/kernel/acpi/Makefile +++ b/arch/x86_64/kernel/acpi/Makefile @@ -1,6 +1,6 @@ obj-y := boot.o boot-y := ../../../i386/kernel/acpi/boot.o -obj-y += sleep.o wakeup.o +obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup.o ifneq ($(CONFIG_ACPI_PROCESSOR),) obj-y += processor.o diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S index 3a16e417dd8..e89abcdbdde 100644 --- a/arch/x86_64/kernel/head.S +++ b/arch/x86_64/kernel/head.S @@ -120,7 +120,7 @@ ident_complete: addq %rbp, trampoline_level4_pgt + 0(%rip) addq %rbp, trampoline_level4_pgt + (511*8)(%rip) #endif -#ifdef CONFIG_ACPI +#ifdef CONFIG_ACPI_SLEEP addq %rbp, wakeup_level4_pgt + 0(%rip) addq %rbp, wakeup_level4_pgt + (511*8)(%rip) #endif diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 0f400f3c469..af838f6b0b7 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -333,7 +333,7 @@ void __init setup_arch(char **cmdline_p) reserve_bootmem_generic(SMP_TRAMPOLINE_BASE, 2*PAGE_SIZE); #endif -#ifdef CONFIG_ACPI +#ifdef CONFIG_ACPI_SLEEP /* * Reserve low memory region for sleep support. */ diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig index 66e78d52a09..934d639b368 100644 --- a/drivers/acpi/Kconfig +++ b/drivers/acpi/Kconfig @@ -42,6 +42,11 @@ menuconfig ACPI if ACPI +config ACPI_SLEEP + bool + depends on PM_SLEEP + default y + config ACPI_PROCFS bool "Deprecated /proc/acpi files" depends on PROC_FS diff --git a/drivers/acpi/sleep/Makefile b/drivers/acpi/sleep/Makefile index 2bec897ab1e..195a4f69c0f 100644 --- a/drivers/acpi/sleep/Makefile +++ b/drivers/acpi/sleep/Makefile @@ -1,5 +1,5 @@ obj-y := poweroff.o wakeup.o -obj-$(CONFIG_PM_SLEEP) += main.o -obj-$(CONFIG_X86) += proc.o +obj-$(CONFIG_ACPI_SLEEP) += main.o +obj-$(CONFIG_ACPI_SLEEP) += proc.o EXTRA_CFLAGS += $(ACPI_CFLAGS) diff --git a/drivers/acpi/sleep/poweroff.c b/drivers/acpi/sleep/poweroff.c index b3f68ef0669..39e40d56b03 100644 --- a/drivers/acpi/sleep/poweroff.c +++ b/drivers/acpi/sleep/poweroff.c @@ -18,6 +18,7 @@ int acpi_sleep_prepare(u32 acpi_state) { +#ifdef CONFIG_ACPI_SLEEP /* do we have a wakeup address for S2 and S3? */ if (acpi_state == ACPI_STATE_S3) { if (!acpi_wakeup_address) { @@ -30,6 +31,7 @@ int acpi_sleep_prepare(u32 acpi_state) } ACPI_FLUSH_CPU_CACHE(); acpi_enable_wakeup_device_prep(acpi_state); +#endif acpi_gpe_sleep_prepare(acpi_state); acpi_enter_sleep_state_prep(acpi_state); return 0; diff --git a/drivers/acpi/sleep/proc.c b/drivers/acpi/sleep/proc.c index 5dfe8b78963..66b62b0d360 100644 --- a/drivers/acpi/sleep/proc.c +++ b/drivers/acpi/sleep/proc.c @@ -78,7 +78,7 @@ acpi_system_write_sleep(struct file *file, } #endif /* CONFIG_ACPI_PROCFS_SLEEP */ -#if defined(CONFIG_RTC_DRV_CMOS) || defined(CONFIG_RTC_DRV_CMOS_MODULE) +#if defined(CONFIG_RTC_DRV_CMOS) || defined(CONFIG_RTC_DRV_CMOS_MODULE) || !defined(CONFIG_X86) /* use /sys/class/rtc/rtcX/wakealarm instead; it's not ACPI-specific */ #else #define HAVE_ACPI_LEGACY_ALARM diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c index 67c63d1f158..5c6a5d04300 100644 --- a/drivers/pci/pci-acpi.c +++ b/drivers/pci/pci-acpi.c @@ -220,6 +220,7 @@ acpi_status pci_osc_control_set(acpi_handle handle, u32 flags) } EXPORT_SYMBOL(pci_osc_control_set); +#ifdef CONFIG_ACPI_SLEEP /* * _SxD returns the D-state with the highest power * (lowest D-state number) supported in the S-state "x". @@ -267,6 +268,7 @@ static pci_power_t acpi_pci_choose_state(struct pci_dev *pdev, } return PCI_POWER_ERROR; } +#endif static int acpi_pci_set_power_state(struct pci_dev *dev, pci_power_t state) { @@ -340,7 +342,9 @@ static int __init acpi_pci_init(void) ret = register_acpi_bus_type(&acpi_pci_bus); if (ret) return 0; +#ifdef CONFIG_ACPI_SLEEP platform_pci_choose_state = acpi_pci_choose_state; +#endif platform_pci_set_power_state = acpi_pci_set_power_state; return 0; } diff --git a/drivers/pnp/pnpacpi/core.c b/drivers/pnp/pnpacpi/core.c index 6a2a3c2f4d5..616fc72190b 100644 --- a/drivers/pnp/pnpacpi/core.c +++ b/drivers/pnp/pnpacpi/core.c @@ -127,6 +127,7 @@ static int pnpacpi_disable_resources(struct pnp_dev *dev) return ACPI_FAILURE(status) ? -ENODEV : 0; } +#ifdef CONFIG_ACPI_SLEEP static int pnpacpi_suspend(struct pnp_dev *dev, pm_message_t state) { return acpi_bus_set_power((acpi_handle) dev->data, @@ -140,14 +141,17 @@ static int pnpacpi_resume(struct pnp_dev *dev) { return acpi_bus_set_power((acpi_handle) dev->data, ACPI_STATE_D0); } +#endif static struct pnp_protocol pnpacpi_protocol = { .name = "Plug and Play ACPI", .get = pnpacpi_get_resources, .set = pnpacpi_set_resources, .disable = pnpacpi_disable_resources, +#ifdef CONFIG_ACPI_SLEEP .suspend = pnpacpi_suspend, .resume = pnpacpi_resume, +#endif }; static int __init pnpacpi_add_device(struct acpi_device *device) diff --git a/include/acpi/acpi_drivers.h b/include/acpi/acpi_drivers.h index 777d37ae81a..202acb9ff4d 100644 --- a/include/acpi/acpi_drivers.h +++ b/include/acpi/acpi_drivers.h @@ -147,7 +147,7 @@ static inline void unregister_hotplug_dock_device(acpi_handle handle) /*-------------------------------------------------------------------------- Suspend/Resume -------------------------------------------------------------------------- */ -#ifdef CONFIG_PM_SLEEP +#ifdef CONFIG_ACPI_SLEEP extern int acpi_sleep_init(void); #else static inline int acpi_sleep_init(void) { return 0; } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index eb26f2ba51e..79c891e6266 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -689,7 +689,7 @@ static ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif -#if defined(CONFIG_ACPI) && defined(CONFIG_X86) +#if defined(CONFIG_ACPI_SLEEP) && defined(CONFIG_X86) { .ctl_name = KERN_ACPI_VIDEO_FLAGS, .procname = "acpi_video_flags", -- cgit v1.2.3-70-g09d2 From 8daec965e7035bbf8d364fe7585bffac7222b87a Mon Sep 17 00:00:00 2001 From: Lee Schermerhorn Date: Fri, 10 Aug 2007 13:00:51 -0700 Subject: Fix missing numa_zonelist_order sysctl Misplaced #endif is hiding the numa_zonelist_order sysctl when !SECURITY. Signed-off-by: Lee Schermerhorn Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sysctl.c') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 79c891e6266..8bdb8c07e04 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1023,6 +1023,7 @@ static ctl_table vm_table[] = { .mode = 0644, .proc_handler = &proc_doulongvec_minmax, }, +#endif #ifdef CONFIG_NUMA { .ctl_name = CTL_UNNUMBERED, @@ -1034,7 +1035,6 @@ static ctl_table vm_table[] = { .strategy = &sysctl_string, }, #endif -#endif #if defined(CONFIG_X86_32) || \ (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL)) { -- cgit v1.2.3-70-g09d2 From e598fbaabdb6608915cbc5e80409d70f4f857e5c Mon Sep 17 00:00:00 2001 From: Christian Heim Date: Sun, 19 Aug 2007 13:07:59 +0200 Subject: Remove double inclusion of linux/capability.h Remove the second inclusion of linux/capability.h, which has been introduced with "[PATCH] move capable() to capability.h" (commit c59ede7b78db329949d9cdcd7064e22d357560ef) Signed-off-by: Christian Heim Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel/sysctl.c') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8bdb8c07e04..9029690f4fa 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -27,7 +27,6 @@ #include #include #include -#include #include #include #include -- cgit v1.2.3-70-g09d2 From 1fc84aaae3bae9646dd4c7798b8c0ff934338909 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 25 Aug 2007 18:41:52 +0200 Subject: sched: fix CONFIG_SCHED_DEBUG dependency of lockdep sysctls Make the lockdep sysctls not depend on CONFIG_SCHED_DEBUG. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sysctl.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 9029690f4fa..ea90ef51085 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -283,6 +283,15 @@ static ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "sched_features", + .data = &sysctl_sched_features, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif #ifdef CONFIG_PROVE_LOCKING { .ctl_name = CTL_UNNUMBERED, @@ -302,15 +311,6 @@ static ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, -#endif - { - .ctl_name = CTL_UNNUMBERED, - .procname = "sched_features", - .data = &sysctl_sched_features, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, #endif { .ctl_name = KERN_PANIC, -- cgit v1.2.3-70-g09d2 From 218050855ece4e923106ab614ac65afa0f618df3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 25 Aug 2007 18:41:53 +0200 Subject: sched: adaptive scheduler granularity Instead of specifying the preemption granularity, specify the wanted latency. By fixing the granlarity to a constany the wakeup latency it a function of the number of running tasks on the rq. Invert this relation. sysctl_sched_granularity becomes a minimum for the dynamic granularity computed from the new sysctl_sched_latency. Then use this latency to do more intelligent granularity decisions: if there are fewer tasks running then we can schedule coarser. This helps performance while still always keeping the latency target. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + kernel/sched.c | 14 ++++++---- kernel/sched_fair.c | 77 +++++++++++++++++++++++++++++++++++++++++++-------- kernel/sysctl.c | 11 ++++++++ 4 files changed, 86 insertions(+), 17 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/include/linux/sched.h b/include/linux/sched.h index ba78807eab9..322764e0405 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1399,6 +1399,7 @@ static inline void idle_task_exit(void) {} extern void sched_idle_next(void); +extern unsigned int sysctl_sched_latency; extern unsigned int sysctl_sched_granularity; extern unsigned int sysctl_sched_wakeup_granularity; extern unsigned int sysctl_sched_batch_wakeup_granularity; diff --git a/kernel/sched.c b/kernel/sched.c index 6798328a2e0..da26f46d50d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4911,14 +4911,18 @@ cpumask_t nohz_cpu_mask = CPU_MASK_NONE; static inline void sched_init_granularity(void) { unsigned int factor = 1 + ilog2(num_online_cpus()); - const unsigned long gran_limit = 100000000; + const unsigned long limit = 100000000; sysctl_sched_granularity *= factor; - if (sysctl_sched_granularity > gran_limit) - sysctl_sched_granularity = gran_limit; + if (sysctl_sched_granularity > limit) + sysctl_sched_granularity = limit; - sysctl_sched_runtime_limit = sysctl_sched_granularity * 5; - sysctl_sched_wakeup_granularity = sysctl_sched_granularity / 2; + sysctl_sched_latency *= factor; + if (sysctl_sched_latency > limit) + sysctl_sched_latency = limit; + + sysctl_sched_runtime_limit = sysctl_sched_latency * 5; + sysctl_sched_wakeup_granularity = sysctl_sched_latency / 2; } #ifdef CONFIG_SMP diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 4d6b7e2df2a..0ba1e60f08d 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -15,23 +15,32 @@ * * Scaled math optimizations by Thomas Gleixner * Copyright (C) 2007, Thomas Gleixner + * + * Adaptive scheduling granularity, math enhancements by Peter Zijlstra + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra */ /* - * Preemption granularity: - * (default: 10 msec, units: nanoseconds) + * Targeted preemption latency for CPU-bound tasks: + * (default: 20ms, units: nanoseconds) * - * NOTE: this granularity value is not the same as the concept of - * 'timeslice length' - timeslices in CFS will typically be somewhat - * larger than this value. (to see the precise effective timeslice - * length of your workload, run vmstat and monitor the context-switches - * field) + * NOTE: this latency value is not the same as the concept of + * 'timeslice length' - timeslices in CFS are of variable length. + * (to see the precise effective timeslice length of your workload, + * run vmstat and monitor the context-switches field) * * On SMP systems the value of this is multiplied by the log2 of the * number of CPUs. (i.e. factor 2x on 2-way systems, 3x on 4-way * systems, 4x on 8-way systems, 5x on 16-way systems, etc.) + * Targeted preemption latency for CPU-bound tasks: */ -unsigned int sysctl_sched_granularity __read_mostly = 10000000UL; +unsigned int sysctl_sched_latency __read_mostly = 20000000ULL; + +/* + * Minimal preemption granularity for CPU-bound tasks: + * (default: 2 msec, units: nanoseconds) + */ +unsigned int sysctl_sched_granularity __read_mostly = 2000000ULL; /* * SCHED_BATCH wake-up granularity. @@ -212,6 +221,49 @@ static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) * Scheduling class statistics methods: */ +/* + * Calculate the preemption granularity needed to schedule every + * runnable task once per sysctl_sched_latency amount of time. + * (down to a sensible low limit on granularity) + * + * For example, if there are 2 tasks running and latency is 10 msecs, + * we switch tasks every 5 msecs. If we have 3 tasks running, we have + * to switch tasks every 3.33 msecs to get a 10 msecs observed latency + * for each task. We do finer and finer scheduling up to until we + * reach the minimum granularity value. + * + * To achieve this we use the following dynamic-granularity rule: + * + * gran = lat/nr - lat/nr/nr + * + * This comes out of the following equations: + * + * kA1 + gran = kB1 + * kB2 + gran = kA2 + * kA2 = kA1 + * kB2 = kB1 - d + d/nr + * lat = d * nr + * + * Where 'k' is key, 'A' is task A (waiting), 'B' is task B (running), + * '1' is start of time, '2' is end of time, 'd' is delay between + * 1 and 2 (during which task B was running), 'nr' is number of tasks + * running, 'lat' is the the period of each task. ('lat' is the + * sched_latency that we aim for.) + */ +static long +sched_granularity(struct cfs_rq *cfs_rq) +{ + unsigned int gran = sysctl_sched_latency; + unsigned int nr = cfs_rq->nr_running; + + if (nr > 1) { + gran = gran/nr - gran/nr/nr; + gran = max(gran, sysctl_sched_granularity); + } + + return gran; +} + /* * We rescale the rescheduling granularity of tasks according to their * nice level, but only linearly, not exponentially: @@ -302,7 +354,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr) delta_fair = calc_delta_fair(delta_exec, lw); delta_mine = calc_delta_mine(delta_exec, curr->load.weight, lw); - if (cfs_rq->sleeper_bonus > sysctl_sched_granularity) { + if (cfs_rq->sleeper_bonus > sysctl_sched_latency) { delta = min((u64)delta_mine, cfs_rq->sleeper_bonus); delta = min(delta, (unsigned long)( (long)sysctl_sched_runtime_limit - curr->wait_runtime)); @@ -689,7 +741,8 @@ static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) if (next == curr) return; - __check_preempt_curr_fair(cfs_rq, next, curr, sysctl_sched_granularity); + __check_preempt_curr_fair(cfs_rq, next, curr, + sched_granularity(cfs_rq)); } /************************************************** @@ -1034,7 +1087,7 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) * it will preempt the parent: */ p->se.fair_key = current->se.fair_key - - niced_granularity(&rq->curr->se, sysctl_sched_granularity) - 1; + niced_granularity(&rq->curr->se, sched_granularity(cfs_rq)) - 1; /* * The first wait is dominated by the child-runs-first logic, * so do not credit it with that waiting time yet: @@ -1047,7 +1100,7 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) * -granularity/2, so initialize the task with that: */ if (sysctl_sched_features & SCHED_FEAT_START_DEBIT) - p->se.wait_runtime = -((long)sysctl_sched_granularity / 2); + p->se.wait_runtime = -(sched_granularity(cfs_rq) / 2); __enqueue_entity(cfs_rq, se); } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ea90ef51085..9e3d2960faf 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -231,6 +231,17 @@ static ctl_table kern_table[] = { .extra1 = &min_sched_granularity_ns, .extra2 = &max_sched_granularity_ns, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "sched_latency_ns", + .data = &sysctl_sched_latency, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_sched_granularity_ns, + .extra2 = &max_sched_granularity_ns, + }, { .ctl_name = CTL_UNNUMBERED, .procname = "sched_wakeup_granularity_ns", -- cgit v1.2.3-70-g09d2 From 172ac3dbb7d3e528ac53d08a34df88d1ac53c534 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 25 Aug 2007 18:41:53 +0200 Subject: sched: cleanup, sched_granularity -> sched_min_granularity due to adaptive granularity scheduling the role of sched_granularity has changed to "minimum granularity", so rename the variable (and the tunable) accordingly. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra --- include/linux/sched.h | 2 +- kernel/sched.c | 6 +++--- kernel/sched_fair.c | 4 ++-- kernel/sysctl.c | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/include/linux/sched.h b/include/linux/sched.h index 322764e0405..bd6a0320a77 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1400,7 +1400,7 @@ static inline void idle_task_exit(void) {} extern void sched_idle_next(void); extern unsigned int sysctl_sched_latency; -extern unsigned int sysctl_sched_granularity; +extern unsigned int sysctl_sched_min_granularity; extern unsigned int sysctl_sched_wakeup_granularity; extern unsigned int sysctl_sched_batch_wakeup_granularity; extern unsigned int sysctl_sched_stat_granularity; diff --git a/kernel/sched.c b/kernel/sched.c index da26f46d50d..a40ab657ad1 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4913,9 +4913,9 @@ static inline void sched_init_granularity(void) unsigned int factor = 1 + ilog2(num_online_cpus()); const unsigned long limit = 100000000; - sysctl_sched_granularity *= factor; - if (sysctl_sched_granularity > limit) - sysctl_sched_granularity = limit; + sysctl_sched_min_granularity *= factor; + if (sysctl_sched_min_granularity > limit) + sysctl_sched_min_granularity = limit; sysctl_sched_latency *= factor; if (sysctl_sched_latency > limit) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 0ba1e60f08d..ee3771850aa 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -40,7 +40,7 @@ unsigned int sysctl_sched_latency __read_mostly = 20000000ULL; * Minimal preemption granularity for CPU-bound tasks: * (default: 2 msec, units: nanoseconds) */ -unsigned int sysctl_sched_granularity __read_mostly = 2000000ULL; +unsigned int sysctl_sched_min_granularity __read_mostly = 2000000ULL; /* * SCHED_BATCH wake-up granularity. @@ -258,7 +258,7 @@ sched_granularity(struct cfs_rq *cfs_rq) if (nr > 1) { gran = gran/nr - gran/nr/nr; - gran = max(gran, sysctl_sched_granularity); + gran = max(gran, sysctl_sched_min_granularity); } return gran; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 9e3d2960faf..6ace893c17c 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -222,8 +222,8 @@ static ctl_table kern_table[] = { #ifdef CONFIG_SCHED_DEBUG { .ctl_name = CTL_UNNUMBERED, - .procname = "sched_granularity_ns", - .data = &sysctl_sched_granularity, + .procname = "sched_min_granularity_ns", + .data = &sysctl_sched_min_granularity, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = &proc_dointvec_minmax, -- cgit v1.2.3-70-g09d2 From 1799e35d5baab6e06168b46cc78b968e728ea3d1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 19 Sep 2007 23:34:46 +0200 Subject: sched: add /proc/sys/kernel/sched_compat_yield add /proc/sys/kernel/sched_compat_yield to make sys_sched_yield() more agressive, by moving the yielding task to the last position in the rbtree. with sched_compat_yield=0: PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND 2539 mingo 20 0 1576 252 204 R 50 0.0 0:02.03 loop_yield 2541 mingo 20 0 1576 244 196 R 50 0.0 0:02.05 loop with sched_compat_yield=1: PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND 2584 mingo 20 0 1576 248 196 R 99 0.0 0:52.45 loop 2582 mingo 20 0 1576 256 204 R 0 0.0 0:00.00 loop_yield Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra --- include/linux/sched.h | 1 + kernel/sched.c | 5 +--- kernel/sched_fair.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++----- kernel/sysctl.c | 8 +++++++ 4 files changed, 67 insertions(+), 10 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/include/linux/sched.h b/include/linux/sched.h index 5445eaec690..3de79016f2a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1406,6 +1406,7 @@ extern unsigned int sysctl_sched_wakeup_granularity; extern unsigned int sysctl_sched_batch_wakeup_granularity; extern unsigned int sysctl_sched_stat_granularity; extern unsigned int sysctl_sched_runtime_limit; +extern unsigned int sysctl_sched_compat_yield; extern unsigned int sysctl_sched_child_runs_first; extern unsigned int sysctl_sched_features; diff --git a/kernel/sched.c b/kernel/sched.c index deeb1f8e0c3..63e0971c8fb 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4550,10 +4550,7 @@ asmlinkage long sys_sched_yield(void) struct rq *rq = this_rq_lock(); schedstat_inc(rq, yld_cnt); - if (unlikely(rq->nr_running == 1)) - schedstat_inc(rq, yld_act_empty); - else - current->sched_class->yield_task(rq, current); + current->sched_class->yield_task(rq, current); /* * Since we are going to call schedule() anyway, there's diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 892616bf2c7..c9fbe8e73a4 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -42,6 +42,14 @@ unsigned int sysctl_sched_latency __read_mostly = 20000000ULL; */ unsigned int sysctl_sched_min_granularity __read_mostly = 2000000ULL; +/* + * sys_sched_yield() compat mode + * + * This option switches the agressive yield implementation of the + * old scheduler back on. + */ +unsigned int __read_mostly sysctl_sched_compat_yield; + /* * SCHED_BATCH wake-up granularity. * (default: 25 msec, units: nanoseconds) @@ -897,19 +905,62 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) } /* - * sched_yield() support is very simple - we dequeue and enqueue + * sched_yield() support is very simple - we dequeue and enqueue. + * + * If compat_yield is turned on then we requeue to the end of the tree. */ static void yield_task_fair(struct rq *rq, struct task_struct *p) { struct cfs_rq *cfs_rq = task_cfs_rq(p); + struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; + struct sched_entity *rightmost, *se = &p->se; + struct rb_node *parent; - __update_rq_clock(rq); /* - * Dequeue and enqueue the task to update its - * position within the tree: + * Are we the only task in the tree? + */ + if (unlikely(cfs_rq->nr_running == 1)) + return; + + if (likely(!sysctl_sched_compat_yield)) { + __update_rq_clock(rq); + /* + * Dequeue and enqueue the task to update its + * position within the tree: + */ + dequeue_entity(cfs_rq, &p->se, 0); + enqueue_entity(cfs_rq, &p->se, 0); + + return; + } + /* + * Find the rightmost entry in the rbtree: */ - dequeue_entity(cfs_rq, &p->se, 0); - enqueue_entity(cfs_rq, &p->se, 0); + do { + parent = *link; + link = &parent->rb_right; + } while (*link); + + rightmost = rb_entry(parent, struct sched_entity, run_node); + /* + * Already in the rightmost position? + */ + if (unlikely(rightmost == se)) + return; + + /* + * Minimally necessary key value to be last in the tree: + */ + se->fair_key = rightmost->fair_key + 1; + + if (cfs_rq->rb_leftmost == &se->run_node) + cfs_rq->rb_leftmost = rb_next(&se->run_node); + /* + * Relink the task to the rightmost position: + */ + rb_erase(&se->run_node, &cfs_rq->tasks_timeline); + rb_link_node(&se->run_node, parent, link); + rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline); } /* diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 6ace893c17c..53a456ebf6d 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -303,6 +303,14 @@ static ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif + { + .ctl_name = CTL_UNNUMBERED, + .procname = "sched_compat_yield", + .data = &sysctl_sched_compat_yield, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, #ifdef CONFIG_PROVE_LOCKING { .ctl_name = CTL_UNNUMBERED, -- cgit v1.2.3-70-g09d2 From d0c3d534a4388a465101b634a95f2ec586415254 Mon Sep 17 00:00:00 2001 From: Olof Johansson Date: Fri, 12 Oct 2007 10:20:07 +1000 Subject: [POWERPC] Implement logging of unhandled signals Implement show_unhandled_signals sysctl + support to print when a process is killed due to unhandled signals just as i386 and x86_64 does. Default to having it off, unlike x86 that defaults on. Signed-off-by: Olof Johansson Signed-off-by: Paul Mackerras --- arch/powerpc/kernel/signal.c | 6 ++++++ arch/powerpc/kernel/signal_32.c | 38 ++++++++++++++++++++++++++++++++++++++ arch/powerpc/kernel/signal_64.c | 15 +++++++++++++++ arch/powerpc/kernel/traps.c | 12 +++++++++++- kernel/sysctl.c | 2 +- 5 files changed, 71 insertions(+), 2 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c index c434d6c4e4e..a65a44fbe52 100644 --- a/arch/powerpc/kernel/signal.c +++ b/arch/powerpc/kernel/signal.c @@ -16,6 +16,12 @@ #include "signal.h" +/* Log an error when sending an unhandled signal to a process. Controlled + * through debug.exception-trace sysctl. + */ + +int show_unhandled_signals = 0; + /* * Allocate space for the signal frame */ diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c index 590057e9e98..6126bca8b70 100644 --- a/arch/powerpc/kernel/signal_32.c +++ b/arch/powerpc/kernel/signal_32.c @@ -705,11 +705,13 @@ int handle_rt_signal32(unsigned long sig, struct k_sigaction *ka, { struct rt_sigframe __user *rt_sf; struct mcontext __user *frame; + void __user *addr; unsigned long newsp = 0; /* Set up Signal Frame */ /* Put a Real Time Context onto stack */ rt_sf = get_sigframe(ka, regs, sizeof(*rt_sf)); + addr = rt_sf; if (unlikely(rt_sf == NULL)) goto badframe; @@ -728,6 +730,7 @@ int handle_rt_signal32(unsigned long sig, struct k_sigaction *ka, /* Save user registers on the stack */ frame = &rt_sf->uc.uc_mcontext; + addr = frame; if (vdso32_rt_sigtramp && current->mm->context.vdso_base) { if (save_user_regs(regs, frame, 0)) goto badframe; @@ -742,6 +745,7 @@ int handle_rt_signal32(unsigned long sig, struct k_sigaction *ka, /* create a stack frame for the caller of the handler */ newsp = ((unsigned long)rt_sf) - (__SIGNAL_FRAMESIZE + 16); + addr = (void __user *)regs->gpr[1]; if (put_user(regs->gpr[1], (u32 __user *)newsp)) goto badframe; @@ -762,6 +766,12 @@ badframe: printk("badframe in handle_rt_signal, regs=%p frame=%p newsp=%lx\n", regs, frame, newsp); #endif + if (show_unhandled_signals && printk_ratelimit()) + printk(KERN_INFO "%s[%d]: bad frame in handle_rt_signal32: " + "%p nip %08lx lr %08lx\n", + current->comm, current->pid, + addr, regs->nip, regs->link); + force_sigsegv(sig, current); return 0; } @@ -886,6 +896,12 @@ long sys_rt_sigreturn(int r3, int r4, int r5, int r6, int r7, int r8, return 0; bad: + if (show_unhandled_signals && printk_ratelimit()) + printk(KERN_INFO "%s[%d]: bad frame in sys_rt_sigreturn: " + "%p nip %08lx lr %08lx\n", + current->comm, current->pid, + rt_sf, regs->nip, regs->link); + force_sig(SIGSEGV, current); return 0; } @@ -967,6 +983,13 @@ int sys_debug_setcontext(struct ucontext __user *ctx, * We kill the task with a SIGSEGV in this situation. */ if (do_setcontext(ctx, regs, 1)) { + if (show_unhandled_signals && printk_ratelimit()) + printk(KERN_INFO "%s[%d]: bad frame in " + "sys_debug_setcontext: %p nip %08lx " + "lr %08lx\n", + current->comm, current->pid, + ctx, regs->nip, regs->link); + force_sig(SIGSEGV, current); goto out; } @@ -1048,6 +1071,12 @@ badframe: printk("badframe in handle_signal, regs=%p frame=%p newsp=%lx\n", regs, frame, newsp); #endif + if (show_unhandled_signals && printk_ratelimit()) + printk(KERN_INFO "%s[%d]: bad frame in handle_signal32: " + "%p nip %08lx lr %08lx\n", + current->comm, current->pid, + frame, regs->nip, regs->link); + force_sigsegv(sig, current); return 0; } @@ -1061,12 +1090,14 @@ long sys_sigreturn(int r3, int r4, int r5, int r6, int r7, int r8, struct sigcontext __user *sc; struct sigcontext sigctx; struct mcontext __user *sr; + void __user *addr; sigset_t set; /* Always make any pending restarted system calls return -EINTR */ current_thread_info()->restart_block.fn = do_no_restart_syscall; sc = (struct sigcontext __user *)(regs->gpr[1] + __SIGNAL_FRAMESIZE); + addr = sc; if (copy_from_user(&sigctx, sc, sizeof(sigctx))) goto badframe; @@ -1083,6 +1114,7 @@ long sys_sigreturn(int r3, int r4, int r5, int r6, int r7, int r8, restore_sigmask(&set); sr = (struct mcontext __user *)from_user_ptr(sigctx.regs); + addr = sr; if (!access_ok(VERIFY_READ, sr, sizeof(*sr)) || restore_user_regs(regs, sr, 1)) goto badframe; @@ -1091,6 +1123,12 @@ long sys_sigreturn(int r3, int r4, int r5, int r6, int r7, int r8, return 0; badframe: + if (show_unhandled_signals && printk_ratelimit()) + printk(KERN_INFO "%s[%d]: bad frame in sys_sigreturn: " + "%p nip %08lx lr %08lx\n", + current->comm, current->pid, + addr, regs->nip, regs->link); + force_sig(SIGSEGV, current); return 0; } diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c index de895e6d8c6..faeb8f207ea 100644 --- a/arch/powerpc/kernel/signal_64.c +++ b/arch/powerpc/kernel/signal_64.c @@ -64,6 +64,11 @@ struct rt_sigframe { char abigap[288]; } __attribute__ ((aligned (16))); +static const char fmt32[] = KERN_INFO \ + "%s[%d]: bad frame in %s: %08lx nip %08lx lr %08lx\n"; +static const char fmt64[] = KERN_INFO \ + "%s[%d]: bad frame in %s: %016lx nip %016lx lr %016lx\n"; + /* * Set up the sigcontext for the signal frame. */ @@ -315,6 +320,11 @@ badframe: printk("badframe in sys_rt_sigreturn, regs=%p uc=%p &uc->uc_mcontext=%p\n", regs, uc, &uc->uc_mcontext); #endif + if (show_unhandled_signals && printk_ratelimit()) + printk(regs->msr & MSR_SF ? fmt64 : fmt32, + current->comm, current->pid, "rt_sigreturn", + (long)uc, regs->nip, regs->link); + force_sig(SIGSEGV, current); return 0; } @@ -398,6 +408,11 @@ badframe: printk("badframe in setup_rt_frame, regs=%p frame=%p newsp=%lx\n", regs, frame, newsp); #endif + if (show_unhandled_signals && printk_ratelimit()) + printk(regs->msr & MSR_SF ? fmt64 : fmt32, + current->comm, current->pid, "setup_rt_frame", + (long)frame, regs->nip, regs->link); + force_sigsegv(signr, current); return 0; } diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 2f1857c9819..bf9e39c6e29 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -172,11 +172,21 @@ int die(const char *str, struct pt_regs *regs, long err) void _exception(int signr, struct pt_regs *regs, int code, unsigned long addr) { siginfo_t info; + const char fmt32[] = KERN_INFO "%s[%d]: unhandled signal %d " \ + "at %08lx nip %08lx lr %08lx code %x\n"; + const char fmt64[] = KERN_INFO "%s[%d]: unhandled signal %d " \ + "at %016lx nip %016lx lr %016lx code %x\n"; if (!user_mode(regs)) { if (die("Exception in kernel mode", regs, signr)) return; - } + } else if (show_unhandled_signals && + unhandled_signal(current, signr) && + printk_ratelimit()) { + printk(regs->msr & MSR_SF ? fmt64 : fmt32, + current->comm, current->pid, signr, + addr, regs->nip, regs->link, code); + } memset(&info, 0, sizeof(info)); info.si_signo = signr; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 53a456ebf6d..c7314f95264 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1221,7 +1221,7 @@ static ctl_table fs_table[] = { }; static ctl_table debug_table[] = { -#ifdef CONFIG_X86 +#if defined(CONFIG_X86) || defined(CONFIG_PPC) { .ctl_name = CTL_UNNUMBERED, .procname = "exception-trace", -- cgit v1.2.3-70-g09d2