Diffstat (limited to 'kernel')
57 files changed, 5940 insertions, 2875 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 4ae0fbde815..58908f9d156 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -12,6 +12,9 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o obj-$(CONFIG_FUTEX) += futex.o +ifeq ($(CONFIG_COMPAT),y) +obj-$(CONFIG_FUTEX) += futex_compat.o +endif obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o obj-$(CONFIG_SMP) += cpu.o spinlock.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o @@ -26,7 +29,7 @@ obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_CPUSETS) += cpuset.o obj-$(CONFIG_IKCONFIG) += configs.o obj-$(CONFIG_STOP_MACHINE) += stop_machine.o -obj-$(CONFIG_AUDIT) += audit.o +obj-$(CONFIG_AUDIT) += audit.o auditfilter.o obj-$(CONFIG_AUDITSYSCALL) += auditsc.o obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_SYSFS) += ksysfs.o @@ -34,6 +37,7 @@ obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ obj-$(CONFIG_SECCOMP) += seccomp.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o +obj-$(CONFIG_RELAY) += relay.o ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is diff --git a/kernel/audit.c b/kernel/audit.c index 0a813d2883e..04fe2e301b6 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -52,6 +52,7 @@ #include <linux/audit.h> #include <net/sock.h> +#include <net/netlink.h> #include <linux/skbuff.h> #include <linux/netlink.h> @@ -72,7 +73,7 @@ static int audit_failure = AUDIT_FAIL_PRINTK; * contains the (non-zero) pid. */ int audit_pid; -/* If audit_limit is non-zero, limit the rate of sending audit records +/* If audit_rate_limit is non-zero, limit the rate of sending audit records * to that number per second. This prevents DoS attacks, but results in * audit records being dropped. */ static int audit_rate_limit; @@ -102,7 +103,7 @@ static struct sock *audit_sock; * than AUDIT_MAXFREE are in use, the audit buffer is freed instead of * being placed on the freelist). */ static DEFINE_SPINLOCK(audit_freelist_lock); -static int audit_freelist_count = 0; +static int audit_freelist_count; static LIST_HEAD(audit_freelist); static struct sk_buff_head audit_skb_queue; @@ -113,7 +114,7 @@ static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait); /* The netlink socket is only to be read by 1 CPU, which lets us assume * that list additions and deletions never happen simultaneously in * auditsc.c */ -DECLARE_MUTEX(audit_netlink_sem); +DEFINE_MUTEX(audit_netlink_mutex); /* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting * audit records. Since printk uses a 1024 byte buffer, this buffer @@ -142,7 +143,7 @@ static void audit_set_pid(struct audit_buffer *ab, pid_t pid) nlh->nlmsg_pid = pid; } -static void audit_panic(const char *message) +void audit_panic(const char *message) { switch (audit_failure) { @@ -186,8 +187,14 @@ static inline int audit_rate_check(void) return retval; } -/* Emit at least 1 message per second, even if audit_rate_check is - * throttling. */ +/** + * audit_log_lost - conditionally log lost audit message event + * @message: the message stating reason for lost audit message + * + * Emit at least 1 message per second, even if audit_rate_check is + * throttling. + * Always increment the lost messages counter. 
+*/ void audit_log_lost(const char *message) { static unsigned long last_msg = 0; @@ -218,7 +225,6 @@ void audit_log_lost(const char *message) audit_backlog_limit); audit_panic(message); } - } static int audit_set_rate_limit(int limit, uid_t loginuid) @@ -300,8 +306,22 @@ static int kauditd_thread(void *dummy) remove_wait_queue(&kauditd_wait, &wait); } } + return 0; } +/** + * audit_send_reply - send an audit reply message via netlink + * @pid: process id to send reply to + * @seq: sequence number + * @type: audit message type + * @done: done (last) flag + * @multi: multi-part message flag + * @payload: payload data + * @size: payload size + * + * Allocates an skb, builds the netlink message, and sends it to the pid. + * No failure notifications. + */ void audit_send_reply(int pid, int seq, int type, int done, int multi, void *payload, int size) { @@ -342,15 +362,19 @@ static int audit_netlink_ok(kernel_cap_t eff_cap, u16 msg_type) switch (msg_type) { case AUDIT_GET: case AUDIT_LIST: + case AUDIT_LIST_RULES: case AUDIT_SET: case AUDIT_ADD: + case AUDIT_ADD_RULE: case AUDIT_DEL: + case AUDIT_DEL_RULE: case AUDIT_SIGNAL_INFO: if (!cap_raised(eff_cap, CAP_AUDIT_CONTROL)) err = -EPERM; break; case AUDIT_USER: case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG: + case AUDIT_FIRST_USER_MSG2...AUDIT_LAST_USER_MSG2: if (!cap_raised(eff_cap, CAP_AUDIT_WRITE)) err = -EPERM; break; @@ -376,7 +400,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (err) return err; - /* As soon as there's any sign of userspace auditd, start kauditd to talk to it */ + /* As soon as there's any sign of userspace auditd, + * start kauditd to talk to it */ if (!kauditd_task) kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd"); if (IS_ERR(kauditd_task)) { @@ -430,6 +455,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) break; case AUDIT_USER: case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG: + case AUDIT_FIRST_USER_MSG2...AUDIT_LAST_USER_MSG2: if (!audit_enabled && msg_type != AUDIT_USER_AVC) return 0; @@ -448,12 +474,23 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) break; case AUDIT_ADD: case AUDIT_DEL: - if (nlh->nlmsg_len < sizeof(struct audit_rule)) + if (nlmsg_len(nlh) < sizeof(struct audit_rule)) return -EINVAL; /* fallthrough */ case AUDIT_LIST: err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid, - uid, seq, data, loginuid); + uid, seq, data, nlmsg_len(nlh), + loginuid); + break; + case AUDIT_ADD_RULE: + case AUDIT_DEL_RULE: + if (nlmsg_len(nlh) < sizeof(struct audit_rule_data)) + return -EINVAL; + /* fallthrough */ + case AUDIT_LIST_RULES: + err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid, + uid, seq, data, nlmsg_len(nlh), + loginuid); break; case AUDIT_SIGNAL_INFO: sig_data.uid = audit_sig_uid; @@ -469,9 +506,11 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) return err < 0 ? err : 0; } -/* Get message from skb (based on rtnetlink_rcv_skb). Each message is +/* + * Get message from skb (based on rtnetlink_rcv_skb). Each message is * processed by audit_receive_msg. Malformed skbs with wrong length are - * discarded silently. */ + * discarded silently. 
+ */ static void audit_receive_skb(struct sk_buff *skb) { int err; @@ -499,14 +538,14 @@ static void audit_receive(struct sock *sk, int length) struct sk_buff *skb; unsigned int qlen; - down(&audit_netlink_sem); + mutex_lock(&audit_netlink_mutex); for (qlen = skb_queue_len(&sk->sk_receive_queue); qlen; qlen--) { skb = skb_dequeue(&sk->sk_receive_queue); audit_receive_skb(skb); kfree_skb(skb); } - up(&audit_netlink_sem); + mutex_unlock(&audit_netlink_mutex); } @@ -519,8 +558,9 @@ static int __init audit_init(void) THIS_MODULE); if (!audit_sock) audit_panic("cannot initialize netlink socket"); + else + audit_sock->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; - audit_sock->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; skb_queue_head_init(&audit_skb_queue); audit_initialized = 1; audit_enabled = audit_default; @@ -600,7 +640,10 @@ err: return NULL; } -/* Compute a serial number for the audit record. Audit records are +/** + * audit_serial - compute a serial number for the audit record + * + * Compute a serial number for the audit record. Audit records are * written to user-space as soon as they are generated, so a complete * audit record may be written in several pieces. The timestamp of the * record and this serial number are used by the user-space tools to @@ -612,8 +655,8 @@ err: * audit context (for those records that have a context), and emit them * all at syscall exit. However, this could delay the reporting of * significant errors until syscall exit (or never, if the system - * halts). */ - + * halts). + */ unsigned int audit_serial(void) { static spinlock_t serial_lock = SPIN_LOCK_UNLOCKED; @@ -649,6 +692,21 @@ static inline void audit_get_stamp(struct audit_context *ctx, * will be written at syscall exit. If there is no associated task, tsk * should be NULL. */ +/** + * audit_log_start - obtain an audit buffer + * @ctx: audit_context (may be NULL) + * @gfp_mask: type of allocation + * @type: audit message type + * + * Returns audit_buffer pointer on success or NULL on error. + * + * Obtain an audit buffer. This routine does locking to obtain the + * audit buffer, but then no locking is required for calls to + * audit_log_*format. If the task (ctx) is a task that is currently in a + * syscall, then the syscall is marked as auditable and an audit record + * will be written at syscall exit. If there is no associated task, then + * task context (ctx) should be NULL. + */ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, int type) { @@ -661,6 +719,9 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, if (!audit_initialized) return NULL; + if (unlikely(audit_filter_type(type))) + return NULL; + if (gfp_mask & __GFP_WAIT) reserve = 0; else @@ -713,6 +774,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, /** * audit_expand - expand skb in the audit buffer * @ab: audit_buffer + * @extra: space to add at tail of the skb * * Returns 0 (no space) on failed expansion, or available space if * successful. @@ -729,10 +791,12 @@ static inline int audit_expand(struct audit_buffer *ab, int extra) return skb_tailroom(skb); } -/* Format an audit message into the audit buffer. If there isn't enough +/* + * Format an audit message into the audit buffer. If there isn't enough * room in the audit buffer, more room will be allocated and vsnprint * will be called a second time. Currently, we assume that a printk - * can't format message larger than 1024 bytes, so we don't either. 
*/ + * can't format message larger than 1024 bytes, so we don't either. + */ static void audit_log_vformat(struct audit_buffer *ab, const char *fmt, va_list args) { @@ -757,7 +821,8 @@ static void audit_log_vformat(struct audit_buffer *ab, const char *fmt, /* The printk buffer is 1024 bytes long, so if we get * here and AUDIT_BUFSIZ is at least 1024, then we can * log everything that printk could have logged. */ - avail = audit_expand(ab, max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail)); + avail = audit_expand(ab, + max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail)); if (!avail) goto out; len = vsnprintf(skb->tail, avail, fmt, args2); @@ -768,8 +833,14 @@ out: return; } -/* Format a message into the audit buffer. All the work is done in - * audit_log_vformat. */ +/** + * audit_log_format - format a message into the audit buffer. + * @ab: audit_buffer + * @fmt: format string + * @...: optional parameters matching @fmt string + * + * All the work is done in audit_log_vformat. + */ void audit_log_format(struct audit_buffer *ab, const char *fmt, ...) { va_list args; @@ -781,9 +852,18 @@ void audit_log_format(struct audit_buffer *ab, const char *fmt, ...) va_end(args); } -/* This function will take the passed buf and convert it into a string of - * ascii hex digits. The new string is placed onto the skb. */ -void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf, +/** + * audit_log_hex - convert a buffer to hex and append it to the audit skb + * @ab: the audit_buffer + * @buf: buffer to convert to hex + * @len: length of @buf to be converted + * + * No return value; failure to expand is silently ignored. + * + * This function will take the passed buf and convert it into a string of + * ascii hex digits. The new string is placed onto the skb. + */ +void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf, size_t len) { int i, avail, new_len; @@ -812,10 +892,16 @@ void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf, skb_put(skb, len << 1); /* new string is twice the old string */ } -/* This code will escape a string that is passed to it if the string - * contains a control character, unprintable character, double quote mark, +/** + * audit_log_unstrustedstring - log a string that may contain random characters + * @ab: audit_buffer + * @string: string to be logged + * + * This code will escape a string that is passed to it if the string + * contains a control character, unprintable character, double quote mark, * or a space. Unescaped strings will start and end with a double quote mark. - * Strings that are escaped are printed in hex (2 digits per char). */ + * Strings that are escaped are printed in hex (2 digits per char). + */ void audit_log_untrustedstring(struct audit_buffer *ab, const char *string) { const unsigned char *p = string; @@ -854,10 +940,15 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix, kfree(path); } -/* The netlink_* functions cannot be called inside an irq context, so - * the audit buffer is places on a queue and a tasklet is scheduled to +/** + * audit_log_end - end one audit record + * @ab: the audit_buffer + * + * The netlink_* functions cannot be called inside an irq context, so + * the audit buffer is placed on a queue and a tasklet is scheduled to * remove them from the queue outside the irq context. May be called in - * any context. */ + * any context. 
+ */ void audit_log_end(struct audit_buffer *ab) { if (!ab) @@ -878,9 +969,18 @@ void audit_log_end(struct audit_buffer *ab) audit_buffer_free(ab); } -/* Log an audit record. This is a convenience function that calls - * audit_log_start, audit_log_vformat, and audit_log_end. It may be - * called in any context. */ +/** + * audit_log - Log an audit record + * @ctx: audit context + * @gfp_mask: type of allocation + * @type: audit message type + * @fmt: format string to use + * @...: variable parameters matching the format string + * + * This is a convenience function that calls audit_log_start, + * audit_log_vformat, and audit_log_end. It may be called + * in any context. + */ void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type, const char *fmt, ...) { @@ -895,3 +995,8 @@ void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type, audit_log_end(ab); } } + +EXPORT_SYMBOL(audit_log_start); +EXPORT_SYMBOL(audit_log_end); +EXPORT_SYMBOL(audit_log_format); +EXPORT_SYMBOL(audit_log); diff --git a/kernel/audit.h b/kernel/audit.h new file mode 100644 index 00000000000..bc5392076e2 --- /dev/null +++ b/kernel/audit.h @@ -0,0 +1,88 @@ +/* audit -- definition of audit_context structure and supporting types + * + * Copyright 2003-2004 Red Hat, Inc. + * Copyright 2005 Hewlett-Packard Development Company, L.P. + * Copyright 2005 IBM Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/mutex.h> +#include <linux/fs.h> +#include <linux/audit.h> + +/* 0 = no checking + 1 = put_count checking + 2 = verbose put_count checking +*/ +#define AUDIT_DEBUG 0 + +/* At task start time, the audit_state is set in the audit_context using + a per-task filter. At syscall entry, the audit_state is augmented by + the syscall filter. */ +enum audit_state { + AUDIT_DISABLED, /* Do not create per-task audit_context. + * No syscall-specific audit records can + * be generated. */ + AUDIT_SETUP_CONTEXT, /* Create the per-task audit_context, + * but don't necessarily fill it in at + * syscall entry time (i.e., filter + * instead). */ + AUDIT_BUILD_CONTEXT, /* Create the per-task audit_context, + * and always fill it in at syscall + * entry time. This makes a full + * syscall record available if some + * other part of the kernel decides it + * should be recorded. */ + AUDIT_RECORD_CONTEXT /* Create the per-task audit_context, + * always fill it in at syscall entry + * time, and always write out the audit + * record at syscall exit time. 
*/ +}; + +/* Rule lists */ +struct audit_field { + u32 type; + u32 val; + u32 op; +}; + +struct audit_krule { + int vers_ops; + u32 flags; + u32 listnr; + u32 action; + u32 mask[AUDIT_BITMASK_SIZE]; + u32 buflen; /* for data alloc on list rules */ + u32 field_count; + struct audit_field *fields; +}; + +struct audit_entry { + struct list_head list; + struct rcu_head rcu; + struct audit_krule rule; +}; + + +extern int audit_pid; +extern int audit_comparator(const u32 left, const u32 op, const u32 right); + +extern void audit_send_reply(int pid, int seq, int type, + int done, int multi, + void *payload, int size); +extern void audit_log_lost(const char *message); +extern void audit_panic(const char *message); +extern struct mutex audit_netlink_mutex; diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c new file mode 100644 index 00000000000..d3a8539f3a8 --- /dev/null +++ b/kernel/auditfilter.c @@ -0,0 +1,630 @@ +/* auditfilter.c -- filtering of audit events + * + * Copyright 2003-2004 Red Hat, Inc. + * Copyright 2005 Hewlett-Packard Development Company, L.P. + * Copyright 2005 IBM Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/kernel.h> +#include <linux/audit.h> +#include <linux/kthread.h> +#include <linux/netlink.h> +#include "audit.h" + +/* There are three lists of rules -- one to search at task creation + * time, one to search at syscall entry time, and another to search at + * syscall exit time. */ +struct list_head audit_filter_list[AUDIT_NR_FILTERS] = { + LIST_HEAD_INIT(audit_filter_list[0]), + LIST_HEAD_INIT(audit_filter_list[1]), + LIST_HEAD_INIT(audit_filter_list[2]), + LIST_HEAD_INIT(audit_filter_list[3]), + LIST_HEAD_INIT(audit_filter_list[4]), + LIST_HEAD_INIT(audit_filter_list[5]), +#if AUDIT_NR_FILTERS != 6 +#error Fix audit_filter_list initialiser +#endif +}; + +static inline void audit_free_rule(struct audit_entry *e) +{ + kfree(e->rule.fields); + kfree(e); +} + +static inline void audit_free_rule_rcu(struct rcu_head *head) +{ + struct audit_entry *e = container_of(head, struct audit_entry, rcu); + audit_free_rule(e); +} + +/* Unpack a filter field's string representation from user-space + * buffer. */ +static __attribute__((unused)) char *audit_unpack_string(void **bufp, size_t *remain, size_t len) +{ + char *str; + + if (!*bufp || (len == 0) || (len > *remain)) + return ERR_PTR(-EINVAL); + + /* Of the currently implemented string fields, PATH_MAX + * defines the longest valid length. + */ + if (len > PATH_MAX) + return ERR_PTR(-ENAMETOOLONG); + + str = kmalloc(len + 1, GFP_KERNEL); + if (unlikely(!str)) + return ERR_PTR(-ENOMEM); + + memcpy(str, *bufp, len); + str[len] = 0; + *bufp += len; + *remain -= len; + + return str; +} + +/* Common user-space to kernel rule translation. 
*/ +static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule) +{ + unsigned listnr; + struct audit_entry *entry; + struct audit_field *fields; + int i, err; + + err = -EINVAL; + listnr = rule->flags & ~AUDIT_FILTER_PREPEND; + switch(listnr) { + default: + goto exit_err; + case AUDIT_FILTER_USER: + case AUDIT_FILTER_TYPE: +#ifdef CONFIG_AUDITSYSCALL + case AUDIT_FILTER_ENTRY: + case AUDIT_FILTER_EXIT: + case AUDIT_FILTER_TASK: +#endif + ; + } + if (rule->action != AUDIT_NEVER && rule->action != AUDIT_POSSIBLE && + rule->action != AUDIT_ALWAYS) + goto exit_err; + if (rule->field_count > AUDIT_MAX_FIELDS) + goto exit_err; + + err = -ENOMEM; + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (unlikely(!entry)) + goto exit_err; + fields = kmalloc(sizeof(*fields) * rule->field_count, GFP_KERNEL); + if (unlikely(!fields)) { + kfree(entry); + goto exit_err; + } + + memset(&entry->rule, 0, sizeof(struct audit_krule)); + memset(fields, 0, sizeof(struct audit_field)); + + entry->rule.flags = rule->flags & AUDIT_FILTER_PREPEND; + entry->rule.listnr = listnr; + entry->rule.action = rule->action; + entry->rule.field_count = rule->field_count; + entry->rule.fields = fields; + + for (i = 0; i < AUDIT_BITMASK_SIZE; i++) + entry->rule.mask[i] = rule->mask[i]; + + return entry; + +exit_err: + return ERR_PTR(err); +} + +/* Translate struct audit_rule to kernel's rule respresentation. + * Exists for backward compatibility with userspace. */ +static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) +{ + struct audit_entry *entry; + int err = 0; + int i; + + entry = audit_to_entry_common(rule); + if (IS_ERR(entry)) + goto exit_nofree; + + for (i = 0; i < rule->field_count; i++) { + struct audit_field *f = &entry->rule.fields[i]; + + if (rule->fields[i] & AUDIT_UNUSED_BITS) { + err = -EINVAL; + goto exit_free; + } + + f->op = rule->fields[i] & (AUDIT_NEGATE|AUDIT_OPERATORS); + f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS); + f->val = rule->values[i]; + + entry->rule.vers_ops = (f->op & AUDIT_OPERATORS) ? 2 : 1; + + /* Support for legacy operators where + * AUDIT_NEGATE bit signifies != and otherwise assumes == */ + if (f->op & AUDIT_NEGATE) + f->op = AUDIT_NOT_EQUAL; + else if (!f->op) + f->op = AUDIT_EQUAL; + else if (f->op == AUDIT_OPERATORS) { + err = -EINVAL; + goto exit_free; + } + } + +exit_nofree: + return entry; + +exit_free: + audit_free_rule(entry); + return ERR_PTR(err); +} + +/* Translate struct audit_rule_data to kernel's rule respresentation. */ +static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, + size_t datasz) +{ + int err = 0; + struct audit_entry *entry; + void *bufp; + /* size_t remain = datasz - sizeof(struct audit_rule_data); */ + int i; + + entry = audit_to_entry_common((struct audit_rule *)data); + if (IS_ERR(entry)) + goto exit_nofree; + + bufp = data->buf; + entry->rule.vers_ops = 2; + for (i = 0; i < data->field_count; i++) { + struct audit_field *f = &entry->rule.fields[i]; + + err = -EINVAL; + if (!(data->fieldflags[i] & AUDIT_OPERATORS) || + data->fieldflags[i] & ~AUDIT_OPERATORS) + goto exit_free; + + f->op = data->fieldflags[i] & AUDIT_OPERATORS; + f->type = data->fields[i]; + switch(f->type) { + /* call type-specific conversion routines here */ + default: + f->val = data->values[i]; + } + } + +exit_nofree: + return entry; + +exit_free: + audit_free_rule(entry); + return ERR_PTR(err); +} + +/* Pack a filter field's string representation into data block. 
*/ +static inline size_t audit_pack_string(void **bufp, char *str) +{ + size_t len = strlen(str); + + memcpy(*bufp, str, len); + *bufp += len; + + return len; +} + +/* Translate kernel rule respresentation to struct audit_rule. + * Exists for backward compatibility with userspace. */ +static struct audit_rule *audit_krule_to_rule(struct audit_krule *krule) +{ + struct audit_rule *rule; + int i; + + rule = kmalloc(sizeof(*rule), GFP_KERNEL); + if (unlikely(!rule)) + return ERR_PTR(-ENOMEM); + memset(rule, 0, sizeof(*rule)); + + rule->flags = krule->flags | krule->listnr; + rule->action = krule->action; + rule->field_count = krule->field_count; + for (i = 0; i < rule->field_count; i++) { + rule->values[i] = krule->fields[i].val; + rule->fields[i] = krule->fields[i].type; + + if (krule->vers_ops == 1) { + if (krule->fields[i].op & AUDIT_NOT_EQUAL) + rule->fields[i] |= AUDIT_NEGATE; + } else { + rule->fields[i] |= krule->fields[i].op; + } + } + for (i = 0; i < AUDIT_BITMASK_SIZE; i++) rule->mask[i] = krule->mask[i]; + + return rule; +} + +/* Translate kernel rule respresentation to struct audit_rule_data. */ +static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule) +{ + struct audit_rule_data *data; + void *bufp; + int i; + + data = kmalloc(sizeof(*data) + krule->buflen, GFP_KERNEL); + if (unlikely(!data)) + return ERR_PTR(-ENOMEM); + memset(data, 0, sizeof(*data)); + + data->flags = krule->flags | krule->listnr; + data->action = krule->action; + data->field_count = krule->field_count; + bufp = data->buf; + for (i = 0; i < data->field_count; i++) { + struct audit_field *f = &krule->fields[i]; + + data->fields[i] = f->type; + data->fieldflags[i] = f->op; + switch(f->type) { + /* call type-specific conversion routines here */ + default: + data->values[i] = f->val; + } + } + for (i = 0; i < AUDIT_BITMASK_SIZE; i++) data->mask[i] = krule->mask[i]; + + return data; +} + +/* Compare two rules in kernel format. Considered success if rules + * don't match. */ +static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b) +{ + int i; + + if (a->flags != b->flags || + a->listnr != b->listnr || + a->action != b->action || + a->field_count != b->field_count) + return 1; + + for (i = 0; i < a->field_count; i++) { + if (a->fields[i].type != b->fields[i].type || + a->fields[i].op != b->fields[i].op) + return 1; + + switch(a->fields[i].type) { + /* call type-specific comparison routines here */ + default: + if (a->fields[i].val != b->fields[i].val) + return 1; + } + } + + for (i = 0; i < AUDIT_BITMASK_SIZE; i++) + if (a->mask[i] != b->mask[i]) + return 1; + + return 0; +} + +/* Add rule to given filterlist if not a duplicate. Protected by + * audit_netlink_mutex. */ +static inline int audit_add_rule(struct audit_entry *entry, + struct list_head *list) +{ + struct audit_entry *e; + + /* Do not use the _rcu iterator here, since this is the only + * addition routine. */ + list_for_each_entry(e, list, list) { + if (!audit_compare_rule(&entry->rule, &e->rule)) + return -EEXIST; + } + + if (entry->rule.flags & AUDIT_FILTER_PREPEND) { + list_add_rcu(&entry->list, list); + } else { + list_add_tail_rcu(&entry->list, list); + } + + return 0; +} + +/* Remove an existing rule from filterlist. Protected by + * audit_netlink_mutex. */ +static inline int audit_del_rule(struct audit_entry *entry, + struct list_head *list) +{ + struct audit_entry *e; + + /* Do not use the _rcu iterator here, since this is the only + * deletion routine. 
*/ + list_for_each_entry(e, list, list) { + if (!audit_compare_rule(&entry->rule, &e->rule)) { + list_del_rcu(&e->list); + call_rcu(&e->rcu, audit_free_rule_rcu); + return 0; + } + } + return -ENOENT; /* No matching rule */ +} + +/* List rules using struct audit_rule. Exists for backward + * compatibility with userspace. */ +static int audit_list(void *_dest) +{ + int pid, seq; + int *dest = _dest; + struct audit_entry *entry; + int i; + + pid = dest[0]; + seq = dest[1]; + kfree(dest); + + mutex_lock(&audit_netlink_mutex); + + /* The *_rcu iterators not needed here because we are + always called with audit_netlink_mutex held. */ + for (i=0; i<AUDIT_NR_FILTERS; i++) { + list_for_each_entry(entry, &audit_filter_list[i], list) { + struct audit_rule *rule; + + rule = audit_krule_to_rule(&entry->rule); + if (unlikely(!rule)) + break; + audit_send_reply(pid, seq, AUDIT_LIST, 0, 1, + rule, sizeof(*rule)); + kfree(rule); + } + } + audit_send_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0); + + mutex_unlock(&audit_netlink_mutex); + return 0; +} + +/* List rules using struct audit_rule_data. */ +static int audit_list_rules(void *_dest) +{ + int pid, seq; + int *dest = _dest; + struct audit_entry *e; + int i; + + pid = dest[0]; + seq = dest[1]; + kfree(dest); + + mutex_lock(&audit_netlink_mutex); + + /* The *_rcu iterators not needed here because we are + always called with audit_netlink_mutex held. */ + for (i=0; i<AUDIT_NR_FILTERS; i++) { + list_for_each_entry(e, &audit_filter_list[i], list) { + struct audit_rule_data *data; + + data = audit_krule_to_data(&e->rule); + if (unlikely(!data)) + break; + audit_send_reply(pid, seq, AUDIT_LIST_RULES, 0, 1, + data, sizeof(*data)); + kfree(data); + } + } + audit_send_reply(pid, seq, AUDIT_LIST_RULES, 1, 1, NULL, 0); + + mutex_unlock(&audit_netlink_mutex); + return 0; +} + +/** + * audit_receive_filter - apply all rules to the specified message type + * @type: audit message type + * @pid: target pid for netlink audit messages + * @uid: target uid for netlink audit messages + * @seq: netlink audit message sequence (serial) number + * @data: payload data + * @datasz: size of payload data + * @loginuid: loginuid of sender + */ +int audit_receive_filter(int type, int pid, int uid, int seq, void *data, + size_t datasz, uid_t loginuid) +{ + struct task_struct *tsk; + int *dest; + int err = 0; + struct audit_entry *entry; + + switch (type) { + case AUDIT_LIST: + case AUDIT_LIST_RULES: + /* We can't just spew out the rules here because we might fill + * the available socket buffer space and deadlock waiting for + * auditctl to read from it... 
which isn't ever going to + * happen if we're actually running in the context of auditctl + * trying to _send_ the stuff */ + + dest = kmalloc(2 * sizeof(int), GFP_KERNEL); + if (!dest) + return -ENOMEM; + dest[0] = pid; + dest[1] = seq; + + if (type == AUDIT_LIST) + tsk = kthread_run(audit_list, dest, "audit_list"); + else + tsk = kthread_run(audit_list_rules, dest, + "audit_list_rules"); + if (IS_ERR(tsk)) { + kfree(dest); + err = PTR_ERR(tsk); + } + break; + case AUDIT_ADD: + case AUDIT_ADD_RULE: + if (type == AUDIT_ADD) + entry = audit_rule_to_entry(data); + else + entry = audit_data_to_entry(data, datasz); + if (IS_ERR(entry)) + return PTR_ERR(entry); + + err = audit_add_rule(entry, + &audit_filter_list[entry->rule.listnr]); + audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, + "auid=%u add rule to list=%d res=%d\n", + loginuid, entry->rule.listnr, !err); + + if (err) + audit_free_rule(entry); + break; + case AUDIT_DEL: + case AUDIT_DEL_RULE: + if (type == AUDIT_DEL) + entry = audit_rule_to_entry(data); + else + entry = audit_data_to_entry(data, datasz); + if (IS_ERR(entry)) + return PTR_ERR(entry); + + err = audit_del_rule(entry, + &audit_filter_list[entry->rule.listnr]); + audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, + "auid=%u remove rule from list=%d res=%d\n", + loginuid, entry->rule.listnr, !err); + + audit_free_rule(entry); + break; + default: + return -EINVAL; + } + + return err; +} + +int audit_comparator(const u32 left, const u32 op, const u32 right) +{ + switch (op) { + case AUDIT_EQUAL: + return (left == right); + case AUDIT_NOT_EQUAL: + return (left != right); + case AUDIT_LESS_THAN: + return (left < right); + case AUDIT_LESS_THAN_OR_EQUAL: + return (left <= right); + case AUDIT_GREATER_THAN: + return (left > right); + case AUDIT_GREATER_THAN_OR_EQUAL: + return (left >= right); + } + BUG(); + return 0; +} + + + +static int audit_filter_user_rules(struct netlink_skb_parms *cb, + struct audit_krule *rule, + enum audit_state *state) +{ + int i; + + for (i = 0; i < rule->field_count; i++) { + struct audit_field *f = &rule->fields[i]; + int result = 0; + + switch (f->type) { + case AUDIT_PID: + result = audit_comparator(cb->creds.pid, f->op, f->val); + break; + case AUDIT_UID: + result = audit_comparator(cb->creds.uid, f->op, f->val); + break; + case AUDIT_GID: + result = audit_comparator(cb->creds.gid, f->op, f->val); + break; + case AUDIT_LOGINUID: + result = audit_comparator(cb->loginuid, f->op, f->val); + break; + } + + if (!result) + return 0; + } + switch (rule->action) { + case AUDIT_NEVER: *state = AUDIT_DISABLED; break; + case AUDIT_POSSIBLE: *state = AUDIT_BUILD_CONTEXT; break; + case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; + } + return 1; +} + +int audit_filter_user(struct netlink_skb_parms *cb, int type) +{ + struct audit_entry *e; + enum audit_state state; + int ret = 1; + + rcu_read_lock(); + list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) { + if (audit_filter_user_rules(cb, &e->rule, &state)) { + if (state == AUDIT_DISABLED) + ret = 0; + break; + } + } + rcu_read_unlock(); + + return ret; /* Audit by default */ +} + +int audit_filter_type(int type) +{ + struct audit_entry *e; + int result = 0; + + rcu_read_lock(); + if (list_empty(&audit_filter_list[AUDIT_FILTER_TYPE])) + goto unlock_and_return; + + list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TYPE], + list) { + int i; + for (i = 0; i < e->rule.field_count; i++) { + struct audit_field *f = &e->rule.fields[i]; + if (f->type == AUDIT_MSGTYPE) { + result = 
audit_comparator(type, f->op, f->val); + if (!result) + break; + } + } + if (result) + goto unlock_and_return; + } +unlock_and_return: + rcu_read_unlock(); + return result; +} diff --git a/kernel/auditsc.c b/kernel/auditsc.c index d7e7e637b92..7f160df21a2 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2,6 +2,8 @@ * Handles all system-call specific auditing features. * * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. + * Copyright 2005 Hewlett-Packard Development Company, L.P. + * Copyright (C) 2005 IBM Corporation * All Rights Reserved. * * This program is free software; you can redistribute it and/or modify @@ -27,11 +29,22 @@ * this file -- see entry.S) is based on a GPL'd patch written by * okir@suse.de and Copyright 2003 SuSE Linux AG. * + * The support of additional filter rules compares (>, <, >=, <=) was + * added by Dustin Kirkland <dustin.kirkland@us.ibm.com>, 2005. + * + * Modified by Amy Griffis <amy.griffis@hp.com> to collect additional + * filesystem information. + * + * Subject and object context labeling support added by <danjones@us.ibm.com> + * and <dustin.kirkland@us.ibm.com> for LSPP certification compliance. */ #include <linux/init.h> #include <asm/types.h> #include <asm/atomic.h> +#include <asm/types.h> +#include <linux/fs.h> +#include <linux/namei.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/mount.h> @@ -39,16 +52,16 @@ #include <linux/audit.h> #include <linux/personality.h> #include <linux/time.h> -#include <linux/kthread.h> #include <linux/netlink.h> #include <linux/compiler.h> #include <asm/unistd.h> +#include <linux/security.h> +#include <linux/list.h> +#include <linux/tty.h> + +#include "audit.h" -/* 0 = no checking - 1 = put_count checking - 2 = verbose put_count checking -*/ -#define AUDIT_DEBUG 0 +extern struct list_head audit_filter_list[]; /* No syscall auditing will take place unless audit_enabled != 0. */ extern int audit_enabled; @@ -62,29 +75,6 @@ extern int audit_enabled; * path_lookup. */ #define AUDIT_NAMES_RESERVED 7 -/* At task start time, the audit_state is set in the audit_context using - a per-task filter. At syscall entry, the audit_state is augmented by - the syscall filter. */ -enum audit_state { - AUDIT_DISABLED, /* Do not create per-task audit_context. - * No syscall-specific audit records can - * be generated. */ - AUDIT_SETUP_CONTEXT, /* Create the per-task audit_context, - * but don't necessarily fill it in at - * syscall entry time (i.e., filter - * instead). */ - AUDIT_BUILD_CONTEXT, /* Create the per-task audit_context, - * and always fill it in at syscall - * entry time. This makes a full - * syscall record available if some - * other part of the kernel decides it - * should be recorded. */ - AUDIT_RECORD_CONTEXT /* Create the per-task audit_context, - * always fill it in at syscall entry - * time, and always write out the audit - * record at syscall exit time. */ -}; - /* When fs/namei.c:getname() is called, we store the pointer in name and * we don't let putname() free it (instead we free all of the saved * pointers at syscall exit time). 
@@ -93,12 +83,13 @@ enum audit_state { struct audit_names { const char *name; unsigned long ino; + unsigned long pino; dev_t dev; umode_t mode; uid_t uid; gid_t gid; dev_t rdev; - unsigned flags; + char *ctx; }; struct audit_aux_data { @@ -115,6 +106,7 @@ struct audit_aux_data_ipcctl { uid_t uid; gid_t gid; mode_t mode; + char *ctx; }; struct audit_aux_data_socketcall { @@ -167,290 +159,72 @@ struct audit_context { #endif }; - /* Public API */ -/* There are three lists of rules -- one to search at task creation - * time, one to search at syscall entry time, and another to search at - * syscall exit time. */ -static struct list_head audit_filter_list[AUDIT_NR_FILTERS] = { - LIST_HEAD_INIT(audit_filter_list[0]), - LIST_HEAD_INIT(audit_filter_list[1]), - LIST_HEAD_INIT(audit_filter_list[2]), - LIST_HEAD_INIT(audit_filter_list[3]), - LIST_HEAD_INIT(audit_filter_list[4]), -#if AUDIT_NR_FILTERS != 5 -#error Fix audit_filter_list initialiser -#endif -}; - -struct audit_entry { - struct list_head list; - struct rcu_head rcu; - struct audit_rule rule; -}; - -extern int audit_pid; - -/* Copy rule from user-space to kernel-space. Called from - * audit_add_rule during AUDIT_ADD. */ -static inline int audit_copy_rule(struct audit_rule *d, struct audit_rule *s) -{ - int i; - - if (s->action != AUDIT_NEVER - && s->action != AUDIT_POSSIBLE - && s->action != AUDIT_ALWAYS) - return -1; - if (s->field_count < 0 || s->field_count > AUDIT_MAX_FIELDS) - return -1; - if ((s->flags & ~AUDIT_FILTER_PREPEND) >= AUDIT_NR_FILTERS) - return -1; - - d->flags = s->flags; - d->action = s->action; - d->field_count = s->field_count; - for (i = 0; i < d->field_count; i++) { - d->fields[i] = s->fields[i]; - d->values[i] = s->values[i]; - } - for (i = 0; i < AUDIT_BITMASK_SIZE; i++) d->mask[i] = s->mask[i]; - return 0; -} - -/* Check to see if two rules are identical. It is called from - * audit_add_rule during AUDIT_ADD and - * audit_del_rule during AUDIT_DEL. */ -static inline int audit_compare_rule(struct audit_rule *a, struct audit_rule *b) -{ - int i; - - if (a->flags != b->flags) - return 1; - - if (a->action != b->action) - return 1; - - if (a->field_count != b->field_count) - return 1; - - for (i = 0; i < a->field_count; i++) { - if (a->fields[i] != b->fields[i] - || a->values[i] != b->values[i]) - return 1; - } - - for (i = 0; i < AUDIT_BITMASK_SIZE; i++) - if (a->mask[i] != b->mask[i]) - return 1; - - return 0; -} - -/* Note that audit_add_rule and audit_del_rule are called via - * audit_receive() in audit.c, and are protected by - * audit_netlink_sem. */ -static inline int audit_add_rule(struct audit_rule *rule, - struct list_head *list) -{ - struct audit_entry *entry; - - /* Do not use the _rcu iterator here, since this is the only - * addition routine. */ - list_for_each_entry(entry, list, list) { - if (!audit_compare_rule(rule, &entry->rule)) { - return -EEXIST; - } - } - - if (!(entry = kmalloc(sizeof(*entry), GFP_KERNEL))) - return -ENOMEM; - if (audit_copy_rule(&entry->rule, rule)) { - kfree(entry); - return -EINVAL; - } - - if (entry->rule.flags & AUDIT_FILTER_PREPEND) { - entry->rule.flags &= ~AUDIT_FILTER_PREPEND; - list_add_rcu(&entry->list, list); - } else { - list_add_tail_rcu(&entry->list, list); - } - - return 0; -} - -static inline void audit_free_rule(struct rcu_head *head) -{ - struct audit_entry *e = container_of(head, struct audit_entry, rcu); - kfree(e); -} - -/* Note that audit_add_rule and audit_del_rule are called via - * audit_receive() in audit.c, and are protected by - * audit_netlink_sem. 
*/ -static inline int audit_del_rule(struct audit_rule *rule, - struct list_head *list) -{ - struct audit_entry *e; - - /* Do not use the _rcu iterator here, since this is the only - * deletion routine. */ - list_for_each_entry(e, list, list) { - if (!audit_compare_rule(rule, &e->rule)) { - list_del_rcu(&e->list); - call_rcu(&e->rcu, audit_free_rule); - return 0; - } - } - return -ENOENT; /* No matching rule */ -} - -static int audit_list_rules(void *_dest) -{ - int pid, seq; - int *dest = _dest; - struct audit_entry *entry; - int i; - - pid = dest[0]; - seq = dest[1]; - kfree(dest); - - down(&audit_netlink_sem); - - /* The *_rcu iterators not needed here because we are - always called with audit_netlink_sem held. */ - for (i=0; i<AUDIT_NR_FILTERS; i++) { - list_for_each_entry(entry, &audit_filter_list[i], list) - audit_send_reply(pid, seq, AUDIT_LIST, 0, 1, - &entry->rule, sizeof(entry->rule)); - } - audit_send_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0); - - up(&audit_netlink_sem); - return 0; -} - -int audit_receive_filter(int type, int pid, int uid, int seq, void *data, - uid_t loginuid) -{ - struct task_struct *tsk; - int *dest; - int err = 0; - unsigned listnr; - - switch (type) { - case AUDIT_LIST: - /* We can't just spew out the rules here because we might fill - * the available socket buffer space and deadlock waiting for - * auditctl to read from it... which isn't ever going to - * happen if we're actually running in the context of auditctl - * trying to _send_ the stuff */ - - dest = kmalloc(2 * sizeof(int), GFP_KERNEL); - if (!dest) - return -ENOMEM; - dest[0] = pid; - dest[1] = seq; - - tsk = kthread_run(audit_list_rules, dest, "audit_list_rules"); - if (IS_ERR(tsk)) { - kfree(dest); - err = PTR_ERR(tsk); - } - break; - case AUDIT_ADD: - listnr =((struct audit_rule *)data)->flags & ~AUDIT_FILTER_PREPEND; - if (listnr >= AUDIT_NR_FILTERS) - return -EINVAL; - - err = audit_add_rule(data, &audit_filter_list[listnr]); - if (!err) - audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, - "auid=%u added an audit rule\n", loginuid); - break; - case AUDIT_DEL: - listnr =((struct audit_rule *)data)->flags & ~AUDIT_FILTER_PREPEND; - if (listnr >= AUDIT_NR_FILTERS) - return -EINVAL; - - err = audit_del_rule(data, &audit_filter_list[listnr]); - if (!err) - audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, - "auid=%u removed an audit rule\n", loginuid); - break; - default: - return -EINVAL; - } - - return err; -} /* Compare a task_struct with an audit_rule. Return 1 on match, 0 * otherwise. 
*/ static int audit_filter_rules(struct task_struct *tsk, - struct audit_rule *rule, + struct audit_krule *rule, struct audit_context *ctx, enum audit_state *state) { int i, j; for (i = 0; i < rule->field_count; i++) { - u32 field = rule->fields[i] & ~AUDIT_NEGATE; - u32 value = rule->values[i]; + struct audit_field *f = &rule->fields[i]; int result = 0; - switch (field) { + switch (f->type) { case AUDIT_PID: - result = (tsk->pid == value); + result = audit_comparator(tsk->pid, f->op, f->val); break; case AUDIT_UID: - result = (tsk->uid == value); + result = audit_comparator(tsk->uid, f->op, f->val); break; case AUDIT_EUID: - result = (tsk->euid == value); + result = audit_comparator(tsk->euid, f->op, f->val); break; case AUDIT_SUID: - result = (tsk->suid == value); + result = audit_comparator(tsk->suid, f->op, f->val); break; case AUDIT_FSUID: - result = (tsk->fsuid == value); + result = audit_comparator(tsk->fsuid, f->op, f->val); break; case AUDIT_GID: - result = (tsk->gid == value); + result = audit_comparator(tsk->gid, f->op, f->val); break; case AUDIT_EGID: - result = (tsk->egid == value); + result = audit_comparator(tsk->egid, f->op, f->val); break; case AUDIT_SGID: - result = (tsk->sgid == value); + result = audit_comparator(tsk->sgid, f->op, f->val); break; case AUDIT_FSGID: - result = (tsk->fsgid == value); + result = audit_comparator(tsk->fsgid, f->op, f->val); break; case AUDIT_PERS: - result = (tsk->personality == value); + result = audit_comparator(tsk->personality, f->op, f->val); break; case AUDIT_ARCH: - if (ctx) - result = (ctx->arch == value); + if (ctx) + result = audit_comparator(ctx->arch, f->op, f->val); break; case AUDIT_EXIT: if (ctx && ctx->return_valid) - result = (ctx->return_code == value); + result = audit_comparator(ctx->return_code, f->op, f->val); break; case AUDIT_SUCCESS: if (ctx && ctx->return_valid) { - if (value) - result = (ctx->return_valid == AUDITSC_SUCCESS); + if (f->val) + result = audit_comparator(ctx->return_valid, f->op, AUDITSC_SUCCESS); else - result = (ctx->return_valid == AUDITSC_FAILURE); + result = audit_comparator(ctx->return_valid, f->op, AUDITSC_FAILURE); } break; case AUDIT_DEVMAJOR: if (ctx) { for (j = 0; j < ctx->name_count; j++) { - if (MAJOR(ctx->names[j].dev)==value) { + if (audit_comparator(MAJOR(ctx->names[j].dev), f->op, f->val)) { ++result; break; } @@ -460,7 +234,7 @@ static int audit_filter_rules(struct task_struct *tsk, case AUDIT_DEVMINOR: if (ctx) { for (j = 0; j < ctx->name_count; j++) { - if (MINOR(ctx->names[j].dev)==value) { + if (audit_comparator(MINOR(ctx->names[j].dev), f->op, f->val)) { ++result; break; } @@ -470,7 +244,8 @@ static int audit_filter_rules(struct task_struct *tsk, case AUDIT_INODE: if (ctx) { for (j = 0; j < ctx->name_count; j++) { - if (ctx->names[j].ino == value) { + if (audit_comparator(ctx->names[j].ino, f->op, f->val) || + audit_comparator(ctx->names[j].pino, f->op, f->val)) { ++result; break; } @@ -480,19 +255,17 @@ static int audit_filter_rules(struct task_struct *tsk, case AUDIT_LOGINUID: result = 0; if (ctx) - result = (ctx->loginuid == value); + result = audit_comparator(ctx->loginuid, f->op, f->val); break; case AUDIT_ARG0: case AUDIT_ARG1: case AUDIT_ARG2: case AUDIT_ARG3: if (ctx) - result = (ctx->argv[field-AUDIT_ARG0]==value); + result = audit_comparator(ctx->argv[f->type-AUDIT_ARG0], f->op, f->val); break; } - if (rule->fields[i] & AUDIT_NEGATE) - result = !result; if (!result) return 0; } @@ -527,7 +300,7 @@ static enum audit_state audit_filter_task(struct task_struct *tsk) /* At 
syscall entry and exit time, this filter is called if the * audit_state is not low enough that auditing cannot take place, but is * also not high enough that we already know we have to write an audit - * record (i.e., the state is AUDIT_SETUP_CONTEXT or AUDIT_BUILD_CONTEXT). + * record (i.e., the state is AUDIT_SETUP_CONTEXT or AUDIT_BUILD_CONTEXT). */ static enum audit_state audit_filter_syscall(struct task_struct *tsk, struct audit_context *ctx, @@ -541,77 +314,19 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk, rcu_read_lock(); if (!list_empty(list)) { - int word = AUDIT_WORD(ctx->major); - int bit = AUDIT_BIT(ctx->major); - - list_for_each_entry_rcu(e, list, list) { - if ((e->rule.mask[word] & bit) == bit - && audit_filter_rules(tsk, &e->rule, ctx, &state)) { - rcu_read_unlock(); - return state; - } - } - } - rcu_read_unlock(); - return AUDIT_BUILD_CONTEXT; -} - -static int audit_filter_user_rules(struct netlink_skb_parms *cb, - struct audit_rule *rule, - enum audit_state *state) -{ - int i; - - for (i = 0; i < rule->field_count; i++) { - u32 field = rule->fields[i] & ~AUDIT_NEGATE; - u32 value = rule->values[i]; - int result = 0; - - switch (field) { - case AUDIT_PID: - result = (cb->creds.pid == value); - break; - case AUDIT_UID: - result = (cb->creds.uid == value); - break; - case AUDIT_GID: - result = (cb->creds.gid == value); - break; - case AUDIT_LOGINUID: - result = (cb->loginuid == value); - break; - } - - if (rule->fields[i] & AUDIT_NEGATE) - result = !result; - if (!result) - return 0; - } - switch (rule->action) { - case AUDIT_NEVER: *state = AUDIT_DISABLED; break; - case AUDIT_POSSIBLE: *state = AUDIT_BUILD_CONTEXT; break; - case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; - } - return 1; -} - -int audit_filter_user(struct netlink_skb_parms *cb, int type) -{ - struct audit_entry *e; - enum audit_state state; - int ret = 1; - - rcu_read_lock(); - list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) { - if (audit_filter_user_rules(cb, &e->rule, &state)) { - if (state == AUDIT_DISABLED) - ret = 0; - break; + int word = AUDIT_WORD(ctx->major); + int bit = AUDIT_BIT(ctx->major); + + list_for_each_entry_rcu(e, list, list) { + if ((e->rule.mask[word] & bit) == bit + && audit_filter_rules(tsk, &e->rule, ctx, &state)) { + rcu_read_unlock(); + return state; + } } } rcu_read_unlock(); - - return ret; /* Audit by default */ + return AUDIT_BUILD_CONTEXT; } /* This should be called with task_lock() held. 
*/ @@ -654,17 +369,18 @@ static inline void audit_free_names(struct audit_context *context) #if AUDIT_DEBUG == 2 if (context->auditable ||context->put_count + context->ino_count != context->name_count) { - printk(KERN_ERR "audit.c:%d(:%d): major=%d in_syscall=%d" + printk(KERN_ERR "%s:%d(:%d): major=%d in_syscall=%d" " name_count=%d put_count=%d" " ino_count=%d [NOT freeing]\n", - __LINE__, + __FILE__, __LINE__, context->serial, context->major, context->in_syscall, context->name_count, context->put_count, context->ino_count); - for (i = 0; i < context->name_count; i++) + for (i = 0; i < context->name_count; i++) { printk(KERN_ERR "names[%d] = %p = %s\n", i, context->names[i].name, - context->names[i].name); + context->names[i].name ?: "(null)"); + } dump_stack(); return; } @@ -674,9 +390,13 @@ static inline void audit_free_names(struct audit_context *context) context->ino_count = 0; #endif - for (i = 0; i < context->name_count; i++) + for (i = 0; i < context->name_count; i++) { + char *p = context->names[i].ctx; + context->names[i].ctx = NULL; + kfree(p); if (context->names[i].name) __putname(context->names[i].name); + } context->name_count = 0; if (context->pwd) dput(context->pwd); @@ -696,6 +416,12 @@ static inline void audit_free_aux(struct audit_context *context) dput(axi->dentry); mntput(axi->mnt); } + if ( aux->type == AUDIT_IPC ) { + struct audit_aux_data_ipcctl *axi = (void *)aux; + if (axi->ctx) + kfree(axi->ctx); + } + context->aux = aux->next; kfree(aux); } @@ -721,10 +447,15 @@ static inline struct audit_context *audit_alloc_context(enum audit_state state) return context; } -/* Filter on the task information and allocate a per-task audit context +/** + * audit_alloc - allocate an audit context block for a task + * @tsk: task + * + * Filter on the task information and allocate a per-task audit context * if necessary. Doing so turns on system call auditing for the * specified task. This is called from copy_process, so no lock is - * needed. */ + * needed. + */ int audit_alloc(struct task_struct *tsk) { struct audit_context *context; @@ -775,7 +506,37 @@ static inline void audit_free_context(struct audit_context *context) printk(KERN_ERR "audit: freed %d contexts\n", count); } -static void audit_log_task_info(struct audit_buffer *ab) +static void audit_log_task_context(struct audit_buffer *ab, gfp_t gfp_mask) +{ + char *ctx = NULL; + ssize_t len = 0; + + len = security_getprocattr(current, "current", NULL, 0); + if (len < 0) { + if (len != -EINVAL) + goto error_path; + return; + } + + ctx = kmalloc(len, gfp_mask); + if (!ctx) + goto error_path; + + len = security_getprocattr(current, "current", ctx, len); + if (len < 0 ) + goto error_path; + + audit_log_format(ab, " subj=%s", ctx); + return; + +error_path: + if (ctx) + kfree(ctx); + audit_panic("error in audit_log_task_context"); + return; +} + +static void audit_log_task_info(struct audit_buffer *ab, gfp_t gfp_mask) { char name[sizeof(current->comm)]; struct mm_struct *mm = current->mm; @@ -788,6 +549,10 @@ static void audit_log_task_info(struct audit_buffer *ab) if (!mm) return; + /* + * this is brittle; all callers that pass GFP_ATOMIC will have + * NULL current->mm and we won't get here. 
+ */ down_read(&mm->mmap_sem); vma = mm->mmap; while (vma) { @@ -801,6 +566,7 @@ static void audit_log_task_info(struct audit_buffer *ab) vma = vma->vm_next; } up_read(&mm->mmap_sem); + audit_log_task_context(ab, gfp_mask); } static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask) @@ -808,6 +574,7 @@ static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask) int i; struct audit_buffer *ab; struct audit_aux_data *aux; + const char *tty; ab = audit_log_start(context, gfp_mask, AUDIT_SYSCALL); if (!ab) @@ -820,11 +587,15 @@ static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask) audit_log_format(ab, " success=%s exit=%ld", (context->return_valid==AUDITSC_SUCCESS)?"yes":"no", context->return_code); + if (current->signal->tty && current->signal->tty->name) + tty = current->signal->tty->name; + else + tty = "(none)"; audit_log_format(ab, " a0=%lx a1=%lx a2=%lx a3=%lx items=%d" " pid=%d auid=%u uid=%u gid=%u" " euid=%u suid=%u fsuid=%u" - " egid=%u sgid=%u fsgid=%u", + " egid=%u sgid=%u fsgid=%u tty=%s", context->argv[0], context->argv[1], context->argv[2], @@ -835,8 +606,8 @@ static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask) context->uid, context->gid, context->euid, context->suid, context->fsuid, - context->egid, context->sgid, context->fsgid); - audit_log_task_info(ab); + context->egid, context->sgid, context->fsgid, tty); + audit_log_task_info(ab, gfp_mask); audit_log_end(ab); for (aux = context->aux; aux; aux = aux->next) { @@ -849,8 +620,8 @@ static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask) case AUDIT_IPC: { struct audit_aux_data_ipcctl *axi = (void *)aux; audit_log_format(ab, - " qbytes=%lx iuid=%u igid=%u mode=%x", - axi->qbytes, axi->uid, axi->gid, axi->mode); + " qbytes=%lx iuid=%u igid=%u mode=%x obj=%s", + axi->qbytes, axi->uid, axi->gid, axi->mode, axi->ctx); break; } case AUDIT_SOCKETCALL: { @@ -885,42 +656,62 @@ static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask) } } for (i = 0; i < context->name_count; i++) { + unsigned long ino = context->names[i].ino; + unsigned long pino = context->names[i].pino; + ab = audit_log_start(context, gfp_mask, AUDIT_PATH); if (!ab) continue; /* audit_panic has been called */ audit_log_format(ab, "item=%d", i); - if (context->names[i].name) { - audit_log_format(ab, " name="); + + audit_log_format(ab, " name="); + if (context->names[i].name) audit_log_untrustedstring(ab, context->names[i].name); - } - audit_log_format(ab, " flags=%x\n", context->names[i].flags); - - if (context->names[i].ino != (unsigned long)-1) - audit_log_format(ab, " inode=%lu dev=%02x:%02x mode=%#o" - " ouid=%u ogid=%u rdev=%02x:%02x", - context->names[i].ino, - MAJOR(context->names[i].dev), - MINOR(context->names[i].dev), - context->names[i].mode, - context->names[i].uid, - context->names[i].gid, - MAJOR(context->names[i].rdev), + else + audit_log_format(ab, "(null)"); + + if (pino != (unsigned long)-1) + audit_log_format(ab, " parent=%lu", pino); + if (ino != (unsigned long)-1) + audit_log_format(ab, " inode=%lu", ino); + if ((pino != (unsigned long)-1) || (ino != (unsigned long)-1)) + audit_log_format(ab, " dev=%02x:%02x mode=%#o" + " ouid=%u ogid=%u rdev=%02x:%02x", + MAJOR(context->names[i].dev), + MINOR(context->names[i].dev), + context->names[i].mode, + context->names[i].uid, + context->names[i].gid, + MAJOR(context->names[i].rdev), MINOR(context->names[i].rdev)); + if (context->names[i].ctx) { + audit_log_format(ab, " obj=%s", + context->names[i].ctx); 
+ } + audit_log_end(ab); } } -/* Free a per-task audit context. Called from copy_process and - * __put_task_struct. */ +/** + * audit_free - free a per-task audit context + * @tsk: task whose audit context block to free + * + * Called from copy_process and __put_task_struct. + */ void audit_free(struct task_struct *tsk) { struct audit_context *context; - task_lock(tsk); + /* + * No need to lock the task - when we execute audit_free() + * then the task has no external references anymore, and + * we are tearing it down. (The locking also confuses + * DEBUG_LOCKDEP - this freeing may occur in softirq + * contexts as well, via RCU.) + */ context = audit_get_context(tsk, 0, 0); - task_unlock(tsk); - if (likely(!context)) return; @@ -934,13 +725,24 @@ void audit_free(struct task_struct *tsk) audit_free_context(context); } -/* Fill in audit context at syscall entry. This only happens if the +/** + * audit_syscall_entry - fill in an audit record at syscall entry + * @tsk: task being audited + * @arch: architecture type + * @major: major syscall type (function) + * @a1: additional syscall register 1 + * @a2: additional syscall register 2 + * @a3: additional syscall register 3 + * @a4: additional syscall register 4 + * + * Fill in audit context at syscall entry. This only happens if the * audit context was created when the task was created and the state or * filters demand the audit context be built. If the state from the * per-task filter or from the per-syscall filter is AUDIT_RECORD_CONTEXT, * then the record will be written at syscall exit time (otherwise, it * will only be written if another part of the kernel requests that it - * be written). */ + * be written). + */ void audit_syscall_entry(struct task_struct *tsk, int arch, int major, unsigned long a1, unsigned long a2, unsigned long a3, unsigned long a4) @@ -950,7 +752,8 @@ void audit_syscall_entry(struct task_struct *tsk, int arch, int major, BUG_ON(!context); - /* This happens only on certain architectures that make system + /* + * This happens only on certain architectures that make system * calls in kernel_thread via the entry.S interface, instead of * with direct calls. (If you are porting to a new * architecture, hitting this condition can indicate that you @@ -958,7 +761,7 @@ void audit_syscall_entry(struct task_struct *tsk, int arch, int major, * * i386 no * x86_64 no - * ppc64 yes (see arch/ppc64/kernel/misc.S) + * ppc64 yes (see arch/powerpc/platforms/iseries/misc.S) * * This also happens with vm86 emulation in a non-nested manner * (entries without exits), so this case must be caught. @@ -966,11 +769,6 @@ void audit_syscall_entry(struct task_struct *tsk, int arch, int major, if (context->in_syscall) { struct audit_context *newctx; -#if defined(__NR_vm86) && defined(__NR_vm86old) - /* vm86 mode should only be entered once */ - if (major == __NR_vm86 || major == __NR_vm86old) - return; -#endif #if AUDIT_DEBUG printk(KERN_ERR "audit(:%d) pid=%d in syscall=%d;" @@ -1014,11 +812,18 @@ void audit_syscall_entry(struct task_struct *tsk, int arch, int major, context->auditable = !!(state == AUDIT_RECORD_CONTEXT); } -/* Tear down after system call. If the audit context has been marked as +/** + * audit_syscall_exit - deallocate audit context after a system call + * @tsk: task being audited + * @valid: success/failure flag + * @return_code: syscall return value + * + * Tear down after system call. 
If the audit context has been marked as * auditable (either because of the AUDIT_RECORD_CONTEXT state from * filtering, or because some other part of the kernel write an audit * message), then write out the syscall information. In call cases, - * free the names stored from getname(). */ + * free the names stored from getname(). + */ void audit_syscall_exit(struct task_struct *tsk, int valid, long return_code) { struct audit_context *context; @@ -1053,7 +858,13 @@ void audit_syscall_exit(struct task_struct *tsk, int valid, long return_code) put_task_struct(tsk); } -/* Add a name to the list. Called from fs/namei.c:getname(). */ +/** + * audit_getname - add a name to the list + * @name: name to add + * + * Add a name to the list of audit names for this context. + * Called from fs/namei.c:getname(). + */ void audit_getname(const char *name) { struct audit_context *context = current->audit_context; @@ -1082,10 +893,13 @@ void audit_getname(const char *name) } -/* Intercept a putname request. Called from - * include/linux/fs.h:putname(). If we have stored the name from - * getname in the audit context, then we delay the putname until syscall - * exit. */ +/* audit_putname - intercept a putname request + * @name: name to intercept and delay for putname + * + * If we have stored the name from getname in the audit context, + * then we delay the putname until syscall exit. + * Called from include/linux/fs.h:putname(). + */ void audit_putname(const char *name) { struct audit_context *context = current->audit_context; @@ -1100,7 +914,7 @@ void audit_putname(const char *name) for (i = 0; i < context->name_count; i++) printk(KERN_ERR "name[%d] = %p = %s\n", i, context->names[i].name, - context->names[i].name); + context->names[i].name ?: "(null)"); } #endif __putname(name); @@ -1122,9 +936,52 @@ void audit_putname(const char *name) #endif } -/* Store the inode and device from a lookup. Called from - * fs/namei.c:path_lookup(). */ -void audit_inode(const char *name, const struct inode *inode, unsigned flags) +void audit_inode_context(int idx, const struct inode *inode) +{ + struct audit_context *context = current->audit_context; + const char *suffix = security_inode_xattr_getsuffix(); + char *ctx = NULL; + int len = 0; + + if (!suffix) + goto ret; + + len = security_inode_getsecurity(inode, suffix, NULL, 0, 0); + if (len == -EOPNOTSUPP) + goto ret; + if (len < 0) + goto error_path; + + ctx = kmalloc(len, GFP_KERNEL); + if (!ctx) + goto error_path; + + len = security_inode_getsecurity(inode, suffix, ctx, len, 0); + if (len < 0) + goto error_path; + + kfree(context->names[idx].ctx); + context->names[idx].ctx = ctx; + goto ret; + +error_path: + if (ctx) + kfree(ctx); + audit_panic("error in audit_inode_context"); +ret: + return; +} + + +/** + * audit_inode - store the inode and device from a lookup + * @name: name being audited + * @inode: inode being audited + * @flags: lookup flags (as used in path_lookup()) + * + * Called from fs/namei.c:path_lookup(). 
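For orientation, the audit_getname()/audit_putname() hooks above sit underneath fs/namei.c's getname()/putname(); while a syscall has an active audit context, the actual free of the name is deferred to audit_syscall_exit(). A minimal, hypothetical caller (only getname(), putname(), IS_ERR() and PTR_ERR() are the real API):

/* Hypothetical syscall body, for illustration only. */
asmlinkage long sys_frob_path(const char __user *pathname)
{
	char *kname = getname(pathname);	/* audit_getname() records it */
	long err = PTR_ERR(kname);

	if (!IS_ERR(kname)) {
		err = 0;
		/* ... resolve and use kname; __audit_inode() adds the
		 *     inode/device details once the lookup succeeds ... */
		putname(kname);			/* real free deferred to syscall exit */
	}
	return err;
}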
+ */ +void __audit_inode(const char *name, const struct inode *inode, unsigned flags) { int idx; struct audit_context *context = current->audit_context; @@ -1150,15 +1007,105 @@ void audit_inode(const char *name, const struct inode *inode, unsigned flags) ++context->ino_count; #endif } - context->names[idx].flags = flags; - context->names[idx].ino = inode->i_ino; context->names[idx].dev = inode->i_sb->s_dev; context->names[idx].mode = inode->i_mode; context->names[idx].uid = inode->i_uid; context->names[idx].gid = inode->i_gid; context->names[idx].rdev = inode->i_rdev; + audit_inode_context(idx, inode); + if ((flags & LOOKUP_PARENT) && (strcmp(name, "/") != 0) && + (strcmp(name, ".") != 0)) { + context->names[idx].ino = (unsigned long)-1; + context->names[idx].pino = inode->i_ino; + } else { + context->names[idx].ino = inode->i_ino; + context->names[idx].pino = (unsigned long)-1; + } +} + +/** + * audit_inode_child - collect inode info for created/removed objects + * @dname: inode's dentry name + * @inode: inode being audited + * @pino: inode number of dentry parent + * + * For syscalls that create or remove filesystem objects, audit_inode + * can only collect information for the filesystem object's parent. + * This call updates the audit context with the child's information. + * Syscalls that create a new filesystem object must be hooked after + * the object is created. Syscalls that remove a filesystem object + * must be hooked prior, in order to capture the target inode during + * unsuccessful attempts. + */ +void __audit_inode_child(const char *dname, const struct inode *inode, + unsigned long pino) +{ + int idx; + struct audit_context *context = current->audit_context; + + if (!context->in_syscall) + return; + + /* determine matching parent */ + if (dname) + for (idx = 0; idx < context->name_count; idx++) + if (context->names[idx].pino == pino) { + const char *n; + const char *name = context->names[idx].name; + int dlen = strlen(dname); + int nlen = name ? strlen(name) : 0; + + if (nlen < dlen) + continue; + + /* disregard trailing slashes */ + n = name + nlen - 1; + while ((*n == '/') && (n > name)) + n--; + + /* find last path component */ + n = n - dlen + 1; + if (n < name) + continue; + else if (n > name) { + if (*--n != '/') + continue; + else + n++; + } + + if (strncmp(n, dname, dlen) == 0) + goto update_context; + } + + /* catch-all in case match not found */ + idx = context->name_count++; + context->names[idx].name = NULL; + context->names[idx].pino = pino; +#if AUDIT_DEBUG + context->ino_count++; +#endif + +update_context: + if (inode) { + context->names[idx].ino = inode->i_ino; + context->names[idx].dev = inode->i_sb->s_dev; + context->names[idx].mode = inode->i_mode; + context->names[idx].uid = inode->i_uid; + context->names[idx].gid = inode->i_gid; + context->names[idx].rdev = inode->i_rdev; + audit_inode_context(idx, inode); + } } +/** + * auditsc_get_stamp - get local copies of audit_context values + * @ctx: audit_context for the task + * @t: timespec to store time recorded in the audit_context + * @serial: serial value that is recorded in the audit_context + * + * Also sets the context as auditable. + */ void auditsc_get_stamp(struct audit_context *ctx, struct timespec *t, unsigned int *serial) { @@ -1170,6 +1117,15 @@ void auditsc_get_stamp(struct audit_context *ctx, ctx->auditable = 1; } +/** + * audit_set_loginuid - set a task's audit_context loginuid + * @task: task whose audit context is being modified + * @loginuid: loginuid value + * + * Returns 0. 
+ * + * Called (set) from fs/proc/base.c::proc_loginuid_write(). + */ int audit_set_loginuid(struct task_struct *task, uid_t loginuid) { if (task->audit_context) { @@ -1188,12 +1144,59 @@ int audit_set_loginuid(struct task_struct *task, uid_t loginuid) return 0; } +/** + * audit_get_loginuid - get the loginuid for an audit_context + * @ctx: the audit_context + * + * Returns the context's loginuid or -1 if @ctx is NULL. + */ uid_t audit_get_loginuid(struct audit_context *ctx) { return ctx ? ctx->loginuid : -1; } -int audit_ipc_perms(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode) +static char *audit_ipc_context(struct kern_ipc_perm *ipcp) +{ + struct audit_context *context = current->audit_context; + char *ctx = NULL; + int len = 0; + + if (likely(!context)) + return NULL; + + len = security_ipc_getsecurity(ipcp, NULL, 0); + if (len == -EOPNOTSUPP) + goto ret; + if (len < 0) + goto error_path; + + ctx = kmalloc(len, GFP_ATOMIC); + if (!ctx) + goto error_path; + + len = security_ipc_getsecurity(ipcp, ctx, len); + if (len < 0) + goto error_path; + + return ctx; + +error_path: + kfree(ctx); + audit_panic("error in audit_ipc_context"); +ret: + return NULL; +} + +/** + * audit_ipc_perms - record audit data for ipc + * @qbytes: msgq bytes + * @uid: msgq user id + * @gid: msgq group id + * @mode: msgq mode (permissions) + * + * Returns 0 for success or NULL context or < 0 on error. + */ +int audit_ipc_perms(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode, struct kern_ipc_perm *ipcp) { struct audit_aux_data_ipcctl *ax; struct audit_context *context = current->audit_context; @@ -1201,7 +1204,7 @@ int audit_ipc_perms(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode) if (likely(!context)) return 0; - ax = kmalloc(sizeof(*ax), GFP_KERNEL); + ax = kmalloc(sizeof(*ax), GFP_ATOMIC); if (!ax) return -ENOMEM; @@ -1209,6 +1212,7 @@ int audit_ipc_perms(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode) ax->uid = uid; ax->gid = gid; ax->mode = mode; + ax->ctx = audit_ipc_context(ipcp); ax->d.type = AUDIT_IPC; ax->d.next = context->aux; @@ -1216,6 +1220,13 @@ int audit_ipc_perms(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode) return 0; } +/** + * audit_socketcall - record audit data for sys_socketcall + * @nargs: number of args + * @args: args array + * + * Returns 0 for success or NULL context or < 0 on error. + */ int audit_socketcall(int nargs, unsigned long *args) { struct audit_aux_data_socketcall *ax; @@ -1237,6 +1248,13 @@ int audit_socketcall(int nargs, unsigned long *args) return 0; } +/** + * audit_sockaddr - record audit data for sys_bind, sys_connect, sys_sendto + * @len: data length in user space + * @a: data address in kernel space + * + * Returns 0 for success or NULL context or < 0 on error. + */ int audit_sockaddr(int len, void *a) { struct audit_aux_data_sockaddr *ax; @@ -1258,6 +1276,15 @@ int audit_sockaddr(int len, void *a) return 0; } +/** + * audit_avc_path - record the granting or denial of permissions + * @dentry: dentry to record + * @mnt: mnt to record + * + * Returns 0 for success or NULL context or < 0 on error. 
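audit_ipc_perms(), audit_socketcall() and audit_sockaddr() above all share one shape: allocate a type-specific aux record, fill it in, and push it onto context->aux so audit_log_exit() can format it later. A generic sketch of that shape, with a made-up record type (struct audit_aux_data_foo and AUDIT_FOO do not exist in this patch; the linking mirrors audit_socketcall()):

struct audit_aux_data_foo {
	struct audit_aux_data	d;	/* common header: type + next */
	int			value;
};

int audit_foo(int value)
{
	struct audit_aux_data_foo *ax;
	struct audit_context *context = current->audit_context;

	if (likely(!context))		/* no audit context for this task */
		return 0;

	ax = kmalloc(sizeof(*ax), GFP_KERNEL);
	if (!ax)
		return -ENOMEM;

	ax->value = value;

	ax->d.type = AUDIT_FOO;		/* hypothetical record type */
	ax->d.next = context->aux;
	context->aux = (void *)ax;
	return 0;
}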
+ * + * Called from security/selinux/avc.c::avc_audit() + */ int audit_avc_path(struct dentry *dentry, struct vfsmount *mnt) { struct audit_aux_data_path *ax; @@ -1279,6 +1306,14 @@ int audit_avc_path(struct dentry *dentry, struct vfsmount *mnt) return 0; } +/** + * audit_signal_info - record signal info for shutting down audit subsystem + * @sig: signal value + * @t: task being signaled + * + * If the audit subsystem is being terminated, record the task (pid) + * and uid that is doing that. + */ void audit_signal_info(int sig, struct task_struct *t) { extern pid_t audit_sig_pid; @@ -1295,4 +1330,3 @@ void audit_signal_info(int sig, struct task_struct *t) } } } - diff --git a/kernel/capability.c b/kernel/capability.c index bfa3c92e16f..1a4d8a40d3f 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -233,3 +233,19 @@ out: return ret; } + +int __capable(struct task_struct *t, int cap) +{ + if (security_capable(t, cap) == 0) { + t->flags |= PF_SUPERPRIV; + return 1; + } + return 0; +} +EXPORT_SYMBOL(__capable); + +int capable(int cap) +{ + return __capable(current, cap); +} +EXPORT_SYMBOL(capable); diff --git a/kernel/compat.c b/kernel/compat.c index 8c9cd88b678..c1601a84f8d 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -17,10 +17,10 @@ #include <linux/time.h> #include <linux/signal.h> #include <linux/sched.h> /* for MAX_SCHEDULE_TIMEOUT */ -#include <linux/futex.h> /* for FUTEX_WAIT */ #include <linux/syscalls.h> #include <linux/unistd.h> #include <linux/security.h> +#include <linux/timex.h> #include <asm/uaccess.h> @@ -238,28 +238,6 @@ asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set, return ret; } -#ifdef CONFIG_FUTEX -asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, int val, - struct compat_timespec __user *utime, u32 __user *uaddr2, - int val3) -{ - struct timespec t; - unsigned long timeout = MAX_SCHEDULE_TIMEOUT; - int val2 = 0; - - if ((op == FUTEX_WAIT) && utime) { - if (get_compat_timespec(&t, utime)) - return -EFAULT; - timeout = timespec_to_jiffies(&t) + 1; - } - if (op >= FUTEX_REQUEUE) - val2 = (int) (unsigned long) utime; - - return do_futex((unsigned long)uaddr, op, val, timeout, - (unsigned long)uaddr2, val2, val3); -} -#endif - asmlinkage long compat_sys_setrlimit(unsigned int resource, struct compat_rlimit __user *rlim) { @@ -898,3 +876,61 @@ asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat return -ERESTARTNOHAND; } #endif /* __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND */ + +asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp) +{ + struct timex txc; + int ret; + + memset(&txc, 0, sizeof(struct timex)); + + if (!access_ok(VERIFY_READ, utp, sizeof(struct compat_timex)) || + __get_user(txc.modes, &utp->modes) || + __get_user(txc.offset, &utp->offset) || + __get_user(txc.freq, &utp->freq) || + __get_user(txc.maxerror, &utp->maxerror) || + __get_user(txc.esterror, &utp->esterror) || + __get_user(txc.status, &utp->status) || + __get_user(txc.constant, &utp->constant) || + __get_user(txc.precision, &utp->precision) || + __get_user(txc.tolerance, &utp->tolerance) || + __get_user(txc.time.tv_sec, &utp->time.tv_sec) || + __get_user(txc.time.tv_usec, &utp->time.tv_usec) || + __get_user(txc.tick, &utp->tick) || + __get_user(txc.ppsfreq, &utp->ppsfreq) || + __get_user(txc.jitter, &utp->jitter) || + __get_user(txc.shift, &utp->shift) || + __get_user(txc.stabil, &utp->stabil) || + __get_user(txc.jitcnt, &utp->jitcnt) || + __get_user(txc.calcnt, &utp->calcnt) || + __get_user(txc.errcnt, 
&utp->errcnt) || + __get_user(txc.stbcnt, &utp->stbcnt)) + return -EFAULT; + + ret = do_adjtimex(&txc); + + if (!access_ok(VERIFY_WRITE, utp, sizeof(struct compat_timex)) || + __put_user(txc.modes, &utp->modes) || + __put_user(txc.offset, &utp->offset) || + __put_user(txc.freq, &utp->freq) || + __put_user(txc.maxerror, &utp->maxerror) || + __put_user(txc.esterror, &utp->esterror) || + __put_user(txc.status, &utp->status) || + __put_user(txc.constant, &utp->constant) || + __put_user(txc.precision, &utp->precision) || + __put_user(txc.tolerance, &utp->tolerance) || + __put_user(txc.time.tv_sec, &utp->time.tv_sec) || + __put_user(txc.time.tv_usec, &utp->time.tv_usec) || + __put_user(txc.tick, &utp->tick) || + __put_user(txc.ppsfreq, &utp->ppsfreq) || + __put_user(txc.jitter, &utp->jitter) || + __put_user(txc.shift, &utp->shift) || + __put_user(txc.stabil, &utp->stabil) || + __put_user(txc.jitcnt, &utp->jitcnt) || + __put_user(txc.calcnt, &utp->calcnt) || + __put_user(txc.errcnt, &utp->errcnt) || + __put_user(txc.stbcnt, &utp->stbcnt)) + ret = -EFAULT; + + return ret; +} diff --git a/kernel/cpu.c b/kernel/cpu.c index e882c6babf4..fe2b8d0bfe4 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -18,7 +18,7 @@ /* This protects CPUs going up and down... */ static DECLARE_MUTEX(cpucontrol); -static struct notifier_block *cpu_chain; +static BLOCKING_NOTIFIER_HEAD(cpu_chain); #ifdef CONFIG_HOTPLUG_CPU static struct task_struct *lock_cpu_hotplug_owner; @@ -71,21 +71,13 @@ EXPORT_SYMBOL_GPL(lock_cpu_hotplug_interruptible); /* Need to know about CPUs going up/down? */ int register_cpu_notifier(struct notifier_block *nb) { - int ret; - - if ((ret = lock_cpu_hotplug_interruptible()) != 0) - return ret; - ret = notifier_chain_register(&cpu_chain, nb); - unlock_cpu_hotplug(); - return ret; + return blocking_notifier_chain_register(&cpu_chain, nb); } EXPORT_SYMBOL(register_cpu_notifier); void unregister_cpu_notifier(struct notifier_block *nb) { - lock_cpu_hotplug(); - notifier_chain_unregister(&cpu_chain, nb); - unlock_cpu_hotplug(); + blocking_notifier_chain_unregister(&cpu_chain, nb); } EXPORT_SYMBOL(unregister_cpu_notifier); @@ -141,7 +133,7 @@ int cpu_down(unsigned int cpu) goto out; } - err = notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE, + err = blocking_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE, (void *)(long)cpu); if (err == NOTIFY_BAD) { printk("%s: attempt to take down CPU %u failed\n", @@ -159,7 +151,7 @@ int cpu_down(unsigned int cpu) p = __stop_machine_run(take_cpu_down, NULL, cpu); if (IS_ERR(p)) { /* CPU didn't die: tell everyone. Can't complain. */ - if (notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED, + if (blocking_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED, (void *)(long)cpu) == NOTIFY_BAD) BUG(); @@ -182,8 +174,8 @@ int cpu_down(unsigned int cpu) put_cpu(); /* CPU is completely dead: tell everyone. Too late to complain. 
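With cpu_chain now a BLOCKING_NOTIFIER_HEAD, register_cpu_notifier() simply adds to a blocking notifier chain guarded by the chain's own rwsem rather than by the lock_cpu_hotplug() dance, and the callbacks run in process context. A sketch of a typical client (all foo_* names are hypothetical):

static int foo_cpu_callback(struct notifier_block *nfb,
			    unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;

	switch (action) {
	case CPU_ONLINE:
		foo_prepare_cpu(cpu);	/* may sleep, e.g. GFP_KERNEL allocs */
		break;
	case CPU_DEAD:
		foo_release_cpu(cpu);
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block foo_cpu_notifier = {
	.notifier_call = foo_cpu_callback,
};

/* in foo_init():  register_cpu_notifier(&foo_cpu_notifier); */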
*/ - if (notifier_call_chain(&cpu_chain, CPU_DEAD, (void *)(long)cpu) - == NOTIFY_BAD) + if (blocking_notifier_call_chain(&cpu_chain, CPU_DEAD, + (void *)(long)cpu) == NOTIFY_BAD) BUG(); check_for_tasks(cpu); @@ -211,7 +203,7 @@ int __devinit cpu_up(unsigned int cpu) goto out; } - ret = notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu); + ret = blocking_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu); if (ret == NOTIFY_BAD) { printk("%s: attempt to bring up CPU %u failed\n", __FUNCTION__, cpu); @@ -223,15 +215,15 @@ int __devinit cpu_up(unsigned int cpu) ret = __cpu_up(cpu); if (ret != 0) goto out_notify; - if (!cpu_online(cpu)) - BUG(); + BUG_ON(!cpu_online(cpu)); /* Now call notifier in preparation. */ - notifier_call_chain(&cpu_chain, CPU_ONLINE, hcpu); + blocking_notifier_call_chain(&cpu_chain, CPU_ONLINE, hcpu); out_notify: if (ret != 0) - notifier_call_chain(&cpu_chain, CPU_UP_CANCELED, hcpu); + blocking_notifier_call_chain(&cpu_chain, + CPU_UP_CANCELED, hcpu); out: unlock_cpu_hotplug(); return ret; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 12815d3f1a0..18aea1bd128 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -4,15 +4,14 @@ * Processor and Memory placement constraints for sets of tasks. * * Copyright (C) 2003 BULL SA. - * Copyright (C) 2004 Silicon Graphics, Inc. + * Copyright (C) 2004-2006 Silicon Graphics, Inc. * * Portions derived from Patrick Mochel's sysfs code. * sysfs is Copyright (c) 2001-3 Patrick Mochel - * Portions Copyright (c) 2004 Silicon Graphics, Inc. * - * 2003-10-10 Written by Simon Derr <simon.derr@bull.net> + * 2003-10-10 Written by Simon Derr. * 2003-10-22 Updates by Stephen Hemminger. - * 2004 May-July Rework by Paul Jackson <pj@sgi.com> + * 2004 May-July Rework by Paul Jackson. * * This file is subject to the terms and conditions of the GNU General Public * License. See the file COPYING in the main directory of the Linux @@ -53,7 +52,7 @@ #include <asm/uaccess.h> #include <asm/atomic.h> -#include <asm/semaphore.h> +#include <linux/mutex.h> #define CPUSET_SUPER_MAGIC 0x27e0eb @@ -108,37 +107,49 @@ typedef enum { CS_MEM_EXCLUSIVE, CS_MEMORY_MIGRATE, CS_REMOVED, - CS_NOTIFY_ON_RELEASE + CS_NOTIFY_ON_RELEASE, + CS_SPREAD_PAGE, + CS_SPREAD_SLAB, } cpuset_flagbits_t; /* convenient tests for these bits */ static inline int is_cpu_exclusive(const struct cpuset *cs) { - return !!test_bit(CS_CPU_EXCLUSIVE, &cs->flags); + return test_bit(CS_CPU_EXCLUSIVE, &cs->flags); } static inline int is_mem_exclusive(const struct cpuset *cs) { - return !!test_bit(CS_MEM_EXCLUSIVE, &cs->flags); + return test_bit(CS_MEM_EXCLUSIVE, &cs->flags); } static inline int is_removed(const struct cpuset *cs) { - return !!test_bit(CS_REMOVED, &cs->flags); + return test_bit(CS_REMOVED, &cs->flags); } static inline int notify_on_release(const struct cpuset *cs) { - return !!test_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); + return test_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); } static inline int is_memory_migrate(const struct cpuset *cs) { - return !!test_bit(CS_MEMORY_MIGRATE, &cs->flags); + return test_bit(CS_MEMORY_MIGRATE, &cs->flags); +} + +static inline int is_spread_page(const struct cpuset *cs) +{ + return test_bit(CS_SPREAD_PAGE, &cs->flags); +} + +static inline int is_spread_slab(const struct cpuset *cs) +{ + return test_bit(CS_SPREAD_SLAB, &cs->flags); } /* - * Increment this atomic integer everytime any cpuset changes its + * Increment this integer everytime any cpuset changes its * mems_allowed value. 
Users of cpusets can track this generation * number, and avoid having to lock and reload mems_allowed unless * the cpuset they're using changes generation. @@ -152,8 +163,11 @@ static inline int is_memory_migrate(const struct cpuset *cs) * on every visit to __alloc_pages(), to efficiently check whether * its current->cpuset->mems_allowed has changed, requiring an update * of its current->mems_allowed. + * + * Since cpuset_mems_generation is guarded by manage_mutex, + * there is no need to mark it atomic. */ -static atomic_t cpuset_mems_generation = ATOMIC_INIT(1); +static int cpuset_mems_generation; static struct cpuset top_cpuset = { .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)), @@ -168,63 +182,57 @@ static struct vfsmount *cpuset_mount; static struct super_block *cpuset_sb; /* - * We have two global cpuset semaphores below. They can nest. - * It is ok to first take manage_sem, then nest callback_sem. We also + * We have two global cpuset mutexes below. They can nest. + * It is ok to first take manage_mutex, then nest callback_mutex. We also * require taking task_lock() when dereferencing a tasks cpuset pointer. * See "The task_lock() exception", at the end of this comment. * - * A task must hold both semaphores to modify cpusets. If a task - * holds manage_sem, then it blocks others wanting that semaphore, - * ensuring that it is the only task able to also acquire callback_sem + * A task must hold both mutexes to modify cpusets. If a task + * holds manage_mutex, then it blocks others wanting that mutex, + * ensuring that it is the only task able to also acquire callback_mutex * and be able to modify cpusets. It can perform various checks on * the cpuset structure first, knowing nothing will change. It can - * also allocate memory while just holding manage_sem. While it is + * also allocate memory while just holding manage_mutex. While it is * performing these checks, various callback routines can briefly - * acquire callback_sem to query cpusets. Once it is ready to make - * the changes, it takes callback_sem, blocking everyone else. + * acquire callback_mutex to query cpusets. Once it is ready to make + * the changes, it takes callback_mutex, blocking everyone else. * * Calls to the kernel memory allocator can not be made while holding - * callback_sem, as that would risk double tripping on callback_sem + * callback_mutex, as that would risk double tripping on callback_mutex * from one of the callbacks into the cpuset code from within * __alloc_pages(). * - * If a task is only holding callback_sem, then it has read-only + * If a task is only holding callback_mutex, then it has read-only * access to cpusets. * * The task_struct fields mems_allowed and mems_generation may only * be accessed in the context of that task, so require no locks. * * Any task can increment and decrement the count field without lock. - * So in general, code holding manage_sem or callback_sem can't rely + * So in general, code holding manage_mutex or callback_mutex can't rely * on the count field not changing. However, if the count goes to - * zero, then only attach_task(), which holds both semaphores, can + * zero, then only attach_task(), which holds both mutexes, can * increment it again. Because a count of zero means that no tasks * are currently attached, therefore there is no way a task attached * to that cpuset can fork (the other way to increment the count). 
- * So code holding manage_sem or callback_sem can safely assume that + * So code holding manage_mutex or callback_mutex can safely assume that * if the count is zero, it will stay zero. Similarly, if a task - * holds manage_sem or callback_sem on a cpuset with zero count, it + * holds manage_mutex or callback_mutex on a cpuset with zero count, it * knows that the cpuset won't be removed, as cpuset_rmdir() needs - * both of those semaphores. - * - * A possible optimization to improve parallelism would be to make - * callback_sem a R/W semaphore (rwsem), allowing the callback routines - * to proceed in parallel, with read access, until the holder of - * manage_sem needed to take this rwsem for exclusive write access - * and modify some cpusets. + * both of those mutexes. * * The cpuset_common_file_write handler for operations that modify - * the cpuset hierarchy holds manage_sem across the entire operation, + * the cpuset hierarchy holds manage_mutex across the entire operation, * single threading all such cpuset modifications across the system. * - * The cpuset_common_file_read() handlers only hold callback_sem across + * The cpuset_common_file_read() handlers only hold callback_mutex across * small pieces of code, such as when reading out possibly multi-word * cpumasks and nodemasks. * * The fork and exit callbacks cpuset_fork() and cpuset_exit(), don't - * (usually) take either semaphore. These are the two most performance + * (usually) take either mutex. These are the two most performance * critical pieces of code here. The exception occurs on cpuset_exit(), - * when a task in a notify_on_release cpuset exits. Then manage_sem + * when a task in a notify_on_release cpuset exits. Then manage_mutex * is taken, and if the cpuset count is zero, a usermode call made * to /sbin/cpuset_release_agent with the name of the cpuset (path * relative to the root of cpuset file system) as the argument. @@ -242,9 +250,9 @@ static struct super_block *cpuset_sb; * * The need for this exception arises from the action of attach_task(), * which overwrites one tasks cpuset pointer with another. It does - * so using both semaphores, however there are several performance + * so using both mutexes, however there are several performance * critical places that need to reference task->cpuset without the - * expense of grabbing a system global semaphore. Therefore except as + * expense of grabbing a system global mutex. Therefore except as * noted below, when dereferencing or, as in attach_task(), modifying * a tasks cpuset pointer we use task_lock(), which acts on a spinlock * (task->alloc_lock) already in the task_struct routinely used for @@ -256,8 +264,8 @@ static struct super_block *cpuset_sb; * the routine cpuset_update_task_memory_state(). */ -static DECLARE_MUTEX(manage_sem); -static DECLARE_MUTEX(callback_sem); +static DEFINE_MUTEX(manage_mutex); +static DEFINE_MUTEX(callback_mutex); /* * A couple of forward declarations required, due to cyclic reference loop: @@ -432,7 +440,7 @@ static inline struct cftype *__d_cft(struct dentry *dentry) } /* - * Call with manage_sem held. Writes path of cpuset into buf. + * Call with manage_mutex held. Writes path of cpuset into buf. * Returns 0 on success, -errno on error. */ @@ -484,11 +492,11 @@ static int cpuset_path(const struct cpuset *cs, char *buf, int buflen) * status of the /sbin/cpuset_release_agent task, so no sense holding * our caller up for that. 
* - * When we had only one cpuset semaphore, we had to call this + * When we had only one cpuset mutex, we had to call this * without holding it, to avoid deadlock when call_usermodehelper() * allocated memory. With two locks, we could now call this while - * holding manage_sem, but we still don't, so as to minimize - * the time manage_sem is held. + * holding manage_mutex, but we still don't, so as to minimize + * the time manage_mutex is held. */ static void cpuset_release_agent(const char *pathbuf) @@ -520,15 +528,15 @@ static void cpuset_release_agent(const char *pathbuf) * cs is notify_on_release() and now both the user count is zero and * the list of children is empty, prepare cpuset path in a kmalloc'd * buffer, to be returned via ppathbuf, so that the caller can invoke - * cpuset_release_agent() with it later on, once manage_sem is dropped. - * Call here with manage_sem held. + * cpuset_release_agent() with it later on, once manage_mutex is dropped. + * Call here with manage_mutex held. * * This check_for_release() routine is responsible for kmalloc'ing * pathbuf. The above cpuset_release_agent() is responsible for * kfree'ing pathbuf. The caller of these routines is responsible * for providing a pathbuf pointer, initialized to NULL, then - * calling check_for_release() with manage_sem held and the address - * of the pathbuf pointer, then dropping manage_sem, then calling + * calling check_for_release() with manage_mutex held and the address + * of the pathbuf pointer, then dropping manage_mutex, then calling * cpuset_release_agent() with pathbuf, as set by check_for_release(). */ @@ -559,7 +567,7 @@ static void check_for_release(struct cpuset *cs, char **ppathbuf) * One way or another, we guarantee to return some non-empty subset * of cpu_online_map. * - * Call with callback_sem held. + * Call with callback_mutex held. */ static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask) @@ -583,7 +591,7 @@ static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask) * One way or another, we guarantee to return some non-empty subset * of node_online_map. * - * Call with callback_sem held. + * Call with callback_mutex held. */ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) @@ -608,12 +616,12 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) * current->cpuset if a task has its memory placement changed. * Do not call this routine if in_interrupt(). * - * Call without callback_sem or task_lock() held. May be called - * with or without manage_sem held. Doesn't need task_lock to guard + * Call without callback_mutex or task_lock() held. May be called + * with or without manage_mutex held. Doesn't need task_lock to guard * against another task changing a non-NULL cpuset pointer to NULL, * as that is only done by a task on itself, and if the current task * is here, it is not simultaneously in the exit code NULL'ing its - * cpuset pointer. This routine also might acquire callback_sem and + * cpuset pointer. This routine also might acquire callback_mutex and * current->mm->mmap_sem during call. 
* * Reading current->cpuset->mems_generation doesn't need task_lock @@ -658,13 +666,21 @@ void cpuset_update_task_memory_state(void) } if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) { - down(&callback_sem); + mutex_lock(&callback_mutex); task_lock(tsk); cs = tsk->cpuset; /* Maybe changed when task not locked */ guarantee_online_mems(cs, &tsk->mems_allowed); tsk->cpuset_mems_generation = cs->mems_generation; + if (is_spread_page(cs)) + tsk->flags |= PF_SPREAD_PAGE; + else + tsk->flags &= ~PF_SPREAD_PAGE; + if (is_spread_slab(cs)) + tsk->flags |= PF_SPREAD_SLAB; + else + tsk->flags &= ~PF_SPREAD_SLAB; task_unlock(tsk); - up(&callback_sem); + mutex_unlock(&callback_mutex); mpol_rebind_task(tsk, &tsk->mems_allowed); } } @@ -674,7 +690,7 @@ void cpuset_update_task_memory_state(void) * * One cpuset is a subset of another if all its allowed CPUs and * Memory Nodes are a subset of the other, and its exclusive flags - * are only set if the other's are set. Call holding manage_sem. + * are only set if the other's are set. Call holding manage_mutex. */ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) @@ -692,7 +708,7 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) * If we replaced the flag and mask values of the current cpuset * (cur) with those values in the trial cpuset (trial), would * our various subset and exclusive rules still be valid? Presumes - * manage_sem held. + * manage_mutex held. * * 'cur' is the address of an actual, in-use cpuset. Operations * such as list traversal that depend on the actual address of the @@ -746,7 +762,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) * exclusive child cpusets * Build these two partitions by calling partition_sched_domains * - * Call with manage_sem held. May nest a call to the + * Call with manage_mutex held. May nest a call to the * lock_cpu_hotplug()/unlock_cpu_hotplug() pair. */ @@ -792,7 +808,7 @@ static void update_cpu_domains(struct cpuset *cur) } /* - * Call with manage_sem held. May take callback_sem during call. + * Call with manage_mutex held. May take callback_mutex during call. */ static int update_cpumask(struct cpuset *cs, char *buf) @@ -811,9 +827,9 @@ static int update_cpumask(struct cpuset *cs, char *buf) if (retval < 0) return retval; cpus_unchanged = cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed); - down(&callback_sem); + mutex_lock(&callback_mutex); cs->cpus_allowed = trialcs.cpus_allowed; - up(&callback_sem); + mutex_unlock(&callback_mutex); if (is_cpu_exclusive(cs) && !cpus_unchanged) update_cpu_domains(cs); return 0; @@ -827,7 +843,7 @@ static int update_cpumask(struct cpuset *cs, char *buf) * the cpuset is marked 'memory_migrate', migrate the tasks * pages to the new memory. * - * Call with manage_sem held. May take callback_sem during call. + * Call with manage_mutex held. May take callback_mutex during call. * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, * lock each such tasks mm->mmap_sem, scan its vma's and rebind * their mempolicies to the cpusets new mems_allowed. 
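The update_cpumask()/update_nodemask() pair follows the writer-side discipline from the big locking comment above: manage_mutex is held across the whole operation (by cpuset_common_file_write()), a trial copy is validated first, and callback_mutex is taken only around the final publication. A condensed sketch of that shape, using a hypothetical field 'foo':

static int update_foo(struct cpuset *cs, int newval)
{
	struct cpuset trialcs;
	int retval;

	/* caller already holds manage_mutex */
	trialcs = *cs;
	trialcs.foo = newval;		/* 'foo' is illustrative only */

	retval = validate_change(cs, &trialcs);
	if (retval < 0)
		return retval;

	mutex_lock(&callback_mutex);	/* no allocations while this is held */
	cs->foo = trialcs.foo;
	mutex_unlock(&callback_mutex);

	return 0;
}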
@@ -862,11 +878,10 @@ static int update_nodemask(struct cpuset *cs, char *buf) if (retval < 0) goto done; - down(&callback_sem); + mutex_lock(&callback_mutex); cs->mems_allowed = trialcs.mems_allowed; - atomic_inc(&cpuset_mems_generation); - cs->mems_generation = atomic_read(&cpuset_mems_generation); - up(&callback_sem); + cs->mems_generation = cpuset_mems_generation++; + mutex_unlock(&callback_mutex); set_cpuset_being_rebound(cs); /* causes mpol_copy() rebind */ @@ -922,7 +937,7 @@ static int update_nodemask(struct cpuset *cs, char *buf) * tasklist_lock. Forks can happen again now - the mpol_copy() * cpuset_being_rebound check will catch such forks, and rebind * their vma mempolicies too. Because we still hold the global - * cpuset manage_sem, we know that no other rebind effort will + * cpuset manage_mutex, we know that no other rebind effort will * be contending for the global variable cpuset_being_rebound. * It's ok if we rebind the same mm twice; mpol_rebind_mm() * is idempotent. Also migrate pages in each mm to new nodes. @@ -948,7 +963,7 @@ done: } /* - * Call with manage_sem held. + * Call with manage_mutex held. */ static int update_memory_pressure_enabled(struct cpuset *cs, char *buf) @@ -963,11 +978,12 @@ static int update_memory_pressure_enabled(struct cpuset *cs, char *buf) /* * update_flag - read a 0 or a 1 in a file and update associated flag * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, - * CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE) + * CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE, + * CS_SPREAD_PAGE, CS_SPREAD_SLAB) * cs: the cpuset to update * buf: the buffer where we read the 0 or 1 * - * Call with manage_sem held. + * Call with manage_mutex held. */ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) @@ -989,12 +1005,12 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) return err; cpu_exclusive_changed = (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs)); - down(&callback_sem); + mutex_lock(&callback_mutex); if (turning_on) set_bit(bit, &cs->flags); else clear_bit(bit, &cs->flags); - up(&callback_sem); + mutex_unlock(&callback_mutex); if (cpu_exclusive_changed) update_cpu_domains(cs); @@ -1104,7 +1120,7 @@ static int fmeter_getrate(struct fmeter *fmp) * writing the path of the old cpuset in 'ppathbuf' if it needs to be * notified on release. * - * Call holding manage_sem. May take callback_sem and task_lock of + * Call holding manage_mutex. May take callback_mutex and task_lock of * the task 'pid' during call. 
*/ @@ -1144,13 +1160,13 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) get_task_struct(tsk); } - down(&callback_sem); + mutex_lock(&callback_mutex); task_lock(tsk); oldcs = tsk->cpuset; if (!oldcs) { task_unlock(tsk); - up(&callback_sem); + mutex_unlock(&callback_mutex); put_task_struct(tsk); return -ESRCH; } @@ -1164,7 +1180,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) from = oldcs->mems_allowed; to = cs->mems_allowed; - up(&callback_sem); + mutex_unlock(&callback_mutex); mm = get_task_mm(tsk); if (mm) { @@ -1194,6 +1210,8 @@ typedef enum { FILE_NOTIFY_ON_RELEASE, FILE_MEMORY_PRESSURE_ENABLED, FILE_MEMORY_PRESSURE, + FILE_SPREAD_PAGE, + FILE_SPREAD_SLAB, FILE_TASKLIST, } cpuset_filetype_t; @@ -1221,7 +1239,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us } buffer[nbytes] = 0; /* nul-terminate */ - down(&manage_sem); + mutex_lock(&manage_mutex); if (is_removed(cs)) { retval = -ENODEV; @@ -1253,6 +1271,14 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us case FILE_MEMORY_PRESSURE: retval = -EACCES; break; + case FILE_SPREAD_PAGE: + retval = update_flag(CS_SPREAD_PAGE, cs, buffer); + cs->mems_generation = cpuset_mems_generation++; + break; + case FILE_SPREAD_SLAB: + retval = update_flag(CS_SPREAD_SLAB, cs, buffer); + cs->mems_generation = cpuset_mems_generation++; + break; case FILE_TASKLIST: retval = attach_task(cs, buffer, &pathbuf); break; @@ -1264,7 +1290,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us if (retval == 0) retval = nbytes; out2: - up(&manage_sem); + mutex_unlock(&manage_mutex); cpuset_release_agent(pathbuf); out1: kfree(buffer); @@ -1304,9 +1330,9 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs) { cpumask_t mask; - down(&callback_sem); + mutex_lock(&callback_mutex); mask = cs->cpus_allowed; - up(&callback_sem); + mutex_unlock(&callback_mutex); return cpulist_scnprintf(page, PAGE_SIZE, mask); } @@ -1315,9 +1341,9 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) { nodemask_t mask; - down(&callback_sem); + mutex_lock(&callback_mutex); mask = cs->mems_allowed; - up(&callback_sem); + mutex_unlock(&callback_mutex); return nodelist_scnprintf(page, PAGE_SIZE, mask); } @@ -1362,6 +1388,12 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf, case FILE_MEMORY_PRESSURE: s += sprintf(s, "%d", fmeter_getrate(&cs->fmeter)); break; + case FILE_SPREAD_PAGE: + *s++ = is_spread_page(cs) ? '1' : '0'; + break; + case FILE_SPREAD_SLAB: + *s++ = is_spread_slab(cs) ? '1' : '0'; + break; default: retval = -EINVAL; goto out; @@ -1598,7 +1630,7 @@ static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids) * Handle an open on 'tasks' file. Prepare a buffer listing the * process id's of tasks currently attached to the cpuset being opened. * - * Does not require any specific cpuset semaphores, and does not take any. + * Does not require any specific cpuset mutexes, and does not take any. 
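From userspace, the two new per-cpuset files handled by FILE_SPREAD_PAGE/FILE_SPREAD_SLAB above take "0" or "1" like the other flag files. A minimal sketch of toggling page-cache spreading for one cpuset, assuming the conventional /dev/cpuset mount point:

#include <stdio.h>
#include <limits.h>
#include <fcntl.h>
#include <unistd.h>

static int set_spread_page(const char *cpuset_dir, int on)
{
	char path[PATH_MAX];
	int fd, ok;

	snprintf(path, sizeof(path), "%s/memory_spread_page", cpuset_dir);
	fd = open(path, O_WRONLY);
	if (fd < 0)
		return -1;
	ok = (write(fd, on ? "1" : "0", 1) == 1);
	close(fd);
	return ok ? 0 : -1;
}

/* e.g.  set_spread_page("/dev/cpuset/batchjobs", 1); */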
*/ static int cpuset_tasks_open(struct inode *unused, struct file *file) { @@ -1725,6 +1757,16 @@ static struct cftype cft_memory_pressure = { .private = FILE_MEMORY_PRESSURE, }; +static struct cftype cft_spread_page = { + .name = "memory_spread_page", + .private = FILE_SPREAD_PAGE, +}; + +static struct cftype cft_spread_slab = { + .name = "memory_spread_slab", + .private = FILE_SPREAD_SLAB, +}; + static int cpuset_populate_dir(struct dentry *cs_dentry) { int err; @@ -1743,6 +1785,10 @@ static int cpuset_populate_dir(struct dentry *cs_dentry) return err; if ((err = cpuset_add_file(cs_dentry, &cft_memory_pressure)) < 0) return err; + if ((err = cpuset_add_file(cs_dentry, &cft_spread_page)) < 0) + return err; + if ((err = cpuset_add_file(cs_dentry, &cft_spread_slab)) < 0) + return err; if ((err = cpuset_add_file(cs_dentry, &cft_tasks)) < 0) return err; return 0; @@ -1754,7 +1800,7 @@ static int cpuset_populate_dir(struct dentry *cs_dentry) * name: name of the new cpuset. Will be strcpy'ed. * mode: mode to set on new inode * - * Must be called with the semaphore on the parent inode held + * Must be called with the mutex on the parent inode held */ static long cpuset_create(struct cpuset *parent, const char *name, int mode) @@ -1766,44 +1812,47 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode) if (!cs) return -ENOMEM; - down(&manage_sem); + mutex_lock(&manage_mutex); cpuset_update_task_memory_state(); cs->flags = 0; if (notify_on_release(parent)) set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); + if (is_spread_page(parent)) + set_bit(CS_SPREAD_PAGE, &cs->flags); + if (is_spread_slab(parent)) + set_bit(CS_SPREAD_SLAB, &cs->flags); cs->cpus_allowed = CPU_MASK_NONE; cs->mems_allowed = NODE_MASK_NONE; atomic_set(&cs->count, 0); INIT_LIST_HEAD(&cs->sibling); INIT_LIST_HEAD(&cs->children); - atomic_inc(&cpuset_mems_generation); - cs->mems_generation = atomic_read(&cpuset_mems_generation); + cs->mems_generation = cpuset_mems_generation++; fmeter_init(&cs->fmeter); cs->parent = parent; - down(&callback_sem); + mutex_lock(&callback_mutex); list_add(&cs->sibling, &cs->parent->children); number_of_cpusets++; - up(&callback_sem); + mutex_unlock(&callback_mutex); err = cpuset_create_dir(cs, name, mode); if (err < 0) goto err; /* - * Release manage_sem before cpuset_populate_dir() because it + * Release manage_mutex before cpuset_populate_dir() because it * will down() this new directory's i_mutex and if we race with * another mkdir, we might deadlock. 
*/ - up(&manage_sem); + mutex_unlock(&manage_mutex); err = cpuset_populate_dir(cs->dentry); /* If err < 0, we have a half-filled directory - oh well ;) */ return 0; err: list_del(&cs->sibling); - up(&manage_sem); + mutex_unlock(&manage_mutex); kfree(cs); return err; } @@ -1825,18 +1874,18 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) /* the vfs holds both inode->i_mutex already */ - down(&manage_sem); + mutex_lock(&manage_mutex); cpuset_update_task_memory_state(); if (atomic_read(&cs->count) > 0) { - up(&manage_sem); + mutex_unlock(&manage_mutex); return -EBUSY; } if (!list_empty(&cs->children)) { - up(&manage_sem); + mutex_unlock(&manage_mutex); return -EBUSY; } parent = cs->parent; - down(&callback_sem); + mutex_lock(&callback_mutex); set_bit(CS_REMOVED, &cs->flags); if (is_cpu_exclusive(cs)) update_cpu_domains(cs); @@ -1848,10 +1897,10 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) cpuset_d_remove_dir(d); dput(d); number_of_cpusets--; - up(&callback_sem); + mutex_unlock(&callback_mutex); if (list_empty(&parent->children)) check_for_release(parent, &pathbuf); - up(&manage_sem); + mutex_unlock(&manage_mutex); cpuset_release_agent(pathbuf); return 0; } @@ -1867,7 +1916,7 @@ int __init cpuset_init_early(void) struct task_struct *tsk = current; tsk->cpuset = &top_cpuset; - tsk->cpuset->mems_generation = atomic_read(&cpuset_mems_generation); + tsk->cpuset->mems_generation = cpuset_mems_generation++; return 0; } @@ -1886,8 +1935,7 @@ int __init cpuset_init(void) top_cpuset.mems_allowed = NODE_MASK_ALL; fmeter_init(&top_cpuset.fmeter); - atomic_inc(&cpuset_mems_generation); - top_cpuset.mems_generation = atomic_read(&cpuset_mems_generation); + top_cpuset.mems_generation = cpuset_mems_generation++; init_task.cpuset = &top_cpuset; @@ -1960,25 +2008,25 @@ void cpuset_fork(struct task_struct *child) * Description: Detach cpuset from @tsk and release it. * * Note that cpusets marked notify_on_release force every task in - * them to take the global manage_sem semaphore when exiting. + * them to take the global manage_mutex mutex when exiting. * This could impact scaling on very large systems. Be reluctant to * use notify_on_release cpusets where very high task exit scaling * is required on large systems. * * Don't even think about derefencing 'cs' after the cpuset use count - * goes to zero, except inside a critical section guarded by manage_sem - * or callback_sem. Otherwise a zero cpuset use count is a license to + * goes to zero, except inside a critical section guarded by manage_mutex + * or callback_mutex. Otherwise a zero cpuset use count is a license to * any other task to nuke the cpuset immediately, via cpuset_rmdir(). * - * This routine has to take manage_sem, not callback_sem, because - * it is holding that semaphore while calling check_for_release(), - * which calls kmalloc(), so can't be called holding callback__sem(). + * This routine has to take manage_mutex, not callback_mutex, because + * it is holding that mutex while calling check_for_release(), + * which calls kmalloc(), so can't be called holding callback_mutex(). * * We don't need to task_lock() this reference to tsk->cpuset, * because tsk is already marked PF_EXITING, so attach_task() won't * mess with it, or task is a failed fork, never visible to attach_task. * - * Hack: + * the_top_cpuset_hack: * * Set the exiting tasks cpuset to the root cpuset (top_cpuset). 
* @@ -2017,15 +2065,15 @@ void cpuset_exit(struct task_struct *tsk) struct cpuset *cs; cs = tsk->cpuset; - tsk->cpuset = &top_cpuset; /* Hack - see comment above */ + tsk->cpuset = &top_cpuset; /* the_top_cpuset_hack - see above */ if (notify_on_release(cs)) { char *pathbuf = NULL; - down(&manage_sem); + mutex_lock(&manage_mutex); if (atomic_dec_and_test(&cs->count)) check_for_release(cs, &pathbuf); - up(&manage_sem); + mutex_unlock(&manage_mutex); cpuset_release_agent(pathbuf); } else { atomic_dec(&cs->count); @@ -2046,11 +2094,11 @@ cpumask_t cpuset_cpus_allowed(struct task_struct *tsk) { cpumask_t mask; - down(&callback_sem); + mutex_lock(&callback_mutex); task_lock(tsk); guarantee_online_cpus(tsk->cpuset, &mask); task_unlock(tsk); - up(&callback_sem); + mutex_unlock(&callback_mutex); return mask; } @@ -2074,11 +2122,11 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk) { nodemask_t mask; - down(&callback_sem); + mutex_lock(&callback_mutex); task_lock(tsk); guarantee_online_mems(tsk->cpuset, &mask); task_unlock(tsk); - up(&callback_sem); + mutex_unlock(&callback_mutex); return mask; } @@ -2104,7 +2152,7 @@ int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl) /* * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive - * ancestor to the specified cpuset. Call holding callback_sem. + * ancestor to the specified cpuset. Call holding callback_mutex. * If no ancestor is mem_exclusive (an unusual configuration), then * returns the root cpuset. */ @@ -2131,12 +2179,12 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs) * GFP_KERNEL allocations are not so marked, so can escape to the * nearest mem_exclusive ancestor cpuset. * - * Scanning up parent cpusets requires callback_sem. The __alloc_pages() + * Scanning up parent cpusets requires callback_mutex. The __alloc_pages() * routine only calls here with __GFP_HARDWALL bit _not_ set if * it's a GFP_KERNEL allocation, and all nodes in the current tasks * mems_allowed came up empty on the first pass over the zonelist. * So only GFP_KERNEL allocations, if all nodes in the cpuset are - * short of memory, might require taking the callback_sem semaphore. + * short of memory, might require taking the callback_mutex mutex. * * The first loop over the zonelist in mm/page_alloc.c:__alloc_pages() * calls here with __GFP_HARDWALL always set in gfp_mask, enforcing @@ -2157,7 +2205,7 @@ int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) { int node; /* node that zone z is on */ const struct cpuset *cs; /* current cpuset ancestors */ - int allowed = 1; /* is allocation in zone z allowed? */ + int allowed; /* is allocation in zone z allowed? */ if (in_interrupt()) return 1; @@ -2171,31 +2219,31 @@ int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) return 1; /* Not hardwall and node outside mems_allowed: scan up cpusets */ - down(&callback_sem); + mutex_lock(&callback_mutex); task_lock(current); cs = nearest_exclusive_ancestor(current->cpuset); task_unlock(current); allowed = node_isset(node, cs->mems_allowed); - up(&callback_sem); + mutex_unlock(&callback_mutex); return allowed; } /** * cpuset_lock - lock out any changes to cpuset structures * - * The out of memory (oom) code needs to lock down cpusets + * The out of memory (oom) code needs to mutex_lock cpusets * from being changed while it scans the tasklist looking for a - * task in an overlapping cpuset. Expose callback_sem via this + * task in an overlapping cpuset. 
Expose callback_mutex via this * cpuset_lock() routine, so the oom code can lock it, before * locking the task list. The tasklist_lock is a spinlock, so - * must be taken inside callback_sem. + * must be taken inside callback_mutex. */ void cpuset_lock(void) { - down(&callback_sem); + mutex_lock(&callback_mutex); } /** @@ -2206,10 +2254,48 @@ void cpuset_lock(void) void cpuset_unlock(void) { - up(&callback_sem); + mutex_unlock(&callback_mutex); } /** + * cpuset_mem_spread_node() - On which node to begin search for a page + * + * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for + * tasks in a cpuset with is_spread_page or is_spread_slab set), + * and if the memory allocation used cpuset_mem_spread_node() + * to determine on which node to start looking, as it will for + * certain page cache or slab cache pages such as used for file + * system buffers and inode caches, then instead of starting on the + * local node to look for a free page, rather spread the starting + * node around the tasks mems_allowed nodes. + * + * We don't have to worry about the returned node being offline + * because "it can't happen", and even if it did, it would be ok. + * + * The routines calling guarantee_online_mems() are careful to + * only set nodes in task->mems_allowed that are online. So it + * should not be possible for the following code to return an + * offline node. But if it did, that would be ok, as this routine + * is not returning the node where the allocation must be, only + * the node where the search should start. The zonelist passed to + * __alloc_pages() will include all nodes. If the slab allocator + * is passed an offline node, it will fall back to the local node. + * See kmem_cache_alloc_node(). + */ + +int cpuset_mem_spread_node(void) +{ + int node; + + node = next_node(current->cpuset_mem_spread_rotor, current->mems_allowed); + if (node == MAX_NUMNODES) + node = first_node(current->mems_allowed); + current->cpuset_mem_spread_rotor = node; + return node; +} +EXPORT_SYMBOL_GPL(cpuset_mem_spread_node); + +/** * cpuset_excl_nodes_overlap - Do we overlap @p's mem_exclusive ancestors? * @p: pointer to task_struct of some other task. * @@ -2218,7 +2304,7 @@ void cpuset_unlock(void) * determine if task @p's memory usage might impact the memory * available to the current task. * - * Call while holding callback_sem. + * Call while holding callback_mutex. **/ int cpuset_excl_nodes_overlap(const struct task_struct *p) @@ -2289,13 +2375,13 @@ void __cpuset_memory_pressure_bump(void) * - Used for /proc/<pid>/cpuset. * - No need to task_lock(tsk) on this tsk->cpuset reference, as it * doesn't really matter if tsk->cpuset changes after we read it, - * and we take manage_sem, keeping attach_task() from changing it - * anyway. + * and we take manage_mutex, keeping attach_task() from changing it + * anyway. No need to check that tsk->cpuset != NULL, thanks to + * the_top_cpuset_hack in cpuset_exit(), which sets an exiting tasks + * cpuset to top_cpuset. 
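cpuset_mem_spread_node() is meant to be consulted at allocation sites that opt in via the new task flags. A sketch of such a consumer (the wrapper is hypothetical; only cpuset_mem_spread_node(), PF_SPREAD_PAGE, numa_node_id() and alloc_pages_node() are taken as given):

static struct page *foo_alloc_pagecache_page(gfp_t gfp)
{
	int nid = numa_node_id();		/* default: local node */

	if (current->flags & PF_SPREAD_PAGE)
		nid = cpuset_mem_spread_node();	/* rotate over mems_allowed */

	return alloc_pages_node(nid, gfp, 0);	/* order-0 page */
}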
*/ - static int proc_cpuset_show(struct seq_file *m, void *v) { - struct cpuset *cs; struct task_struct *tsk; char *buf; int retval = 0; @@ -2305,20 +2391,14 @@ static int proc_cpuset_show(struct seq_file *m, void *v) return -ENOMEM; tsk = m->private; - down(&manage_sem); - cs = tsk->cpuset; - if (!cs) { - retval = -EINVAL; - goto out; - } - - retval = cpuset_path(cs, buf, PAGE_SIZE); + mutex_lock(&manage_mutex); + retval = cpuset_path(tsk->cpuset, buf, PAGE_SIZE); if (retval < 0) goto out; seq_puts(m, buf); seq_putc(m, '\n'); out: - up(&manage_sem); + mutex_unlock(&manage_mutex); kfree(buf); return retval; } diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c index 867d6dbeb57..c01cead2cfd 100644 --- a/kernel/exec_domain.c +++ b/kernel/exec_domain.c @@ -140,6 +140,7 @@ __set_personality(u_long personality) ep = lookup_exec_domain(personality); if (ep == current_thread_info()->exec_domain) { current->personality = personality; + module_put(ep->module); return 0; } diff --git a/kernel/exit.c b/kernel/exit.c index 531aadca553..bc0ec674d3f 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -29,8 +29,11 @@ #include <linux/cpuset.h> #include <linux/syscalls.h> #include <linux/signal.h> +#include <linux/posix-timers.h> #include <linux/cn_proc.h> #include <linux/mutex.h> +#include <linux/futex.h> +#include <linux/compat.h> #include <asm/uaccess.h> #include <asm/unistd.h> @@ -48,15 +51,80 @@ static void __unhash_process(struct task_struct *p) { nr_threads--; detach_pid(p, PIDTYPE_PID); - detach_pid(p, PIDTYPE_TGID); if (thread_group_leader(p)) { detach_pid(p, PIDTYPE_PGID); detach_pid(p, PIDTYPE_SID); - if (p->pid) - __get_cpu_var(process_counts)--; + + list_del_init(&p->tasks); + __get_cpu_var(process_counts)--; } + list_del_rcu(&p->thread_group); + remove_parent(p); +} + +/* + * This function expects the tasklist_lock write-locked. + */ +static void __exit_signal(struct task_struct *tsk) +{ + struct signal_struct *sig = tsk->signal; + struct sighand_struct *sighand; + + BUG_ON(!sig); + BUG_ON(!atomic_read(&sig->count)); + + rcu_read_lock(); + sighand = rcu_dereference(tsk->sighand); + spin_lock(&sighand->siglock); - REMOVE_LINKS(p); + posix_cpu_timers_exit(tsk); + if (atomic_dec_and_test(&sig->count)) + posix_cpu_timers_exit_group(tsk); + else { + /* + * If there is any task waiting for the group exit + * then notify it: + */ + if (sig->group_exit_task && atomic_read(&sig->count) == sig->notify_count) { + wake_up_process(sig->group_exit_task); + sig->group_exit_task = NULL; + } + if (tsk == sig->curr_target) + sig->curr_target = next_thread(tsk); + /* + * Accumulate here the counters for all threads but the + * group leader as they die, so they can be added into + * the process-wide totals when those are taken. + * The group leader stays around as a zombie as long + * as there are other threads. When it gets reaped, + * the exit.c code will add its counts into these totals. + * We won't ever get here for the group leader, since it + * will have been the last reference on the signal_struct. + */ + sig->utime = cputime_add(sig->utime, tsk->utime); + sig->stime = cputime_add(sig->stime, tsk->stime); + sig->min_flt += tsk->min_flt; + sig->maj_flt += tsk->maj_flt; + sig->nvcsw += tsk->nvcsw; + sig->nivcsw += tsk->nivcsw; + sig->sched_time += tsk->sched_time; + sig = NULL; /* Marker for below. 
*/ + } + + __unhash_process(tsk); + + tsk->signal = NULL; + tsk->sighand = NULL; + spin_unlock(&sighand->siglock); + rcu_read_unlock(); + + __cleanup_sighand(sighand); + clear_tsk_thread_flag(tsk,TIF_SIGPENDING); + flush_sigqueue(&tsk->pending); + if (sig) { + flush_sigqueue(&sig->shared_pending); + __cleanup_signal(sig); + } } void release_task(struct task_struct * p) @@ -65,21 +133,14 @@ void release_task(struct task_struct * p) task_t *leader; struct dentry *proc_dentry; -repeat: +repeat: atomic_dec(&p->user->processes); spin_lock(&p->proc_lock); proc_dentry = proc_pid_unhash(p); write_lock_irq(&tasklist_lock); - if (unlikely(p->ptrace)) - __ptrace_unlink(p); + ptrace_unlink(p); BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children)); __exit_signal(p); - /* - * Note that the fastpath in sys_times depends on __exit_signal having - * updated the counters before a task is removed from the tasklist of - * the process by __unhash_process. - */ - __unhash_process(p); /* * If we are the last non-leader member of the thread @@ -114,21 +175,6 @@ repeat: goto repeat; } -/* we are using it only for SMP init */ - -void unhash_process(struct task_struct *p) -{ - struct dentry *proc_dentry; - - spin_lock(&p->proc_lock); - proc_dentry = proc_pid_unhash(p); - write_lock_irq(&tasklist_lock); - __unhash_process(p); - write_unlock_irq(&tasklist_lock); - spin_unlock(&p->proc_lock); - proc_pid_flush(proc_dentry); -} - /* * This checks not only the pgrp, but falls back on the pid if no * satisfactory pgrp is found. I dunno - gdb doesn't work correctly @@ -236,10 +282,10 @@ static void reparent_to_init(void) ptrace_unlink(current); /* Reparent to init */ - REMOVE_LINKS(current); + remove_parent(current); current->parent = child_reaper; current->real_parent = child_reaper; - SET_LINKS(current); + add_parent(current); /* Set the exit signal to SIGCHLD so we signal init on exit */ current->exit_signal = SIGCHLD; @@ -345,9 +391,9 @@ void daemonize(const char *name, ...) exit_mm(current); set_special_pids(1, 1); - down(&tty_sem); + mutex_lock(&tty_mutex); current->signal->tty = NULL; - up(&tty_sem); + mutex_unlock(&tty_mutex); /* Block and flush all signals */ sigfillset(&blocked); @@ -536,13 +582,13 @@ static void exit_mm(struct task_struct * tsk) mmput(mm); } -static inline void choose_new_parent(task_t *p, task_t *reaper, task_t *child_reaper) +static inline void choose_new_parent(task_t *p, task_t *reaper) { /* * Make sure we're not reparenting to ourselves and that * the parent is not a zombie. */ - BUG_ON(p == reaper || reaper->exit_state >= EXIT_ZOMBIE); + BUG_ON(p == reaper || reaper->exit_state); p->real_parent = reaper; } @@ -567,9 +613,9 @@ static void reparent_thread(task_t *p, task_t *father, int traced) * anyway, so let go of it. */ p->ptrace = 0; - list_del_init(&p->sibling); + remove_parent(p); p->parent = p->real_parent; - list_add_tail(&p->sibling, &p->parent->children); + add_parent(p); /* If we'd notified the old parent about this child's death, * also notify the new parent. 
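daemonize(), whose tty detach now takes tty_mutex above, is still how a raw kernel_thread() detaches itself from user resources in this era. A minimal, hypothetical thread body for context:

static int foo_thread(void *unused)
{
	daemonize("foothread");		/* reparent to init, drop mm and tty */
	allow_signal(SIGKILL);

	while (!signal_pending(current)) {
		/* ... periodic work ... */
		schedule_timeout_interruptible(HZ);
	}
	return 0;
}

/* started with:  kernel_thread(foo_thread, NULL, CLONE_KERNEL); */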
@@ -643,7 +689,7 @@ static void forget_original_parent(struct task_struct * father, if (father == p->real_parent) { /* reparent with a reaper, real father it's us */ - choose_new_parent(p, reaper, child_reaper); + choose_new_parent(p, reaper); reparent_thread(p, father, 0); } else { /* reparent ptraced task to its real parent */ @@ -664,7 +710,7 @@ static void forget_original_parent(struct task_struct * father, } list_for_each_safe(_p, _n, &father->ptrace_children) { p = list_entry(_p,struct task_struct,ptrace_list); - choose_new_parent(p, reaper, child_reaper); + choose_new_parent(p, reaper); reparent_thread(p, father, 1); } } @@ -805,10 +851,8 @@ fastcall NORET_TYPE void do_exit(long code) panic("Aiee, killing interrupt handler!"); if (unlikely(!tsk->pid)) panic("Attempted to kill the idle task!"); - if (unlikely(tsk->pid == 1)) + if (unlikely(tsk == child_reaper)) panic("Attempted to kill init!"); - if (tsk->io_context) - exit_io_context(); if (unlikely(current->ptrace & PT_TRACE_EXIT)) { current->ptrace_message = code; @@ -822,6 +866,8 @@ fastcall NORET_TYPE void do_exit(long code) if (unlikely(tsk->flags & PF_EXITING)) { printk(KERN_ALERT "Fixing recursive fault but reboot is needed!\n"); + if (tsk->io_context) + exit_io_context(); set_current_state(TASK_UNINTERRUPTIBLE); schedule(); } @@ -852,6 +898,12 @@ fastcall NORET_TYPE void do_exit(long code) exit_itimers(tsk->signal); acct_process(code); } + if (unlikely(tsk->robust_list)) + exit_robust_list(tsk); +#ifdef CONFIG_COMPAT + if (unlikely(tsk->compat_robust_list)) + compat_exit_robust_list(tsk); +#endif exit_mm(tsk); exit_sem(tsk); @@ -881,6 +933,9 @@ fastcall NORET_TYPE void do_exit(long code) */ mutex_debug_check_no_locks_held(tsk); + if (tsk->io_context) + exit_io_context(); + /* PF_DEAD causes final put_task_struct after we schedule. */ preempt_disable(); BUG_ON(tsk->flags & PF_DEAD); @@ -909,13 +964,6 @@ asmlinkage long sys_exit(int error_code) do_exit((error_code&0xff)<<8); } -task_t fastcall *next_thread(const task_t *p) -{ - return pid_task(p->pids[PIDTYPE_TGID].pid_list.next, PIDTYPE_TGID); -} - -EXPORT_SYMBOL(next_thread); - /* * Take down every thread in the group. This is called by fatal signals * as well as by sys_exit_group (below). @@ -930,7 +978,6 @@ do_group_exit(int exit_code) else if (!thread_group_empty(current)) { struct signal_struct *const sig = current->signal; struct sighand_struct *const sighand = current->sighand; - read_lock(&tasklist_lock); spin_lock_irq(&sighand->siglock); if (sig->flags & SIGNAL_GROUP_EXIT) /* Another thread got here before we took the lock. 
*/ @@ -940,7 +987,6 @@ do_group_exit(int exit_code) zap_other_threads(current); } spin_unlock_irq(&sighand->siglock); - read_unlock(&tasklist_lock); } do_exit(exit_code); @@ -1270,7 +1316,7 @@ bail_ref: /* move to end of parent's list to avoid starvation */ remove_parent(p); - add_parent(p, p->parent); + add_parent(p); write_unlock_irq(&tasklist_lock); diff --git a/kernel/fork.c b/kernel/fork.c index b373322ca49..b3f7a1bb5e5 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -84,7 +84,7 @@ static kmem_cache_t *task_struct_cachep; #endif /* SLAB cache for signal_struct structures (tsk->signal) */ -kmem_cache_t *signal_cachep; +static kmem_cache_t *signal_cachep; /* SLAB cache for sighand_struct structures (tsk->sighand) */ kmem_cache_t *sighand_cachep; @@ -181,6 +181,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) /* One for us, one for whoever does the "release_task()" (usually parent) */ atomic_set(&tsk->usage,2); atomic_set(&tsk->fs_excl, 0); + tsk->btrace_seq = 0; return tsk; } @@ -607,12 +608,12 @@ static struct files_struct *alloc_files(void) atomic_set(&newf->count, 1); spin_lock_init(&newf->file_lock); + newf->next_fd = 0; fdt = &newf->fdtab; - fdt->next_fd = 0; fdt->max_fds = NR_OPEN_DEFAULT; - fdt->max_fdset = __FD_SETSIZE; - fdt->close_on_exec = &newf->close_on_exec_init; - fdt->open_fds = &newf->open_fds_init; + fdt->max_fdset = EMBEDDED_FD_SET_SIZE; + fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init; + fdt->open_fds = (fd_set *)&newf->open_fds_init; fdt->fd = &newf->fd_array[0]; INIT_RCU_HEAD(&fdt->rcu); fdt->free_files = NULL; @@ -768,8 +769,7 @@ int unshare_files(void) struct files_struct *files = current->files; int rc; - if(!files) - BUG(); + BUG_ON(!files); /* This can race but the race causes us to copy when we don't need to and drop the copy */ @@ -786,14 +786,6 @@ int unshare_files(void) EXPORT_SYMBOL(unshare_files); -void sighand_free_cb(struct rcu_head *rhp) -{ - struct sighand_struct *sp; - - sp = container_of(rhp, struct sighand_struct, rcu); - kmem_cache_free(sighand_cachep, sp); -} - static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk) { struct sighand_struct *sig; @@ -806,12 +798,17 @@ static inline int copy_sighand(unsigned long clone_flags, struct task_struct * t rcu_assign_pointer(tsk->sighand, sig); if (!sig) return -ENOMEM; - spin_lock_init(&sig->siglock); atomic_set(&sig->count, 1); memcpy(sig->action, current->sighand->action, sizeof(sig->action)); return 0; } +void __cleanup_sighand(struct sighand_struct *sighand) +{ + if (atomic_dec_and_test(&sighand->count)) + kmem_cache_free(sighand_cachep, sighand); +} + static inline int copy_signal(unsigned long clone_flags, struct task_struct * tsk) { struct signal_struct *sig; @@ -847,7 +844,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_REL); sig->it_real_incr.tv64 = 0; sig->real_timer.function = it_real_fn; - sig->real_timer.data = tsk; + sig->tsk = tsk; sig->it_virt_expires = cputime_zero; sig->it_virt_incr = cputime_zero; @@ -881,6 +878,22 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts return 0; } +void __cleanup_signal(struct signal_struct *sig) +{ + exit_thread_group_keys(sig); + kmem_cache_free(signal_cachep, sig); +} + +static inline void cleanup_signal(struct task_struct *tsk) +{ + struct signal_struct *sig = tsk->signal; + + atomic_dec(&sig->live); + + if (atomic_dec_and_test(&sig->count)) + 
__cleanup_signal(sig); +} + static inline void copy_flags(unsigned long clone_flags, struct task_struct *p) { unsigned long new_flags = p->flags; @@ -1020,6 +1033,7 @@ static task_t *copy_process(unsigned long clone_flags, p->mempolicy = NULL; goto bad_fork_cleanup_cpuset; } + mpol_fix_fork_child_flag(p); #endif #ifdef CONFIG_DEBUG_MUTEXES @@ -1060,7 +1074,10 @@ static task_t *copy_process(unsigned long clone_flags, * Clear TID on mm_release()? */ p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL; - + p->robust_list = NULL; +#ifdef CONFIG_COMPAT + p->compat_robust_list = NULL; +#endif /* * sigaltstack should be cleared when sharing the same VM */ @@ -1091,6 +1108,7 @@ static task_t *copy_process(unsigned long clone_flags, * We dont wake it up yet. */ p->group_leader = p; + INIT_LIST_HEAD(&p->thread_group); INIT_LIST_HEAD(&p->ptrace_children); INIT_LIST_HEAD(&p->ptrace_list); @@ -1114,16 +1132,6 @@ static task_t *copy_process(unsigned long clone_flags, !cpu_online(task_cpu(p)))) set_task_cpu(p, smp_processor_id()); - /* - * Check for pending SIGKILL! The new thread should not be allowed - * to slip out of an OOM kill. (or normal SIGKILL.) - */ - if (sigismember(¤t->pending.signal, SIGKILL)) { - write_unlock_irq(&tasklist_lock); - retval = -EINTR; - goto bad_fork_cleanup_namespace; - } - /* CLONE_PARENT re-uses the old parent */ if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) p->real_parent = current->real_parent; @@ -1132,6 +1140,23 @@ static task_t *copy_process(unsigned long clone_flags, p->parent = p->real_parent; spin_lock(¤t->sighand->siglock); + + /* + * Process group and session signals need to be delivered to just the + * parent before the fork or both the parent and the child after the + * fork. Restart if a signal comes in before we add the new process to + * it's process group. + * A fatal signal pending means that current will exit, so the new + * thread can't slip out of an OOM kill (or normal SIGKILL). + */ + recalc_sigpending(); + if (signal_pending(current)) { + spin_unlock(¤t->sighand->siglock); + write_unlock_irq(&tasklist_lock); + retval = -ERESTARTNOINTR; + goto bad_fork_cleanup_namespace; + } + if (clone_flags & CLONE_THREAD) { /* * Important: if an exit-all has been started then @@ -1144,17 +1169,9 @@ static task_t *copy_process(unsigned long clone_flags, retval = -EAGAIN; goto bad_fork_cleanup_namespace; } - p->group_leader = current->group_leader; - if (current->signal->group_stop_count > 0) { - /* - * There is an all-stop in progress for the group. - * We ourselves will stop as soon as we check signals. - * Make the new thread part of that group stop too. 
- */ - current->signal->group_stop_count++; - set_tsk_thread_flag(p, TIF_SIGPENDING); - } + p->group_leader = current->group_leader; + list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); if (!cputime_eq(current->signal->it_virt_expires, cputime_zero) || @@ -1177,23 +1194,25 @@ static task_t *copy_process(unsigned long clone_flags, */ p->ioprio = current->ioprio; - SET_LINKS(p); - if (unlikely(p->ptrace & PT_PTRACED)) - __ptrace_link(p, current->parent); - - if (thread_group_leader(p)) { - p->signal->tty = current->signal->tty; - p->signal->pgrp = process_group(current); - p->signal->session = current->signal->session; - attach_pid(p, PIDTYPE_PGID, process_group(p)); - attach_pid(p, PIDTYPE_SID, p->signal->session); - if (p->pid) + if (likely(p->pid)) { + add_parent(p); + if (unlikely(p->ptrace & PT_PTRACED)) + __ptrace_link(p, current->parent); + + if (thread_group_leader(p)) { + p->signal->tty = current->signal->tty; + p->signal->pgrp = process_group(current); + p->signal->session = current->signal->session; + attach_pid(p, PIDTYPE_PGID, process_group(p)); + attach_pid(p, PIDTYPE_SID, p->signal->session); + + list_add_tail(&p->tasks, &init_task.tasks); __get_cpu_var(process_counts)++; + } + attach_pid(p, PIDTYPE_PID, p->pid); + nr_threads++; } - attach_pid(p, PIDTYPE_TGID, p->tgid); - attach_pid(p, PIDTYPE_PID, p->pid); - nr_threads++; total_forks++; spin_unlock(¤t->sighand->siglock); write_unlock_irq(&tasklist_lock); @@ -1208,9 +1227,9 @@ bad_fork_cleanup_mm: if (p->mm) mmput(p->mm); bad_fork_cleanup_signal: - exit_signal(p); + cleanup_signal(p); bad_fork_cleanup_sighand: - exit_sighand(p); + __cleanup_sighand(p->sighand); bad_fork_cleanup_fs: exit_fs(p); /* blocking */ bad_fork_cleanup_files: @@ -1257,7 +1276,7 @@ task_t * __devinit fork_idle(int cpu) if (!task) return ERR_PTR(-ENOMEM); init_idle(task, cpu); - unhash_process(task); + return task; } @@ -1349,11 +1368,21 @@ long do_fork(unsigned long clone_flags, #define ARCH_MIN_MMSTRUCT_ALIGN 0 #endif +static void sighand_ctor(void *data, kmem_cache_t *cachep, unsigned long flags) +{ + struct sighand_struct *sighand = data; + + if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) == + SLAB_CTOR_CONSTRUCTOR) + spin_lock_init(&sighand->siglock); +} + void __init proc_caches_init(void) { sighand_cachep = kmem_cache_create("sighand_cache", sizeof(struct sighand_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU, + sighand_ctor, NULL); signal_cachep = kmem_cache_create("signal_cache", sizeof(struct signal_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); @@ -1534,6 +1563,12 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) check_unshare_flags(&unshare_flags); + /* Return -EINVAL for all unsupported flags */ + err = -EINVAL; + if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| + CLONE_VM|CLONE_FILES|CLONE_SYSVSEM)) + goto bad_unshare_out; + if ((err = unshare_thread(unshare_flags))) goto bad_unshare_out; if ((err = unshare_fs(unshare_flags, &new_fs))) diff --git a/kernel/futex.c b/kernel/futex.c index 5efa2f97803..9c9b2b6b22d 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -8,6 +8,10 @@ * Removed page pinning, fix privately mapped COW pages and other cleanups * (C) Copyright 2003, 2004 Jamie Lokier * + * Robust futex support started by Ingo Molnar + * (C) Copyright 2006 Red Hat Inc, All Rights Reserved + * Thanks to Thomas Gleixner for suggestions, analysis and fixes. 
+ * * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly * enough at me, Linus for the original (flawed) idea, Matthew * Kirkwood for proof-of-concept implementation. @@ -829,6 +833,172 @@ error: goto out; } +/* + * Support for robust futexes: the kernel cleans up held futexes at + * thread exit time. + * + * Implementation: user-space maintains a per-thread list of locks it + * is holding. Upon do_exit(), the kernel carefully walks this list, + * and marks all locks that are owned by this thread with the + * FUTEX_OWNER_DEAD bit, and wakes up a waiter (if any). The list is + * always manipulated with the lock held, so the list is private and + * per-thread. Userspace also maintains a per-thread 'list_op_pending' + * field, to allow the kernel to clean up if the thread dies after + * acquiring the lock, but just before it could have added itself to + * the list. There can only be one such pending lock. + */ + +/** + * sys_set_robust_list - set the robust-futex list head of a task + * @head: pointer to the list-head + * @len: length of the list-head, as userspace expects + */ +asmlinkage long +sys_set_robust_list(struct robust_list_head __user *head, + size_t len) +{ + /* + * The kernel knows only one size for now: + */ + if (unlikely(len != sizeof(*head))) + return -EINVAL; + + current->robust_list = head; + + return 0; +} + +/** + * sys_get_robust_list - get the robust-futex list head of a task + * @pid: pid of the process [zero for current task] + * @head_ptr: pointer to a list-head pointer, the kernel fills it in + * @len_ptr: pointer to a length field, the kernel fills in the header size + */ +asmlinkage long +sys_get_robust_list(int pid, struct robust_list_head __user **head_ptr, + size_t __user *len_ptr) +{ + struct robust_list_head *head; + unsigned long ret; + + if (!pid) + head = current->robust_list; + else { + struct task_struct *p; + + ret = -ESRCH; + read_lock(&tasklist_lock); + p = find_task_by_pid(pid); + if (!p) + goto err_unlock; + ret = -EPERM; + if ((current->euid != p->euid) && (current->euid != p->uid) && + !capable(CAP_SYS_PTRACE)) + goto err_unlock; + head = p->robust_list; + read_unlock(&tasklist_lock); + } + + if (put_user(sizeof(*head), len_ptr)) + return -EFAULT; + return put_user(head, head_ptr); + +err_unlock: + read_unlock(&tasklist_lock); + + return ret; +} + +/* + * Process a futex-list entry, check whether it's owned by the + * dying task, and do notification if so: + */ +int handle_futex_death(u32 __user *uaddr, struct task_struct *curr) +{ + u32 uval; + +retry: + if (get_user(uval, uaddr)) + return -1; + + if ((uval & FUTEX_TID_MASK) == curr->pid) { + /* + * Ok, this dying thread is truly holding a futex + * of interest. Set the OWNER_DIED bit atomically + * via cmpxchg, and if the value had FUTEX_WAITERS + * set, wake up a waiter (if any). (We have to do a + * futex_wake() even if OWNER_DIED is already set - + * to handle the rare but possible case of recursive + * thread-death.) The rest of the cleanup is done in + * userspace. + */ + if (futex_atomic_cmpxchg_inatomic(uaddr, uval, + uval | FUTEX_OWNER_DIED) != uval) + goto retry; + + if (uval & FUTEX_WAITERS) + futex_wake((unsigned long)uaddr, 1); + } + return 0; +} + +/* + * Walk curr->robust_list (very carefully, it's a userspace list!) + * and mark any locks found there dead, and notify any waiters. + * + * We silently return on any sign of list-walking problem. 
+ */ +void exit_robust_list(struct task_struct *curr) +{ + struct robust_list_head __user *head = curr->robust_list; + struct robust_list __user *entry, *pending; + unsigned int limit = ROBUST_LIST_LIMIT; + unsigned long futex_offset; + + /* + * Fetch the list head (which was registered earlier, via + * sys_set_robust_list()): + */ + if (get_user(entry, &head->list.next)) + return; + /* + * Fetch the relative futex offset: + */ + if (get_user(futex_offset, &head->futex_offset)) + return; + /* + * Fetch any possibly pending lock-add first, and handle it + * if it exists: + */ + if (get_user(pending, &head->list_op_pending)) + return; + if (pending) + handle_futex_death((void *)pending + futex_offset, curr); + + while (entry != &head->list) { + /* + * A pending lock might already be on the list, so + * dont process it twice: + */ + if (entry != pending) + if (handle_futex_death((void *)entry + futex_offset, + curr)) + return; + /* + * Fetch the next entry in the list: + */ + if (get_user(entry, &entry->next)) + return; + /* + * Avoid excessively long or circular lists: + */ + if (!--limit) + break; + + cond_resched(); + } +} + long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout, unsigned long uaddr2, int val2, int val3) { diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c new file mode 100644 index 00000000000..54274fc8532 --- /dev/null +++ b/kernel/futex_compat.c @@ -0,0 +1,142 @@ +/* + * linux/kernel/futex_compat.c + * + * Futex compatibililty routines. + * + * Copyright 2006, Red Hat, Inc., Ingo Molnar + */ + +#include <linux/linkage.h> +#include <linux/compat.h> +#include <linux/futex.h> + +#include <asm/uaccess.h> + +/* + * Walk curr->robust_list (very carefully, it's a userspace list!) + * and mark any locks found there dead, and notify any waiters. + * + * We silently return on any sign of list-walking problem. 
+ */ +void compat_exit_robust_list(struct task_struct *curr) +{ + struct compat_robust_list_head __user *head = curr->compat_robust_list; + struct robust_list __user *entry, *pending; + compat_uptr_t uentry, upending; + unsigned int limit = ROBUST_LIST_LIMIT; + compat_long_t futex_offset; + + /* + * Fetch the list head (which was registered earlier, via + * sys_set_robust_list()): + */ + if (get_user(uentry, &head->list.next)) + return; + entry = compat_ptr(uentry); + /* + * Fetch the relative futex offset: + */ + if (get_user(futex_offset, &head->futex_offset)) + return; + /* + * Fetch any possibly pending lock-add first, and handle it + * if it exists: + */ + if (get_user(upending, &head->list_op_pending)) + return; + pending = compat_ptr(upending); + if (upending) + handle_futex_death((void *)pending + futex_offset, curr); + + while (compat_ptr(uentry) != &head->list) { + /* + * A pending lock might already be on the list, so + * dont process it twice: + */ + if (entry != pending) + if (handle_futex_death((void *)entry + futex_offset, + curr)) + return; + + /* + * Fetch the next entry in the list: + */ + if (get_user(uentry, (compat_uptr_t *)&entry->next)) + return; + entry = compat_ptr(uentry); + /* + * Avoid excessively long or circular lists: + */ + if (!--limit) + break; + + cond_resched(); + } +} + +asmlinkage long +compat_sys_set_robust_list(struct compat_robust_list_head __user *head, + compat_size_t len) +{ + if (unlikely(len != sizeof(*head))) + return -EINVAL; + + current->compat_robust_list = head; + + return 0; +} + +asmlinkage long +compat_sys_get_robust_list(int pid, compat_uptr_t *head_ptr, + compat_size_t __user *len_ptr) +{ + struct compat_robust_list_head *head; + unsigned long ret; + + if (!pid) + head = current->compat_robust_list; + else { + struct task_struct *p; + + ret = -ESRCH; + read_lock(&tasklist_lock); + p = find_task_by_pid(pid); + if (!p) + goto err_unlock; + ret = -EPERM; + if ((current->euid != p->euid) && (current->euid != p->uid) && + !capable(CAP_SYS_PTRACE)) + goto err_unlock; + head = p->compat_robust_list; + read_unlock(&tasklist_lock); + } + + if (put_user(sizeof(*head), len_ptr)) + return -EFAULT; + return put_user(ptr_to_compat(head), head_ptr); + +err_unlock: + read_unlock(&tasklist_lock); + + return ret; +} + +asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val, + struct compat_timespec __user *utime, u32 __user *uaddr2, + u32 val3) +{ + struct timespec t; + unsigned long timeout = MAX_SCHEDULE_TIMEOUT; + int val2 = 0; + + if ((op == FUTEX_WAIT) && utime) { + if (get_compat_timespec(&t, utime)) + return -EFAULT; + timeout = timespec_to_jiffies(&t) + 1; + } + if (op >= FUTEX_REQUEUE) + val2 = (int) (unsigned long) utime; + + return do_futex((unsigned long)uaddr, op, val, timeout, + (unsigned long)uaddr2, val2, val3); +} diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 14bc9cfa639..0237a556eb1 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -123,6 +123,26 @@ void ktime_get_ts(struct timespec *ts) EXPORT_SYMBOL_GPL(ktime_get_ts); /* + * Get the coarse grained time at the softirq based on xtime and + * wall_to_monotonic. 
+ */ +static void hrtimer_get_softirq_time(struct hrtimer_base *base) +{ + ktime_t xtim, tomono; + unsigned long seq; + + do { + seq = read_seqbegin(&xtime_lock); + xtim = timespec_to_ktime(xtime); + tomono = timespec_to_ktime(wall_to_monotonic); + + } while (read_seqretry(&xtime_lock, seq)); + + base[CLOCK_REALTIME].softirq_time = xtim; + base[CLOCK_MONOTONIC].softirq_time = ktime_add(xtim, tomono); +} + +/* * Functions and macros which are different for UP/SMP systems are kept in a * single place */ @@ -246,7 +266,7 @@ ktime_t ktime_add_ns(const ktime_t kt, u64 nsec) /* * Divide a ktime value by a nanosecond value */ -static unsigned long ktime_divns(const ktime_t kt, nsec_t div) +static unsigned long ktime_divns(const ktime_t kt, s64 div) { u64 dclc, inc, dns; int sft = 0; @@ -281,18 +301,17 @@ void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) * hrtimer_forward - forward the timer expiry * * @timer: hrtimer to forward + * @now: forward past this time * @interval: the interval to forward * * Forward the timer expiry so it will expire in the future. * Returns the number of overruns. */ unsigned long -hrtimer_forward(struct hrtimer *timer, ktime_t interval) +hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval) { unsigned long orun = 1; - ktime_t delta, now; - - now = timer->base->get_time(); + ktime_t delta; delta = ktime_sub(now, timer->expires); @@ -303,7 +322,7 @@ hrtimer_forward(struct hrtimer *timer, ktime_t interval) interval.tv64 = timer->base->resolution.tv64; if (unlikely(delta.tv64 >= interval.tv64)) { - nsec_t incr = ktime_to_ns(interval); + s64 incr = ktime_to_ns(interval); orun = ktime_divns(delta, incr); timer->expires = ktime_add_ns(timer->expires, incr * orun); @@ -355,8 +374,6 @@ static void enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) rb_link_node(&timer->node, parent, link); rb_insert_color(&timer->node, &base->active); - timer->state = HRTIMER_PENDING; - if (!base->first || timer->expires.tv64 < rb_entry(base->first, struct hrtimer, node)->expires.tv64) base->first = &timer->node; @@ -376,6 +393,7 @@ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) if (base->first == &timer->node) base->first = rb_next(&timer->node); rb_erase(&timer->node, &base->active); + timer->node.rb_parent = HRTIMER_INACTIVE; } /* @@ -386,7 +404,6 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) { if (hrtimer_active(timer)) { __remove_hrtimer(timer, base); - timer->state = HRTIMER_INACTIVE; return 1; } return 0; @@ -560,6 +577,7 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, clock_id = CLOCK_MONOTONIC; timer->base = &bases[clock_id]; + timer->node.rb_parent = HRTIMER_INACTIVE; } /** @@ -586,48 +604,35 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) */ static inline void run_hrtimer_queue(struct hrtimer_base *base) { - ktime_t now = base->get_time(); struct rb_node *node; + if (base->get_softirq_time) + base->softirq_time = base->get_softirq_time(); + spin_lock_irq(&base->lock); while ((node = base->first)) { struct hrtimer *timer; - int (*fn)(void *); + int (*fn)(struct hrtimer *); int restart; - void *data; timer = rb_entry(node, struct hrtimer, node); - if (now.tv64 <= timer->expires.tv64) + if (base->softirq_time.tv64 <= timer->expires.tv64) break; fn = timer->function; - data = timer->data; set_curr_timer(base, timer); - timer->state = HRTIMER_RUNNING; __remove_hrtimer(timer, base); spin_unlock_irq(&base->lock); - /* - * fn == NULL is 
special case for the simplest timer - * variant - wake up process and do not restart: - */ - if (!fn) { - wake_up_process(data); - restart = HRTIMER_NORESTART; - } else - restart = fn(data); + restart = fn(timer); spin_lock_irq(&base->lock); - /* Another CPU has added back the timer */ - if (timer->state != HRTIMER_RUNNING) - continue; - - if (restart == HRTIMER_RESTART) + if (restart != HRTIMER_NORESTART) { + BUG_ON(hrtimer_active(timer)); enqueue_hrtimer(timer, base); - else - timer->state = HRTIMER_EXPIRED; + } } set_curr_timer(base, NULL); spin_unlock_irq(&base->lock); @@ -641,6 +646,8 @@ void hrtimer_run_queues(void) struct hrtimer_base *base = __get_cpu_var(hrtimer_bases); int i; + hrtimer_get_softirq_time(base); + for (i = 0; i < MAX_HRTIMER_BASES; i++) run_hrtimer_queue(&base[i]); } @@ -649,79 +656,70 @@ void hrtimer_run_queues(void) * Sleep related functions: */ -/** - * schedule_hrtimer - sleep until timeout - * - * @timer: hrtimer variable initialized with the correct clock base - * @mode: timeout value is abs/rel - * - * Make the current task sleep until @timeout is - * elapsed. - * - * You can set the task state as follows - - * - * %TASK_UNINTERRUPTIBLE - at least @timeout is guaranteed to - * pass before the routine returns. The routine will return 0 - * - * %TASK_INTERRUPTIBLE - the routine may return early if a signal is - * delivered to the current task. In this case the remaining time - * will be returned - * - * The current task state is guaranteed to be TASK_RUNNING when this - * routine returns. - */ -static ktime_t __sched -schedule_hrtimer(struct hrtimer *timer, const enum hrtimer_mode mode) -{ - /* fn stays NULL, meaning single-shot wakeup: */ - timer->data = current; +struct sleep_hrtimer { + struct hrtimer timer; + struct task_struct *task; + int expired; +}; - hrtimer_start(timer, timer->expires, mode); +static int nanosleep_wakeup(struct hrtimer *timer) +{ + struct sleep_hrtimer *t = + container_of(timer, struct sleep_hrtimer, timer); - schedule(); - hrtimer_cancel(timer); + t->expired = 1; + wake_up_process(t->task); - /* Return the remaining time: */ - if (timer->state != HRTIMER_EXPIRED) - return ktime_sub(timer->expires, timer->base->get_time()); - else - return (ktime_t) {.tv64 = 0 }; + return HRTIMER_NORESTART; } -static inline ktime_t __sched -schedule_hrtimer_interruptible(struct hrtimer *timer, - const enum hrtimer_mode mode) +static int __sched do_nanosleep(struct sleep_hrtimer *t, enum hrtimer_mode mode) { - set_current_state(TASK_INTERRUPTIBLE); + t->timer.function = nanosleep_wakeup; + t->task = current; + t->expired = 0; + + do { + set_current_state(TASK_INTERRUPTIBLE); + hrtimer_start(&t->timer, t->timer.expires, mode); + + schedule(); + + if (unlikely(!t->expired)) { + hrtimer_cancel(&t->timer); + mode = HRTIMER_ABS; + } + } while (!t->expired && !signal_pending(current)); - return schedule_hrtimer(timer, mode); + return t->expired; } static long __sched nanosleep_restart(struct restart_block *restart) { + struct sleep_hrtimer t; struct timespec __user *rmtp; struct timespec tu; - void *rfn_save = restart->fn; - struct hrtimer timer; - ktime_t rem; + ktime_t time; restart->fn = do_no_restart_syscall; - hrtimer_init(&timer, (clockid_t) restart->arg3, HRTIMER_ABS); - - timer.expires.tv64 = ((u64)restart->arg1 << 32) | (u64) restart->arg0; - - rem = schedule_hrtimer_interruptible(&timer, HRTIMER_ABS); + hrtimer_init(&t.timer, restart->arg3, HRTIMER_ABS); + t.timer.expires.tv64 = ((u64)restart->arg1 << 32) | (u64) restart->arg0; - if (rem.tv64 <= 0) 
+ if (do_nanosleep(&t, HRTIMER_ABS)) return 0; rmtp = (struct timespec __user *) restart->arg2; - tu = ktime_to_timespec(rem); - if (rmtp && copy_to_user(rmtp, &tu, sizeof(tu))) - return -EFAULT; + if (rmtp) { + time = ktime_sub(t.timer.expires, t.timer.base->get_time()); + if (time.tv64 <= 0) + return 0; + tu = ktime_to_timespec(time); + if (copy_to_user(rmtp, &tu, sizeof(tu))) + return -EFAULT; + } - restart->fn = rfn_save; + restart->fn = nanosleep_restart; /* The other values in restart are already filled in */ return -ERESTART_RESTARTBLOCK; @@ -731,33 +729,34 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, const enum hrtimer_mode mode, const clockid_t clockid) { struct restart_block *restart; - struct hrtimer timer; + struct sleep_hrtimer t; struct timespec tu; ktime_t rem; - hrtimer_init(&timer, clockid, mode); - - timer.expires = timespec_to_ktime(*rqtp); - - rem = schedule_hrtimer_interruptible(&timer, mode); - if (rem.tv64 <= 0) + hrtimer_init(&t.timer, clockid, mode); + t.timer.expires = timespec_to_ktime(*rqtp); + if (do_nanosleep(&t, mode)) return 0; /* Absolute timers do not update the rmtp value and restart: */ if (mode == HRTIMER_ABS) return -ERESTARTNOHAND; - tu = ktime_to_timespec(rem); - - if (rmtp && copy_to_user(rmtp, &tu, sizeof(tu))) - return -EFAULT; + if (rmtp) { + rem = ktime_sub(t.timer.expires, t.timer.base->get_time()); + if (rem.tv64 <= 0) + return 0; + tu = ktime_to_timespec(rem); + if (copy_to_user(rmtp, &tu, sizeof(tu))) + return -EFAULT; + } restart = ¤t_thread_info()->restart_block; restart->fn = nanosleep_restart; - restart->arg0 = timer.expires.tv64 & 0xFFFFFFFF; - restart->arg1 = timer.expires.tv64 >> 32; + restart->arg0 = t.timer.expires.tv64 & 0xFFFFFFFF; + restart->arg1 = t.timer.expires.tv64 >> 32; restart->arg2 = (unsigned long) rmtp; - restart->arg3 = (unsigned long) timer.base->index; + restart->arg3 = (unsigned long) t.timer.base->index; return -ERESTART_RESTARTBLOCK; } diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index 49378738ff5..2b33f852be3 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -1,5 +1,4 @@ -obj-y := handle.o manage.o spurious.o +obj-y := handle.o manage.o spurious.o migration.o obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o obj-$(CONFIG_PROC_FS) += proc.o - diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 97d5559997d..ac766ad573e 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -204,10 +204,14 @@ int setup_irq(unsigned int irq, struct irqaction * new) p = &desc->action; if ((old = *p) != NULL) { /* Can't share interrupts unless both agree to */ - if (!(old->flags & new->flags & SA_SHIRQ)) { - spin_unlock_irqrestore(&desc->lock,flags); - return -EBUSY; - } + if (!(old->flags & new->flags & SA_SHIRQ)) + goto mismatch; + +#if defined(ARCH_HAS_IRQ_PER_CPU) && defined(SA_PERCPU_IRQ) + /* All handlers must agree on per-cpuness */ + if ((old->flags & IRQ_PER_CPU) != (new->flags & IRQ_PER_CPU)) + goto mismatch; +#endif /* add new interrupt at end of irq queue */ do { @@ -218,7 +222,10 @@ int setup_irq(unsigned int irq, struct irqaction * new) } *p = new; - +#if defined(ARCH_HAS_IRQ_PER_CPU) && defined(SA_PERCPU_IRQ) + if (new->flags & SA_PERCPU_IRQ) + desc->status |= IRQ_PER_CPU; +#endif if (!shared) { desc->depth = 0; desc->status &= ~(IRQ_DISABLED | IRQ_AUTODETECT | @@ -236,6 +243,12 @@ int setup_irq(unsigned int irq, struct irqaction * new) register_handler_proc(irq, new); return 0; + +mismatch: + spin_unlock_irqrestore(&desc->lock, flags); + 
printk(KERN_ERR "%s: irq handler mismatch\n", __FUNCTION__); + dump_stack(); + return -EBUSY; } /** @@ -258,6 +271,7 @@ void free_irq(unsigned int irq, void *dev_id) struct irqaction **p; unsigned long flags; + WARN_ON(in_interrupt()); if (irq >= NR_IRQS) return; diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c new file mode 100644 index 00000000000..52a8655fa08 --- /dev/null +++ b/kernel/irq/migration.c @@ -0,0 +1,65 @@ +#include <linux/irq.h> + +#if defined(CONFIG_GENERIC_PENDING_IRQ) + +void set_pending_irq(unsigned int irq, cpumask_t mask) +{ + irq_desc_t *desc = irq_desc + irq; + unsigned long flags; + + spin_lock_irqsave(&desc->lock, flags); + desc->move_irq = 1; + pending_irq_cpumask[irq] = mask; + spin_unlock_irqrestore(&desc->lock, flags); +} + +void move_native_irq(int irq) +{ + cpumask_t tmp; + irq_desc_t *desc = irq_descp(irq); + + if (likely(!desc->move_irq)) + return; + + /* + * Paranoia: cpu-local interrupts shouldn't be calling in here anyway. + */ + if (CHECK_IRQ_PER_CPU(desc->status)) { + WARN_ON(1); + return; + } + + desc->move_irq = 0; + + if (likely(cpus_empty(pending_irq_cpumask[irq]))) + return; + + if (!desc->handler->set_affinity) + return; + + assert_spin_locked(&desc->lock); + + cpus_and(tmp, pending_irq_cpumask[irq], cpu_online_map); + + /* + * If there was a valid mask to work with, please + * do the disable, re-program, enable sequence. + * This is *not* particularly important for level triggered + * but in a edge trigger case, we might be setting rte + * when an active trigger is comming in. This could + * cause some ioapics to mal-function. + * Being paranoid i guess! + */ + if (unlikely(!cpus_empty(tmp))) { + if (likely(!(desc->status & IRQ_DISABLED))) + desc->handler->disable(irq); + + desc->handler->set_affinity(irq,tmp); + + if (likely(!(desc->status & IRQ_DISABLED))) + desc->handler->enable(irq); + } + cpus_clear(pending_irq_cpumask[irq]); +} + +#endif diff --git a/kernel/itimer.c b/kernel/itimer.c index 379be2f8c84..204ed7939e7 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c @@ -128,21 +128,75 @@ asmlinkage long sys_getitimer(int which, struct itimerval __user *value) /* * The timer is automagically restarted, when interval != 0 */ -int it_real_fn(void *data) +int it_real_fn(struct hrtimer *timer) { - struct task_struct *tsk = (struct task_struct *) data; + struct signal_struct *sig = + container_of(timer, struct signal_struct, real_timer); - send_group_sig_info(SIGALRM, SEND_SIG_PRIV, tsk); - - if (tsk->signal->it_real_incr.tv64 != 0) { - hrtimer_forward(&tsk->signal->real_timer, - tsk->signal->it_real_incr); + send_group_sig_info(SIGALRM, SEND_SIG_PRIV, sig->tsk); + if (sig->it_real_incr.tv64 != 0) { + hrtimer_forward(timer, timer->base->softirq_time, + sig->it_real_incr); return HRTIMER_RESTART; } return HRTIMER_NORESTART; } +/* + * We do not care about correctness. We just sanitize the values so + * the ktime_t operations which expect normalized values do not + * break. This converts negative values to long timeouts similar to + * the code in kernel versions < 2.6.16 + * + * Print a limited number of warning messages when an invalid timeval + * is detected. + */ +static void fixup_timeval(struct timeval *tv, int interval) +{ + static int warnlimit = 10; + unsigned long tmp; + + if (warnlimit > 0) { + warnlimit--; + printk(KERN_WARNING + "setitimer: %s (pid = %d) provided " + "invalid timeval %s: tv_sec = %ld tv_usec = %ld\n", + current->comm, current->pid, + interval ? 
"it_interval" : "it_value", + tv->tv_sec, (long) tv->tv_usec); + } + + tmp = tv->tv_usec; + if (tmp >= USEC_PER_SEC) { + tv->tv_usec = tmp % USEC_PER_SEC; + tv->tv_sec += tmp / USEC_PER_SEC; + } + + tmp = tv->tv_sec; + if (tmp > LONG_MAX) + tv->tv_sec = LONG_MAX; +} + +/* + * Returns true if the timeval is in canonical form + */ +#define timeval_valid(t) \ + (((t)->tv_sec >= 0) && (((unsigned long) (t)->tv_usec) < USEC_PER_SEC)) + +/* + * Check for invalid timevals, sanitize them and print a limited + * number of warnings. + */ +static void check_itimerval(struct itimerval *value) { + + if (unlikely(!timeval_valid(&value->it_value))) + fixup_timeval(&value->it_value, 0); + + if (unlikely(!timeval_valid(&value->it_interval))) + fixup_timeval(&value->it_interval, 1); +} + int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) { struct task_struct *tsk = current; @@ -150,6 +204,18 @@ int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) ktime_t expires; cputime_t cval, cinterval, nval, ninterval; + /* + * Validate the timevals in value. + * + * Note: Although the spec requires that invalid values shall + * return -EINVAL, we just fixup the value and print a limited + * number of warnings in order not to break users of this + * historical misfeature. + * + * Scheduled for replacement in March 2007 + */ + check_itimerval(value); + switch (which) { case ITIMER_REAL: again: @@ -226,6 +292,43 @@ again: return 0; } +/** + * alarm_setitimer - set alarm in seconds + * + * @seconds: number of seconds until alarm + * 0 disables the alarm + * + * Returns the remaining time in seconds of a pending timer or 0 when + * the timer is not active. + * + * On 32 bit machines the seconds value is limited to (INT_MAX/2) to avoid + * negative timeval settings which would cause immediate expiry. + */ +unsigned int alarm_setitimer(unsigned int seconds) +{ + struct itimerval it_new, it_old; + +#if BITS_PER_LONG < 64 + if (seconds > INT_MAX) + seconds = INT_MAX; +#endif + it_new.it_value.tv_sec = seconds; + it_new.it_value.tv_usec = 0; + it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0; + + do_setitimer(ITIMER_REAL, &it_new, &it_old); + + /* + * We can't return 0 if we have an alarm pending ... 
And we'd + * better return too much than too little anyway + */ + if ((!it_old.it_value.tv_sec && it_old.it_value.tv_usec) || + it_old.it_value.tv_usec >= 500000) + it_old.it_value.tv_sec++; + + return it_old.it_value.tv_sec; +} + asmlinkage long sys_setitimer(int which, struct itimerval __user *value, struct itimerval __user *ovalue) diff --git a/kernel/kmod.c b/kernel/kmod.c index 51a892063aa..20a997c73c3 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -170,7 +170,7 @@ static int wait_for_helper(void *data) sa.sa.sa_handler = SIG_IGN; sa.sa.sa_flags = 0; siginitset(&sa.sa.sa_mask, sigmask(SIGCHLD)); - do_sigaction(SIGCHLD, &sa, (struct k_sigaction *)0); + do_sigaction(SIGCHLD, &sa, NULL); allow_signal(SIGCHLD); pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index fef1af8a73c..1156eb0977d 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -48,7 +48,7 @@ static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; -DECLARE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ +DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */ static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; @@ -323,10 +323,10 @@ struct hlist_head __kprobes *kretprobe_inst_table_head(struct task_struct *tsk) } /* - * This function is called from exit_thread or flush_thread when task tk's - * stack is being recycled so that we can recycle any function-return probe - * instances associated with this task. These left over instances represent - * probed functions that have been called but will never return. + * This function is called from finish_task_switch when task tk becomes dead, + * so that we can recycle any function-return probe instances associated + * with this task. These left over instances represent probed functions + * that have been called but will never return. 
*/ void __kprobes kprobe_flush_task(struct task_struct *tk) { @@ -336,7 +336,7 @@ void __kprobes kprobe_flush_task(struct task_struct *tk) unsigned long flags = 0; spin_lock_irqsave(&kretprobe_lock, flags); - head = kretprobe_inst_table_head(current); + head = kretprobe_inst_table_head(tk); hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { if (ri->task == tk) recycle_rp_inst(ri); @@ -460,7 +460,7 @@ static int __kprobes __register_kprobe(struct kprobe *p, } p->nmissed = 0; - down(&kprobe_mutex); + mutex_lock(&kprobe_mutex); old_p = get_kprobe(p->addr); if (old_p) { ret = register_aggr_kprobe(old_p, p); @@ -477,7 +477,7 @@ static int __kprobes __register_kprobe(struct kprobe *p, arch_arm_kprobe(p); out: - up(&kprobe_mutex); + mutex_unlock(&kprobe_mutex); if (ret && probed_mod) module_put(probed_mod); @@ -496,10 +496,10 @@ void __kprobes unregister_kprobe(struct kprobe *p) struct kprobe *old_p, *list_p; int cleanup_p; - down(&kprobe_mutex); + mutex_lock(&kprobe_mutex); old_p = get_kprobe(p->addr); if (unlikely(!old_p)) { - up(&kprobe_mutex); + mutex_unlock(&kprobe_mutex); return; } if (p != old_p) { @@ -507,7 +507,7 @@ void __kprobes unregister_kprobe(struct kprobe *p) if (list_p == p) /* kprobe p is a valid probe */ goto valid_p; - up(&kprobe_mutex); + mutex_unlock(&kprobe_mutex); return; } valid_p: @@ -523,7 +523,7 @@ valid_p: cleanup_p = 0; } - up(&kprobe_mutex); + mutex_unlock(&kprobe_mutex); synchronize_sched(); if (p->mod_refcounted && diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index d5eeae0fa5b..f119e098e67 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -15,9 +15,6 @@ #include <linux/module.h> #include <linux/init.h> -u64 uevent_seqnum; -char uevent_helper[UEVENT_HELPER_PATH_LEN] = "/sbin/hotplug"; - #define KERNEL_ATTR_RO(_name) \ static struct subsys_attribute _name##_attr = __ATTR_RO(_name) @@ -25,7 +22,7 @@ static struct subsys_attribute _name##_attr = __ATTR_RO(_name) static struct subsys_attribute _name##_attr = \ __ATTR(_name, 0644, _name##_show, _name##_store) -#ifdef CONFIG_HOTPLUG +#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) /* current uevent sequence number */ static ssize_t uevent_seqnum_show(struct subsystem *subsys, char *page) { @@ -55,7 +52,7 @@ decl_subsys(kernel, NULL, NULL); EXPORT_SYMBOL_GPL(kernel_subsys); static struct attribute * kernel_attrs[] = { -#ifdef CONFIG_HOTPLUG +#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) &uevent_seqnum_attr.attr, &uevent_helper_attr.attr, #endif diff --git a/kernel/kthread.c b/kernel/kthread.c index e75950a1092..c5f3c6613b6 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -12,6 +12,7 @@ #include <linux/unistd.h> #include <linux/file.h> #include <linux/module.h> +#include <linux/mutex.h> #include <asm/semaphore.h> /* @@ -41,7 +42,7 @@ struct kthread_stop_info /* Thread stopping is done by setthing this var: lock serializes * multiple kthread_stop calls. 
*/ -static DECLARE_MUTEX(kthread_stop_lock); +static DEFINE_MUTEX(kthread_stop_lock); static struct kthread_stop_info kthread_stop_info; int kthread_should_stop(void) @@ -114,7 +115,9 @@ static void keventd_create_kthread(void *_create) create->result = ERR_PTR(pid); } else { wait_for_completion(&create->started); + read_lock(&tasklist_lock); create->result = find_task_by_pid(pid); + read_unlock(&tasklist_lock); } complete(&create->done); } @@ -173,7 +176,7 @@ int kthread_stop_sem(struct task_struct *k, struct semaphore *s) { int ret; - down(&kthread_stop_lock); + mutex_lock(&kthread_stop_lock); /* It could exit after stop_info.k set, but before wake_up_process. */ get_task_struct(k); @@ -194,7 +197,7 @@ int kthread_stop_sem(struct task_struct *k, struct semaphore *s) wait_for_completion(&kthread_stop_info.done); kthread_stop_info.k = NULL; ret = kthread_stop_info.err; - up(&kthread_stop_lock); + mutex_unlock(&kthread_stop_lock); return ret; } diff --git a/kernel/module.c b/kernel/module.c index 5aad477ddc7..bd088a7c149 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -39,6 +39,7 @@ #include <linux/device.h> #include <linux/string.h> #include <linux/sched.h> +#include <linux/mutex.h> #include <asm/uaccess.h> #include <asm/semaphore.h> #include <asm/cacheflush.h> @@ -60,29 +61,20 @@ static DEFINE_SPINLOCK(modlist_lock); /* List of modules, protected by module_mutex AND modlist_lock */ -static DECLARE_MUTEX(module_mutex); +static DEFINE_MUTEX(module_mutex); static LIST_HEAD(modules); -static DECLARE_MUTEX(notify_mutex); -static struct notifier_block * module_notify_list; +static BLOCKING_NOTIFIER_HEAD(module_notify_list); int register_module_notifier(struct notifier_block * nb) { - int err; - down(¬ify_mutex); - err = notifier_chain_register(&module_notify_list, nb); - up(¬ify_mutex); - return err; + return blocking_notifier_chain_register(&module_notify_list, nb); } EXPORT_SYMBOL(register_module_notifier); int unregister_module_notifier(struct notifier_block * nb) { - int err; - down(¬ify_mutex); - err = notifier_chain_unregister(&module_notify_list, nb); - up(¬ify_mutex); - return err; + return blocking_notifier_chain_unregister(&module_notify_list, nb); } EXPORT_SYMBOL(unregister_module_notifier); @@ -126,15 +118,30 @@ extern const struct kernel_symbol __start___ksymtab[]; extern const struct kernel_symbol __stop___ksymtab[]; extern const struct kernel_symbol __start___ksymtab_gpl[]; extern const struct kernel_symbol __stop___ksymtab_gpl[]; +extern const struct kernel_symbol __start___ksymtab_gpl_future[]; +extern const struct kernel_symbol __stop___ksymtab_gpl_future[]; extern const unsigned long __start___kcrctab[]; extern const unsigned long __start___kcrctab_gpl[]; +extern const unsigned long __start___kcrctab_gpl_future[]; #ifndef CONFIG_MODVERSIONS #define symversion(base, idx) NULL #else -#define symversion(base, idx) ((base) ? ((base) + (idx)) : NULL) +#define symversion(base, idx) ((base != NULL) ? 
((base) + (idx)) : NULL) #endif +/* lookup symbol in given range of kernel_symbols */ +static const struct kernel_symbol *lookup_symbol(const char *name, + const struct kernel_symbol *start, + const struct kernel_symbol *stop) +{ + const struct kernel_symbol *ks = start; + for (; ks < stop; ks++) + if (strcmp(ks->name, name) == 0) + return ks; + return NULL; +} + /* Find a symbol, return value, crc and module which owns it */ static unsigned long __find_symbol(const char *name, struct module **owner, @@ -142,64 +149,81 @@ static unsigned long __find_symbol(const char *name, int gplok) { struct module *mod; - unsigned int i; + const struct kernel_symbol *ks; /* Core kernel first. */ *owner = NULL; - for (i = 0; __start___ksymtab+i < __stop___ksymtab; i++) { - if (strcmp(__start___ksymtab[i].name, name) == 0) { - *crc = symversion(__start___kcrctab, i); - return __start___ksymtab[i].value; - } + ks = lookup_symbol(name, __start___ksymtab, __stop___ksymtab); + if (ks) { + *crc = symversion(__start___kcrctab, (ks - __start___ksymtab)); + return ks->value; } if (gplok) { - for (i = 0; __start___ksymtab_gpl+i<__stop___ksymtab_gpl; i++) - if (strcmp(__start___ksymtab_gpl[i].name, name) == 0) { - *crc = symversion(__start___kcrctab_gpl, i); - return __start___ksymtab_gpl[i].value; - } + ks = lookup_symbol(name, __start___ksymtab_gpl, + __stop___ksymtab_gpl); + if (ks) { + *crc = symversion(__start___kcrctab_gpl, + (ks - __start___ksymtab_gpl)); + return ks->value; + } + } + ks = lookup_symbol(name, __start___ksymtab_gpl_future, + __stop___ksymtab_gpl_future); + if (ks) { + if (!gplok) { + printk(KERN_WARNING "Symbol %s is being used " + "by a non-GPL module, which will not " + "be allowed in the future\n", name); + printk(KERN_WARNING "Please see the file " + "Documentation/feature-removal-schedule.txt " + "in the kernel source tree for more " + "details.\n"); + } + *crc = symversion(__start___kcrctab_gpl_future, + (ks - __start___ksymtab_gpl_future)); + return ks->value; } /* Now try modules. 
*/ list_for_each_entry(mod, &modules, list) { *owner = mod; - for (i = 0; i < mod->num_syms; i++) - if (strcmp(mod->syms[i].name, name) == 0) { - *crc = symversion(mod->crcs, i); - return mod->syms[i].value; - } + ks = lookup_symbol(name, mod->syms, mod->syms + mod->num_syms); + if (ks) { + *crc = symversion(mod->crcs, (ks - mod->syms)); + return ks->value; + } if (gplok) { - for (i = 0; i < mod->num_gpl_syms; i++) { - if (strcmp(mod->gpl_syms[i].name, name) == 0) { - *crc = symversion(mod->gpl_crcs, i); - return mod->gpl_syms[i].value; - } + ks = lookup_symbol(name, mod->gpl_syms, + mod->gpl_syms + mod->num_gpl_syms); + if (ks) { + *crc = symversion(mod->gpl_crcs, + (ks - mod->gpl_syms)); + return ks->value; + } + } + ks = lookup_symbol(name, mod->gpl_future_syms, + (mod->gpl_future_syms + + mod->num_gpl_future_syms)); + if (ks) { + if (!gplok) { + printk(KERN_WARNING "Symbol %s is being used " + "by a non-GPL module, which will not " + "be allowed in the future\n", name); + printk(KERN_WARNING "Please see the file " + "Documentation/feature-removal-schedule.txt " + "in the kernel source tree for more " + "details.\n"); } + *crc = symversion(mod->gpl_future_crcs, + (ks - mod->gpl_future_syms)); + return ks->value; } } DEBUGP("Failed to find symbol %s\n", name); return 0; } -/* Find a symbol in this elf symbol table */ -static unsigned long find_local_symbol(Elf_Shdr *sechdrs, - unsigned int symindex, - const char *strtab, - const char *name) -{ - unsigned int i; - Elf_Sym *sym = (void *)sechdrs[symindex].sh_addr; - - /* Search (defined) internal symbols first. */ - for (i = 1; i < sechdrs[symindex].sh_size/sizeof(*sym); i++) { - if (sym[i].st_shndx != SHN_UNDEF - && strcmp(name, strtab + sym[i].st_name) == 0) - return sym[i].st_value; - } - return 0; -} - /* Search for module by name: must hold module_mutex. */ static struct module *find_module(const char *name) { @@ -379,7 +403,6 @@ static inline void percpu_modcopy(void *pcpudst, const void *src, } #endif /* CONFIG_SMP */ -#ifdef CONFIG_MODULE_UNLOAD #define MODINFO_ATTR(field) \ static void setup_modinfo_##field(struct module *mod, const char *s) \ { \ @@ -411,12 +434,7 @@ static struct module_attribute modinfo_##field = { \ MODINFO_ATTR(version); MODINFO_ATTR(srcversion); -static struct module_attribute *modinfo_attrs[] = { - &modinfo_version, - &modinfo_srcversion, - NULL, -}; - +#ifdef CONFIG_MODULE_UNLOAD /* Init the unload section of the module. */ static void module_unload_init(struct module *mod) { @@ -557,7 +575,7 @@ static void free_module(struct module *mod); static void wait_for_zero_refcount(struct module *mod) { /* Since we might sleep for some time, drop the semaphore first */ - up(&module_mutex); + mutex_unlock(&module_mutex); for (;;) { DEBUGP("Looking at refcount...\n"); set_current_state(TASK_UNINTERRUPTIBLE); @@ -566,7 +584,7 @@ static void wait_for_zero_refcount(struct module *mod) schedule(); } current->state = TASK_RUNNING; - down(&module_mutex); + mutex_lock(&module_mutex); } asmlinkage long @@ -583,7 +601,7 @@ sys_delete_module(const char __user *name_user, unsigned int flags) return -EFAULT; name[MODULE_NAME_LEN-1] = '\0'; - if (down_interruptible(&module_mutex) != 0) + if (mutex_lock_interruptible(&module_mutex) != 0) return -EINTR; mod = find_module(name); @@ -632,14 +650,14 @@ sys_delete_module(const char __user *name_user, unsigned int flags) /* Final destruction now noone is using it. 
*/ if (mod->exit != NULL) { - up(&module_mutex); + mutex_unlock(&module_mutex); mod->exit(); - down(&module_mutex); + mutex_lock(&module_mutex); } free_module(mod); out: - up(&module_mutex); + mutex_unlock(&module_mutex); return ret; } @@ -731,138 +749,14 @@ static inline void module_unload_init(struct module *mod) } #endif /* CONFIG_MODULE_UNLOAD */ -#ifdef CONFIG_OBSOLETE_MODPARM -/* Bounds checking done below */ -static int obsparm_copy_string(const char *val, struct kernel_param *kp) -{ - strcpy(kp->arg, val); - return 0; -} - -static int set_obsolete(const char *val, struct kernel_param *kp) -{ - unsigned int min, max; - unsigned int size, maxsize; - int dummy; - char *endp; - const char *p; - struct obsolete_modparm *obsparm = kp->arg; - - if (!val) { - printk(KERN_ERR "Parameter %s needs an argument\n", kp->name); - return -EINVAL; - } - - /* type is: [min[-max]]{b,h,i,l,s} */ - p = obsparm->type; - min = simple_strtol(p, &endp, 10); - if (endp == obsparm->type) - min = max = 1; - else if (*endp == '-') { - p = endp+1; - max = simple_strtol(p, &endp, 10); - } else - max = min; - switch (*endp) { - case 'b': - return param_array(kp->name, val, min, max, obsparm->addr, - 1, param_set_byte, &dummy); - case 'h': - return param_array(kp->name, val, min, max, obsparm->addr, - sizeof(short), param_set_short, &dummy); - case 'i': - return param_array(kp->name, val, min, max, obsparm->addr, - sizeof(int), param_set_int, &dummy); - case 'l': - return param_array(kp->name, val, min, max, obsparm->addr, - sizeof(long), param_set_long, &dummy); - case 's': - return param_array(kp->name, val, min, max, obsparm->addr, - sizeof(char *), param_set_charp, &dummy); - - case 'c': - /* Undocumented: 1-5c50 means 1-5 strings of up to 49 chars, - and the decl is "char xxx[5][50];" */ - p = endp+1; - maxsize = simple_strtol(p, &endp, 10); - /* We check lengths here (yes, this is a hack). 
*/ - p = val; - while (p[size = strcspn(p, ",")]) { - if (size >= maxsize) - goto oversize; - p += size+1; - } - if (size >= maxsize) - goto oversize; - return param_array(kp->name, val, min, max, obsparm->addr, - maxsize, obsparm_copy_string, &dummy); - } - printk(KERN_ERR "Unknown obsolete parameter type %s\n", obsparm->type); - return -EINVAL; - oversize: - printk(KERN_ERR - "Parameter %s doesn't fit in %u chars.\n", kp->name, maxsize); - return -EINVAL; -} - -static int obsolete_params(const char *name, - char *args, - struct obsolete_modparm obsparm[], - unsigned int num, - Elf_Shdr *sechdrs, - unsigned int symindex, - const char *strtab) -{ - struct kernel_param *kp; - unsigned int i; - int ret; - - kp = kmalloc(sizeof(kp[0]) * num, GFP_KERNEL); - if (!kp) - return -ENOMEM; - - for (i = 0; i < num; i++) { - char sym_name[128 + sizeof(MODULE_SYMBOL_PREFIX)]; - - snprintf(sym_name, sizeof(sym_name), "%s%s", - MODULE_SYMBOL_PREFIX, obsparm[i].name); - - kp[i].name = obsparm[i].name; - kp[i].perm = 000; - kp[i].set = set_obsolete; - kp[i].get = NULL; - obsparm[i].addr - = (void *)find_local_symbol(sechdrs, symindex, strtab, - sym_name); - if (!obsparm[i].addr) { - printk("%s: falsely claims to have parameter %s\n", - name, obsparm[i].name); - ret = -EINVAL; - goto out; - } - kp[i].arg = &obsparm[i]; - } - - ret = parse_args(name, args, kp, num, NULL); - out: - kfree(kp); - return ret; -} -#else -static int obsolete_params(const char *name, - char *args, - struct obsolete_modparm obsparm[], - unsigned int num, - Elf_Shdr *sechdrs, - unsigned int symindex, - const char *strtab) -{ - if (num != 0) - printk(KERN_WARNING "%s: Ignoring obsolete parameters\n", - name); - return 0; -} -#endif /* CONFIG_OBSOLETE_MODPARM */ +static struct module_attribute *modinfo_attrs[] = { + &modinfo_version, + &modinfo_srcversion, +#ifdef CONFIG_MODULE_UNLOAD + &refcnt, +#endif + NULL, +}; static const char vermagic[] = VERMAGIC_STRING; @@ -1056,37 +950,28 @@ static inline void remove_sect_attrs(struct module *mod) } #endif /* CONFIG_KALLSYMS */ - -#ifdef CONFIG_MODULE_UNLOAD -static inline int module_add_refcnt_attr(struct module *mod) -{ - return sysfs_create_file(&mod->mkobj.kobj, &refcnt.attr); -} -static void module_remove_refcnt_attr(struct module *mod) -{ - return sysfs_remove_file(&mod->mkobj.kobj, &refcnt.attr); -} -#else -static inline int module_add_refcnt_attr(struct module *mod) -{ - return 0; -} -static void module_remove_refcnt_attr(struct module *mod) -{ -} -#endif - -#ifdef CONFIG_MODULE_UNLOAD static int module_add_modinfo_attrs(struct module *mod) { struct module_attribute *attr; + struct module_attribute *temp_attr; int error = 0; int i; + mod->modinfo_attrs = kzalloc((sizeof(struct module_attribute) * + (ARRAY_SIZE(modinfo_attrs) + 1)), + GFP_KERNEL); + if (!mod->modinfo_attrs) + return -ENOMEM; + + temp_attr = mod->modinfo_attrs; for (i = 0; (attr = modinfo_attrs[i]) && !error; i++) { if (!attr->test || - (attr->test && attr->test(mod))) - error = sysfs_create_file(&mod->mkobj.kobj,&attr->attr); + (attr->test && attr->test(mod))) { + memcpy(temp_attr, attr, sizeof(*temp_attr)); + temp_attr->attr.owner = mod; + error = sysfs_create_file(&mod->mkobj.kobj,&temp_attr->attr); + ++temp_attr; + } } return error; } @@ -1096,12 +981,16 @@ static void module_remove_modinfo_attrs(struct module *mod) struct module_attribute *attr; int i; - for (i = 0; (attr = modinfo_attrs[i]); i++) { + for (i = 0; (attr = &mod->modinfo_attrs[i]); i++) { + /* pick a field to test for end of list */ + if 
(!attr->attr.name) + break; sysfs_remove_file(&mod->mkobj.kobj,&attr->attr); - attr->free(mod); + if (attr->free) + attr->free(mod); } + kfree(mod->modinfo_attrs); } -#endif static int mod_sysfs_setup(struct module *mod, struct kernel_param *kparam, @@ -1119,19 +1008,13 @@ static int mod_sysfs_setup(struct module *mod, if (err) goto out; - err = module_add_refcnt_attr(mod); - if (err) - goto out_unreg; - err = module_param_sysfs_setup(mod, kparam, num_params); if (err) goto out_unreg; -#ifdef CONFIG_MODULE_UNLOAD err = module_add_modinfo_attrs(mod); if (err) goto out_unreg; -#endif return 0; @@ -1143,10 +1026,7 @@ out: static void mod_kobject_remove(struct module *mod) { -#ifdef CONFIG_MODULE_UNLOAD module_remove_modinfo_attrs(mod); -#endif - module_remove_refcnt_attr(mod); module_param_sysfs_remove(mod); kobject_unregister(&mod->mkobj.kobj); @@ -1424,7 +1304,6 @@ static char *get_modinfo(Elf_Shdr *sechdrs, return NULL; } -#ifdef CONFIG_MODULE_UNLOAD static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs, unsigned int infoindex) { @@ -1439,23 +1318,17 @@ static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs, attr->attr.name)); } } -#endif #ifdef CONFIG_KALLSYMS int is_exported(const char *name, const struct module *mod) { - unsigned int i; - - if (!mod) { - for (i = 0; __start___ksymtab+i < __stop___ksymtab; i++) - if (strcmp(__start___ksymtab[i].name, name) == 0) - return 1; - return 0; - } - for (i = 0; i < mod->num_syms; i++) - if (strcmp(mod->syms[i].name, name) == 0) + if (!mod && lookup_symbol(name, __start___ksymtab, __stop___ksymtab)) + return 1; + else + if (lookup_symbol(name, mod->syms, mod->syms + mod->num_syms)) return 1; - return 0; + else + return 0; } /* As per nm */ @@ -1537,8 +1410,8 @@ static struct module *load_module(void __user *umod, char *secstrings, *args, *modmagic, *strtab = NULL; unsigned int i, symindex = 0, strindex = 0, setupindex, exindex, exportindex, modindex, obsparmindex, infoindex, gplindex, - crcindex, gplcrcindex, versindex, pcpuindex; - long arglen; + crcindex, gplcrcindex, versindex, pcpuindex, gplfutureindex, + gplfuturecrcindex; struct module *mod; long err = 0; void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ @@ -1618,8 +1491,10 @@ static struct module *load_module(void __user *umod, /* Optional sections */ exportindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab"); gplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl"); + gplfutureindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl_future"); crcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab"); gplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl"); + gplfuturecrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl_future"); setupindex = find_sec(hdr, sechdrs, secstrings, "__param"); exindex = find_sec(hdr, sechdrs, secstrings, "__ex_table"); obsparmindex = find_sec(hdr, sechdrs, secstrings, "__obsparm"); @@ -1655,23 +1530,11 @@ static struct module *load_module(void __user *umod, } /* Now copy in args */ - arglen = strlen_user(uargs); - if (!arglen) { - err = -EFAULT; + args = strndup_user(uargs, ~0UL >> 1); + if (IS_ERR(args)) { + err = PTR_ERR(args); goto free_hdr; } - args = kmalloc(arglen, GFP_KERNEL); - if (!args) { - err = -ENOMEM; - goto free_hdr; - } - if (copy_from_user(args, uargs, arglen) != 0) { - err = -EFAULT; - goto free_mod; - } - - /* Userspace could have altered the string after the strlen_user() */ - args[arglen - 1] = '\0'; if (find_module(mod->name)) { err = -EEXIST; @@ -1755,10 +1618,8 @@ static 
struct module *load_module(void __user *umod, if (strcmp(mod->name, "driverloader") == 0) add_taint(TAINT_PROPRIETARY_MODULE); -#ifdef CONFIG_MODULE_UNLOAD /* Set up MODINFO_ATTR fields */ setup_modinfo(mod, sechdrs, infoindex); -#endif /* Fix up syms, so that st_value is a pointer to location. */ err = simplify_symbols(sechdrs, symindex, strtab, versindex, pcpuindex, @@ -1775,10 +1636,16 @@ static struct module *load_module(void __user *umod, mod->gpl_syms = (void *)sechdrs[gplindex].sh_addr; if (gplcrcindex) mod->gpl_crcs = (void *)sechdrs[gplcrcindex].sh_addr; + mod->num_gpl_future_syms = sechdrs[gplfutureindex].sh_size / + sizeof(*mod->gpl_future_syms); + mod->gpl_future_syms = (void *)sechdrs[gplfutureindex].sh_addr; + if (gplfuturecrcindex) + mod->gpl_future_crcs = (void *)sechdrs[gplfuturecrcindex].sh_addr; #ifdef CONFIG_MODVERSIONS if ((mod->num_syms && !crcindex) || - (mod->num_gpl_syms && !gplcrcindex)) { + (mod->num_gpl_syms && !gplcrcindex) || + (mod->num_gpl_future_syms && !gplfuturecrcindex)) { printk(KERN_WARNING "%s: No versions for exported symbols." " Tainting kernel.\n", mod->name); add_taint(TAINT_FORCED_MODULE); @@ -1847,27 +1714,17 @@ static struct module *load_module(void __user *umod, set_fs(old_fs); mod->args = args; - if (obsparmindex) { - err = obsolete_params(mod->name, mod->args, - (struct obsolete_modparm *) - sechdrs[obsparmindex].sh_addr, - sechdrs[obsparmindex].sh_size - / sizeof(struct obsolete_modparm), - sechdrs, symindex, - (char *)sechdrs[strindex].sh_addr); - if (setupindex) - printk(KERN_WARNING "%s: Ignoring new-style " - "parameters in presence of obsolete ones\n", - mod->name); - } else { - /* Size of section 0 is 0, so this works well if no params */ - err = parse_args(mod->name, mod->args, - (struct kernel_param *) - sechdrs[setupindex].sh_addr, - sechdrs[setupindex].sh_size - / sizeof(struct kernel_param), - NULL); - } + if (obsparmindex) + printk(KERN_WARNING "%s: Ignoring obsolete parameters\n", + mod->name); + + /* Size of section 0 is 0, so this works well if no params */ + err = parse_args(mod->name, mod->args, + (struct kernel_param *) + sechdrs[setupindex].sh_addr, + sechdrs[setupindex].sh_size + / sizeof(struct kernel_param), + NULL); if (err < 0) goto arch_cleanup; @@ -1933,13 +1790,13 @@ sys_init_module(void __user *umod, return -EPERM; /* Only one module load at a time, please */ - if (down_interruptible(&module_mutex) != 0) + if (mutex_lock_interruptible(&module_mutex) != 0) return -EINTR; /* Do all the hard work */ mod = load_module(umod, len, uargs); if (IS_ERR(mod)) { - up(&module_mutex); + mutex_unlock(&module_mutex); return PTR_ERR(mod); } @@ -1948,11 +1805,10 @@ sys_init_module(void __user *umod, stop_machine_run(__link_module, mod, NR_CPUS); /* Drop lock so they can recurse */ - up(&module_mutex); + mutex_unlock(&module_mutex); - down(¬ify_mutex); - notifier_call_chain(&module_notify_list, MODULE_STATE_COMING, mod); - up(¬ify_mutex); + blocking_notifier_call_chain(&module_notify_list, + MODULE_STATE_COMING, mod); /* Start the module */ if (mod->init != NULL) @@ -1967,15 +1823,15 @@ sys_init_module(void __user *umod, mod->name); else { module_put(mod); - down(&module_mutex); + mutex_lock(&module_mutex); free_module(mod); - up(&module_mutex); + mutex_unlock(&module_mutex); } return ret; } /* Now it's a first class citizen! */ - down(&module_mutex); + mutex_lock(&module_mutex); mod->state = MODULE_STATE_LIVE; /* Drop initial reference. 
*/ module_put(mod); @@ -1983,7 +1839,7 @@ sys_init_module(void __user *umod, mod->module_init = NULL; mod->init_size = 0; mod->init_text_size = 0; - up(&module_mutex); + mutex_unlock(&module_mutex); return 0; } @@ -2073,7 +1929,7 @@ struct module *module_get_kallsym(unsigned int symnum, { struct module *mod; - down(&module_mutex); + mutex_lock(&module_mutex); list_for_each_entry(mod, &modules, list) { if (symnum < mod->num_symtab) { *value = mod->symtab[symnum].st_value; @@ -2081,12 +1937,12 @@ struct module *module_get_kallsym(unsigned int symnum, strncpy(namebuf, mod->strtab + mod->symtab[symnum].st_name, 127); - up(&module_mutex); + mutex_unlock(&module_mutex); return mod; } symnum -= mod->num_symtab; } - up(&module_mutex); + mutex_unlock(&module_mutex); return NULL; } @@ -2129,7 +1985,7 @@ static void *m_start(struct seq_file *m, loff_t *pos) struct list_head *i; loff_t n = 0; - down(&module_mutex); + mutex_lock(&module_mutex); list_for_each(i, &modules) { if (n++ == *pos) break; @@ -2150,7 +2006,7 @@ static void *m_next(struct seq_file *m, void *p, loff_t *pos) static void m_stop(struct seq_file *m, void *p) { - up(&module_mutex); + mutex_unlock(&module_mutex); } static int m_show(struct seq_file *m, void *p) diff --git a/kernel/panic.c b/kernel/panic.c index 126dc43f1c7..f895c7c01d5 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -20,13 +20,16 @@ #include <linux/nmi.h> #include <linux/kexec.h> -int panic_timeout; int panic_on_oops; int tainted; +static int pause_on_oops; +static int pause_on_oops_flag; +static DEFINE_SPINLOCK(pause_on_oops_lock); +int panic_timeout; EXPORT_SYMBOL(panic_timeout); -struct notifier_block *panic_notifier_list; +ATOMIC_NOTIFIER_HEAD(panic_notifier_list); EXPORT_SYMBOL(panic_notifier_list); @@ -94,7 +97,7 @@ NORET_TYPE void panic(const char * fmt, ...) smp_send_stop(); #endif - notifier_call_chain(&panic_notifier_list, 0, buf); + atomic_notifier_call_chain(&panic_notifier_list, 0, buf); if (!panic_blink) panic_blink = no_blink; @@ -174,3 +177,95 @@ void add_taint(unsigned flag) tainted |= flag; } EXPORT_SYMBOL(add_taint); + +static int __init pause_on_oops_setup(char *str) +{ + pause_on_oops = simple_strtoul(str, NULL, 0); + return 1; +} +__setup("pause_on_oops=", pause_on_oops_setup); + +static void spin_msec(int msecs) +{ + int i; + + for (i = 0; i < msecs; i++) { + touch_nmi_watchdog(); + mdelay(1); + } +} + +/* + * It just happens that oops_enter() and oops_exit() are identically + * implemented... + */ +static void do_oops_enter_exit(void) +{ + unsigned long flags; + static int spin_counter; + + if (!pause_on_oops) + return; + + spin_lock_irqsave(&pause_on_oops_lock, flags); + if (pause_on_oops_flag == 0) { + /* This CPU may now print the oops message */ + pause_on_oops_flag = 1; + } else { + /* We need to stall this CPU */ + if (!spin_counter) { + /* This CPU gets to do the counting */ + spin_counter = pause_on_oops; + do { + spin_unlock(&pause_on_oops_lock); + spin_msec(MSEC_PER_SEC); + spin_lock(&pause_on_oops_lock); + } while (--spin_counter); + pause_on_oops_flag = 0; + } else { + /* This CPU waits for a different one */ + while (spin_counter) { + spin_unlock(&pause_on_oops_lock); + spin_msec(1); + spin_lock(&pause_on_oops_lock); + } + } + } + spin_unlock_irqrestore(&pause_on_oops_lock, flags); +} + +/* + * Return true if the calling CPU is allowed to print oops-related info. This + * is a bit racy.. 
+ */ +int oops_may_print(void) +{ + return pause_on_oops_flag == 0; +} + +/* + * Called when the architecture enters its oops handler, before it prints + * anything. If this is the first CPU to oops, and it's oopsing the first time + * then let it proceed. + * + * This is all enabled by the pause_on_oops kernel boot option. We do all this + * to ensure that oopses don't scroll off the screen. It has the side-effect + * of preventing later-oopsing CPUs from mucking up the display, too. + * + * It turns out that the CPU which is allowed to print ends up pausing for the + * right duration, whereas all the other CPUs pause for twice as long: once in + * oops_enter(), once in oops_exit(). + */ +void oops_enter(void) +{ + do_oops_enter_exit(); +} + +/* + * Called when the architecture exits its oops handler, after printing + * everything. + */ +void oops_exit(void) +{ + do_oops_enter_exit(); +} diff --git a/kernel/params.c b/kernel/params.c index c76ad25e6a2..af43ecdc8d9 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -31,7 +31,7 @@ #define DEBUGP(fmt, a...) #endif -static inline int dash2underscore(char c) +static inline char dash2underscore(char c) { if (c == '-') return '_'; @@ -265,12 +265,12 @@ int param_get_invbool(char *buffer, struct kernel_param *kp) } /* We cheat here and temporarily mangle the string. */ -int param_array(const char *name, - const char *val, - unsigned int min, unsigned int max, - void *elem, int elemsize, - int (*set)(const char *, struct kernel_param *kp), - int *num) +static int param_array(const char *name, + const char *val, + unsigned int min, unsigned int max, + void *elem, int elemsize, + int (*set)(const char *, struct kernel_param *kp), + int *num) { int ret; struct kernel_param kp; @@ -638,13 +638,8 @@ static ssize_t module_attr_show(struct kobject *kobj, if (!attribute->show) return -EIO; - if (!try_module_get(mk->mod)) - return -ENODEV; - ret = attribute->show(attribute, mk->mod, buf); - module_put(mk->mod); - return ret; } @@ -662,13 +657,8 @@ static ssize_t module_attr_store(struct kobject *kobj, if (!attribute->store) return -EIO; - if (!try_module_get(mk->mod)) - return -ENODEV; - ret = attribute->store(attribute, mk->mod, buf, len); - module_put(mk->mod); - return ret; } diff --git a/kernel/pid.c b/kernel/pid.c index 1acc0724699..a9f2dfd006d 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -218,36 +218,6 @@ task_t *find_task_by_pid_type(int type, int nr) EXPORT_SYMBOL(find_task_by_pid_type); /* - * This function switches the PIDs if a non-leader thread calls - * sys_execve() - this must be done without releasing the PID. - * (which a detach_pid() would eventually do.) 
- */ -void switch_exec_pids(task_t *leader, task_t *thread) -{ - __detach_pid(leader, PIDTYPE_PID); - __detach_pid(leader, PIDTYPE_TGID); - __detach_pid(leader, PIDTYPE_PGID); - __detach_pid(leader, PIDTYPE_SID); - - __detach_pid(thread, PIDTYPE_PID); - __detach_pid(thread, PIDTYPE_TGID); - - leader->pid = leader->tgid = thread->pid; - thread->pid = thread->tgid; - - attach_pid(thread, PIDTYPE_PID, thread->pid); - attach_pid(thread, PIDTYPE_TGID, thread->tgid); - attach_pid(thread, PIDTYPE_PGID, thread->signal->pgrp); - attach_pid(thread, PIDTYPE_SID, thread->signal->session); - list_add_tail(&thread->tasks, &init_task.tasks); - - attach_pid(leader, PIDTYPE_PID, leader->pid); - attach_pid(leader, PIDTYPE_TGID, leader->tgid); - attach_pid(leader, PIDTYPE_PGID, leader->signal->pgrp); - attach_pid(leader, PIDTYPE_SID, leader->signal->session); -} - -/* * The pid hash table is scaled according to the amount of memory in the * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or * more. @@ -277,16 +247,8 @@ void __init pidhash_init(void) void __init pidmap_init(void) { - int i; - pidmap_array->page = (void *)get_zeroed_page(GFP_KERNEL); + /* Reserve PID 0. We never call free_pidmap(0) */ set_bit(0, pidmap_array->page); atomic_dec(&pidmap_array->nr_free); - - /* - * Allocate PID 0, and hash it via all PID types: - */ - - for (i = 0; i < PIDTYPE_MAX; i++) - attach_pid(current, i, 0); } diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index fa895fc2ecf..ac6dc874442 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -35,6 +35,7 @@ #include <linux/interrupt.h> #include <linux/slab.h> #include <linux/time.h> +#include <linux/mutex.h> #include <asm/uaccess.h> #include <asm/semaphore.h> @@ -144,7 +145,7 @@ static int common_timer_set(struct k_itimer *, int, struct itimerspec *, struct itimerspec *); static int common_timer_del(struct k_itimer *timer); -static int posix_timer_fn(void *data); +static int posix_timer_fn(struct hrtimer *data); static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags); @@ -250,15 +251,18 @@ __initcall(init_posix_timers); static void schedule_next_timer(struct k_itimer *timr) { + struct hrtimer *timer = &timr->it.real.timer; + if (timr->it.real.interval.tv64 == 0) return; - timr->it_overrun += hrtimer_forward(&timr->it.real.timer, + timr->it_overrun += hrtimer_forward(timer, timer->base->get_time(), timr->it.real.interval); + timr->it_overrun_last = timr->it_overrun; timr->it_overrun = -1; ++timr->it_requeue_pending; - hrtimer_restart(&timr->it.real.timer); + hrtimer_restart(timer); } /* @@ -330,13 +334,14 @@ EXPORT_SYMBOL_GPL(posix_timer_event); * This code is for CLOCK_REALTIME* and CLOCK_MONOTONIC* timers. 
*/ -static int posix_timer_fn(void *data) +static int posix_timer_fn(struct hrtimer *timer) { - struct k_itimer *timr = data; + struct k_itimer *timr; unsigned long flags; int si_private = 0; int ret = HRTIMER_NORESTART; + timr = container_of(timer, struct k_itimer, it.real.timer); spin_lock_irqsave(&timr->it_lock, flags); if (timr->it.real.interval.tv64 != 0) @@ -350,7 +355,8 @@ static int posix_timer_fn(void *data) */ if (timr->it.real.interval.tv64 != 0) { timr->it_overrun += - hrtimer_forward(&timr->it.real.timer, + hrtimer_forward(timer, + timer->base->softirq_time, timr->it.real.interval); ret = HRTIMER_RESTART; ++timr->it_requeue_pending; @@ -602,38 +608,41 @@ static struct k_itimer * lock_timer(timer_t timer_id, unsigned long *flags) static void common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) { - ktime_t remaining; + ktime_t now, remaining, iv; struct hrtimer *timer = &timr->it.real.timer; memset(cur_setting, 0, sizeof(struct itimerspec)); - remaining = hrtimer_get_remaining(timer); - /* Time left ? or timer pending */ - if (remaining.tv64 > 0 || hrtimer_active(timer)) - goto calci; + iv = timr->it.real.interval; + /* interval timer ? */ - if (timr->it.real.interval.tv64 == 0) + if (iv.tv64) + cur_setting->it_interval = ktime_to_timespec(iv); + else if (!hrtimer_active(timer) && + (timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) return; + + now = timer->base->get_time(); + /* - * When a requeue is pending or this is a SIGEV_NONE timer - * move the expiry time forward by intervals, so expiry is > - * now. + * When a requeue is pending or this is a SIGEV_NONE + * timer move the expiry time forward by intervals, so + * expiry is > now. */ - if (timr->it_requeue_pending & REQUEUE_PENDING || - (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) { - timr->it_overrun += - hrtimer_forward(timer, timr->it.real.interval); - remaining = hrtimer_get_remaining(timer); - } - calci: - /* interval timer ? */ - if (timr->it.real.interval.tv64 != 0) - cur_setting->it_interval = - ktime_to_timespec(timr->it.real.interval); + if (iv.tv64 && (timr->it_requeue_pending & REQUEUE_PENDING || + (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) + timr->it_overrun += hrtimer_forward(timer, now, iv); + + remaining = ktime_sub(timer->expires, now); /* Return 0 only, when the timer is expired and not pending */ - if (remaining.tv64 <= 0) - cur_setting->it_value.tv_nsec = 1; - else + if (remaining.tv64 <= 0) { + /* + * A single shot SIGEV_NONE timer must return 0, when + * it is expired ! + */ + if ((timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) + cur_setting->it_value.tv_nsec = 1; + } else cur_setting->it_value = ktime_to_timespec(remaining); } @@ -716,7 +725,6 @@ common_timer_set(struct k_itimer *timr, int flags, mode = flags & TIMER_ABSTIME ? 
HRTIMER_ABS : HRTIMER_REL; hrtimer_init(&timr->it.real.timer, timr->it_clock, mode); - timr->it.real.timer.data = timr; timr->it.real.timer.function = posix_timer_fn; timer->expires = timespec_to_ktime(new_setting->it_value); diff --git a/kernel/power/Makefile b/kernel/power/Makefile index 04be7d0d96a..8d0af3d37a4 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -5,7 +5,7 @@ endif obj-y := main.o process.o console.o obj-$(CONFIG_PM_LEGACY) += pm.o -obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o snapshot.o +obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o snapshot.o swap.o user.o obj-$(CONFIG_SUSPEND_SMP) += smp.o diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 0b43847dc98..81d4d982f3f 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -22,17 +22,6 @@ #include "power.h" -extern suspend_disk_method_t pm_disk_mode; - -extern int swsusp_shrink_memory(void); -extern int swsusp_suspend(void); -extern int swsusp_write(struct pbe *pblist, unsigned int nr_pages); -extern int swsusp_check(void); -extern int swsusp_read(struct pbe **pblist_ptr); -extern void swsusp_close(void); -extern int swsusp_resume(void); - - static int noresume = 0; char resume_file[256] = CONFIG_PM_STD_PARTITION; dev_t swsusp_resume_device; @@ -70,10 +59,6 @@ static void power_down(suspend_disk_method_t mode) while(1); } - -static int in_suspend __nosavedata = 0; - - static inline void platform_finish(void) { if (pm_disk_mode == PM_DISK_PLATFORM) { @@ -87,7 +72,6 @@ static int prepare_processes(void) int error; pm_prepare_console(); - sys_sync(); disable_nonboot_cpus(); if (freeze_processes()) { @@ -145,7 +129,7 @@ int pm_suspend_disk(void) if (in_suspend) { device_resume(); pr_debug("PM: writing image.\n"); - error = swsusp_write(pagedir_nosave, nr_copy_pages); + error = swsusp_write(); if (!error) power_down(pm_disk_mode); else { @@ -216,7 +200,7 @@ static int software_resume(void) pr_debug("PM: Reading swsusp image.\n"); - if ((error = swsusp_read(&pagedir_nosave))) { + if ((error = swsusp_read())) { swsusp_free(); goto Thaw; } diff --git a/kernel/power/main.c b/kernel/power/main.c index 9cb235cba4a..ee371f50cca 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -103,7 +103,7 @@ static int suspend_prepare(suspend_state_t state) } -static int suspend_enter(suspend_state_t state) +int suspend_enter(suspend_state_t state) { int error = 0; unsigned long flags; diff --git a/kernel/power/pm.c b/kernel/power/pm.c index 33c508e857d..0f6908cce1d 100644 --- a/kernel/power/pm.c +++ b/kernel/power/pm.c @@ -25,6 +25,7 @@ #include <linux/pm.h> #include <linux/pm_legacy.h> #include <linux/interrupt.h> +#include <linux/mutex.h> int pm_active; @@ -40,7 +41,7 @@ int pm_active; * until a resume but that will be fine. 
*/ -static DECLARE_MUTEX(pm_devs_lock); +static DEFINE_MUTEX(pm_devs_lock); static LIST_HEAD(pm_devs); /** @@ -67,9 +68,9 @@ struct pm_dev *pm_register(pm_dev_t type, dev->id = id; dev->callback = callback; - down(&pm_devs_lock); + mutex_lock(&pm_devs_lock); list_add(&dev->entry, &pm_devs); - up(&pm_devs_lock); + mutex_unlock(&pm_devs_lock); } return dev; } @@ -85,9 +86,9 @@ struct pm_dev *pm_register(pm_dev_t type, void pm_unregister(struct pm_dev *dev) { if (dev) { - down(&pm_devs_lock); + mutex_lock(&pm_devs_lock); list_del(&dev->entry); - up(&pm_devs_lock); + mutex_unlock(&pm_devs_lock); kfree(dev); } @@ -118,7 +119,7 @@ void pm_unregister_all(pm_callback callback) if (!callback) return; - down(&pm_devs_lock); + mutex_lock(&pm_devs_lock); entry = pm_devs.next; while (entry != &pm_devs) { struct pm_dev *dev = list_entry(entry, struct pm_dev, entry); @@ -126,7 +127,7 @@ void pm_unregister_all(pm_callback callback) if (dev->callback == callback) __pm_unregister(dev); } - up(&pm_devs_lock); + mutex_unlock(&pm_devs_lock); } /** @@ -234,7 +235,7 @@ int pm_send_all(pm_request_t rqst, void *data) { struct list_head *entry; - down(&pm_devs_lock); + mutex_lock(&pm_devs_lock); entry = pm_devs.next; while (entry != &pm_devs) { struct pm_dev *dev = list_entry(entry, struct pm_dev, entry); @@ -246,13 +247,13 @@ int pm_send_all(pm_request_t rqst, void *data) */ if (rqst == PM_SUSPEND) pm_undo_all(dev); - up(&pm_devs_lock); + mutex_unlock(&pm_devs_lock); return status; } } entry = entry->next; } - up(&pm_devs_lock); + mutex_unlock(&pm_devs_lock); return 0; } diff --git a/kernel/power/power.h b/kernel/power/power.h index 388dba68084..f06f12f2176 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -8,6 +8,7 @@ struct swsusp_info { int cpus; unsigned long image_pages; unsigned long pages; + unsigned long size; } __attribute__((aligned(PAGE_SIZE))); @@ -37,21 +38,79 @@ extern struct subsystem power_subsys; /* References to section boundaries */ extern const void __nosave_begin, __nosave_end; -extern unsigned int nr_copy_pages; extern struct pbe *pagedir_nosave; /* Preferred image size in bytes (default 500 MB) */ extern unsigned long image_size; +extern int in_suspend; +extern dev_t swsusp_resume_device; extern asmlinkage int swsusp_arch_suspend(void); extern asmlinkage int swsusp_arch_resume(void); extern unsigned int count_data_pages(void); -extern void free_pagedir(struct pbe *pblist); -extern void release_eaten_pages(void); -extern struct pbe *alloc_pagedir(unsigned nr_pages, gfp_t gfp_mask, int safe_needed); + +struct snapshot_handle { + loff_t offset; + unsigned int page; + unsigned int page_offset; + unsigned int prev; + struct pbe *pbe; + void *buffer; + unsigned int buf_offset; +}; + +#define data_of(handle) ((handle).buffer + (handle).buf_offset) + +extern int snapshot_read_next(struct snapshot_handle *handle, size_t count); +extern int snapshot_write_next(struct snapshot_handle *handle, size_t count); +int snapshot_image_loaded(struct snapshot_handle *handle); + +#define SNAPSHOT_IOC_MAGIC '3' +#define SNAPSHOT_FREEZE _IO(SNAPSHOT_IOC_MAGIC, 1) +#define SNAPSHOT_UNFREEZE _IO(SNAPSHOT_IOC_MAGIC, 2) +#define SNAPSHOT_ATOMIC_SNAPSHOT _IOW(SNAPSHOT_IOC_MAGIC, 3, void *) +#define SNAPSHOT_ATOMIC_RESTORE _IO(SNAPSHOT_IOC_MAGIC, 4) +#define SNAPSHOT_FREE _IO(SNAPSHOT_IOC_MAGIC, 5) +#define SNAPSHOT_SET_IMAGE_SIZE _IOW(SNAPSHOT_IOC_MAGIC, 6, unsigned long) +#define SNAPSHOT_AVAIL_SWAP _IOR(SNAPSHOT_IOC_MAGIC, 7, void *) +#define SNAPSHOT_GET_SWAP_PAGE _IOR(SNAPSHOT_IOC_MAGIC, 8, void *) 
+#define SNAPSHOT_FREE_SWAP_PAGES _IO(SNAPSHOT_IOC_MAGIC, 9) +#define SNAPSHOT_SET_SWAP_FILE _IOW(SNAPSHOT_IOC_MAGIC, 10, unsigned int) +#define SNAPSHOT_S2RAM _IO(SNAPSHOT_IOC_MAGIC, 11) +#define SNAPSHOT_IOC_MAXNR 11 + +/** + * The bitmap is used for tracing allocated swap pages + * + * The entire bitmap consists of a number of bitmap_page + * structures linked with the help of the .next member. + * Thus each page can be allocated individually, so we only + * need to make 0-order memory allocations to create + * the bitmap. + */ + +#define BITMAP_PAGE_SIZE (PAGE_SIZE - sizeof(void *)) +#define BITMAP_PAGE_CHUNKS (BITMAP_PAGE_SIZE / sizeof(long)) +#define BITS_PER_CHUNK (sizeof(long) * 8) +#define BITMAP_PAGE_BITS (BITMAP_PAGE_CHUNKS * BITS_PER_CHUNK) + +struct bitmap_page { + unsigned long chunks[BITMAP_PAGE_CHUNKS]; + struct bitmap_page *next; +}; + +extern void free_bitmap(struct bitmap_page *bitmap); +extern struct bitmap_page *alloc_bitmap(unsigned int nr_bits); +extern unsigned long alloc_swap_page(int swap, struct bitmap_page *bitmap); +extern void free_all_swap_pages(int swap, struct bitmap_page *bitmap); + +extern int swsusp_check(void); +extern int swsusp_shrink_memory(void); extern void swsusp_free(void); -extern int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed); -extern unsigned int snapshot_nr_pages(void); -extern struct pbe *snapshot_pblist(void); -extern void snapshot_pblist_set(struct pbe *pblist); +extern int swsusp_suspend(void); +extern int swsusp_resume(void); +extern int swsusp_read(void); +extern int swsusp_write(void); +extern void swsusp_close(void); +extern int suspend_enter(suspend_state_t state); diff --git a/kernel/power/process.c b/kernel/power/process.c index 28de118f7a0..8ac7c35fad7 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -12,11 +12,12 @@ #include <linux/interrupt.h> #include <linux/suspend.h> #include <linux/module.h> +#include <linux/syscalls.h> /* * Timeout for stopping processes */ -#define TIMEOUT (6 * HZ) +#define TIMEOUT (20 * HZ) static inline int freezeable(struct task_struct * p) @@ -54,38 +55,62 @@ void refrigerator(void) current->state = save; } +static inline void freeze_process(struct task_struct *p) +{ + unsigned long flags; + + if (!freezing(p)) { + freeze(p); + spin_lock_irqsave(&p->sighand->siglock, flags); + signal_wake_up(p, 0); + spin_unlock_irqrestore(&p->sighand->siglock, flags); + } +} + /* 0 = success, else # of processes that we failed to stop */ int freeze_processes(void) { - int todo; + int todo, nr_user, user_frozen; unsigned long start_time; struct task_struct *g, *p; unsigned long flags; printk( "Stopping tasks: " ); start_time = jiffies; + user_frozen = 0; do { - todo = 0; + nr_user = todo = 0; read_lock(&tasklist_lock); do_each_thread(g, p) { if (!freezeable(p)) continue; if (frozen(p)) continue; - - freeze(p); - spin_lock_irqsave(&p->sighand->siglock, flags); - signal_wake_up(p, 0); - spin_unlock_irqrestore(&p->sighand->siglock, flags); - todo++; + if (p->mm && !(p->flags & PF_BORROWED_MM)) { + /* The task is a user-space one. 
+ * Freeze it unless there's a vfork completion + * pending + */ + if (!p->vfork_done) + freeze_process(p); + nr_user++; + } else { + /* Freeze only if the user space is frozen */ + if (user_frozen) + freeze_process(p); + todo++; + } } while_each_thread(g, p); read_unlock(&tasklist_lock); + todo += nr_user; + if (!user_frozen && !nr_user) { + sys_sync(); + start_time = jiffies; + } + user_frozen = !nr_user; yield(); /* Yield is okay here */ - if (todo && time_after(jiffies, start_time + TIMEOUT)) { - printk( "\n" ); - printk(KERN_ERR " stopping tasks failed (%d tasks remaining)\n", todo ); + if (todo && time_after(jiffies, start_time + TIMEOUT)) break; - } } while(todo); /* This does not unfreeze processes that are already frozen @@ -94,8 +119,14 @@ int freeze_processes(void) * but it cleans up leftover PF_FREEZE requests. */ if (todo) { + printk( "\n" ); + printk(KERN_ERR " stopping tasks timed out " + "after %d seconds (%d tasks remaining):\n", + TIMEOUT / HZ, todo); read_lock(&tasklist_lock); - do_each_thread(g, p) + do_each_thread(g, p) { + if (freezeable(p) && !frozen(p)) + printk(KERN_ERR " %s\n", p->comm); if (freezing(p)) { pr_debug(" clean up: %s\n", p->comm); p->flags &= ~PF_FREEZE; @@ -103,7 +134,7 @@ int freeze_processes(void) recalc_sigpending_tsk(p); spin_unlock_irqrestore(&p->sighand->siglock, flags); } - while_each_thread(g, p); + } while_each_thread(g, p); read_unlock(&tasklist_lock); return todo; } diff --git a/kernel/power/smp.c b/kernel/power/smp.c index 911fc62b822..5957312b2d6 100644 --- a/kernel/power/smp.c +++ b/kernel/power/smp.c @@ -49,9 +49,7 @@ void enable_nonboot_cpus(void) printk("Thawing cpus ...\n"); for_each_cpu_mask(cpu, frozen_cpus) { - error = smp_prepare_cpu(cpu); - if (!error) - error = cpu_up(cpu); + error = cpu_up(cpu); if (!error) { printk("CPU%d is up\n", cpu); continue; diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 8d5a5986d62..c5863d02c89 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -10,6 +10,7 @@ */ +#include <linux/version.h> #include <linux/module.h> #include <linux/mm.h> #include <linux/suspend.h> @@ -34,7 +35,9 @@ #include "power.h" struct pbe *pagedir_nosave; -unsigned int nr_copy_pages; +static unsigned int nr_copy_pages; +static unsigned int nr_meta_pages; +static unsigned long *buffer; #ifdef CONFIG_HIGHMEM unsigned int count_highmem_pages(void) @@ -80,7 +83,7 @@ static int save_highmem_zone(struct zone *zone) void *kaddr; unsigned long pfn = zone_pfn + zone->zone_start_pfn; - if (!(pfn%1000)) + if (!(pfn%10000)) printk("."); if (!pfn_valid(pfn)) continue; @@ -119,13 +122,15 @@ int save_highmem(void) struct zone *zone; int res = 0; - pr_debug("swsusp: Saving Highmem\n"); + pr_debug("swsusp: Saving Highmem"); + drain_local_pages(); for_each_zone (zone) { if (is_highmem(zone)) res = save_highmem_zone(zone); if (res) return res; } + printk("\n"); return 0; } @@ -235,7 +240,7 @@ static void copy_data_pages(struct pbe *pblist) * free_pagedir - free pages allocated with alloc_pagedir() */ -void free_pagedir(struct pbe *pblist) +static void free_pagedir(struct pbe *pblist) { struct pbe *pbe; @@ -301,7 +306,7 @@ struct eaten_page { static struct eaten_page *eaten_pages = NULL; -void release_eaten_pages(void) +static void release_eaten_pages(void) { struct eaten_page *p, *q; @@ -376,7 +381,6 @@ struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, int safe_needed if (!nr_pages) return NULL; - pr_debug("alloc_pagedir(): nr_pages = %d\n", nr_pages); pblist = alloc_image_page(gfp_mask, 
safe_needed); /* FIXME: rewrite this ugly loop */ for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages; @@ -388,7 +392,7 @@ struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, int safe_needed free_pagedir(pblist); pblist = NULL; } else - create_pbe_list(pblist, nr_pages); + create_pbe_list(pblist, nr_pages); return pblist; } @@ -414,6 +418,10 @@ void swsusp_free(void) } } } + nr_copy_pages = 0; + nr_meta_pages = 0; + pagedir_nosave = NULL; + buffer = NULL; } @@ -437,7 +445,7 @@ static int enough_free_mem(unsigned int nr_pages) (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE); } -int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed) +static int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed) { struct pbe *p; @@ -504,7 +512,318 @@ asmlinkage int swsusp_save(void) */ nr_copy_pages = nr_pages; + nr_meta_pages = (nr_pages * sizeof(long) + PAGE_SIZE - 1) >> PAGE_SHIFT; printk("swsusp: critical section/: done (%d pages copied)\n", nr_pages); return 0; } + +static void init_header(struct swsusp_info *info) +{ + memset(info, 0, sizeof(struct swsusp_info)); + info->version_code = LINUX_VERSION_CODE; + info->num_physpages = num_physpages; + memcpy(&info->uts, &system_utsname, sizeof(system_utsname)); + info->cpus = num_online_cpus(); + info->image_pages = nr_copy_pages; + info->pages = nr_copy_pages + nr_meta_pages + 1; + info->size = info->pages; + info->size <<= PAGE_SHIFT; +} + +/** + * pack_orig_addresses - the .orig_address fields of the PBEs from the + * list starting at @pbe are stored in the array @buf[] (1 page) + */ + +static inline struct pbe *pack_orig_addresses(unsigned long *buf, struct pbe *pbe) +{ + int j; + + for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) { + buf[j] = pbe->orig_address; + pbe = pbe->next; + } + if (!pbe) + for (; j < PAGE_SIZE / sizeof(long); j++) + buf[j] = 0; + return pbe; +} + +/** + * snapshot_read_next - used for reading the system memory snapshot. + * + * On the first call to it @handle should point to a zeroed + * snapshot_handle structure. The structure gets updated and a pointer + * to it should be passed to this function every next time. + * + * The @count parameter should contain the number of bytes the caller + * wants to read from the snapshot. It must not be zero. + * + * On success the function returns a positive number. Then, the caller + * is allowed to read up to the returned number of bytes from the memory + * location computed by the data_of() macro. The number returned + * may be smaller than @count, but this only happens if the read would + * cross a page boundary otherwise. + * + * The function returns 0 to indicate the end of data stream condition, + * and a negative number is returned on error. In such cases the + * structure pointed to by @handle is not updated and should not be used + * any more. 
+ */ + +int snapshot_read_next(struct snapshot_handle *handle, size_t count) +{ + if (handle->page > nr_meta_pages + nr_copy_pages) + return 0; + if (!buffer) { + /* This makes the buffer be freed by swsusp_free() */ + buffer = alloc_image_page(GFP_ATOMIC, 0); + if (!buffer) + return -ENOMEM; + } + if (!handle->offset) { + init_header((struct swsusp_info *)buffer); + handle->buffer = buffer; + handle->pbe = pagedir_nosave; + } + if (handle->prev < handle->page) { + if (handle->page <= nr_meta_pages) { + handle->pbe = pack_orig_addresses(buffer, handle->pbe); + if (!handle->pbe) + handle->pbe = pagedir_nosave; + } else { + handle->buffer = (void *)handle->pbe->address; + handle->pbe = handle->pbe->next; + } + handle->prev = handle->page; + } + handle->buf_offset = handle->page_offset; + if (handle->page_offset + count >= PAGE_SIZE) { + count = PAGE_SIZE - handle->page_offset; + handle->page_offset = 0; + handle->page++; + } else { + handle->page_offset += count; + } + handle->offset += count; + return count; +} + +/** + * mark_unsafe_pages - mark the pages that cannot be used for storing + * the image during resume, because they conflict with the pages that + * had been used before suspend + */ + +static int mark_unsafe_pages(struct pbe *pblist) +{ + struct zone *zone; + unsigned long zone_pfn; + struct pbe *p; + + if (!pblist) /* a sanity check */ + return -EINVAL; + + /* Clear page flags */ + for_each_zone (zone) { + for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) + if (pfn_valid(zone_pfn + zone->zone_start_pfn)) + ClearPageNosaveFree(pfn_to_page(zone_pfn + + zone->zone_start_pfn)); + } + + /* Mark orig addresses */ + for_each_pbe (p, pblist) { + if (virt_addr_valid(p->orig_address)) + SetPageNosaveFree(virt_to_page(p->orig_address)); + else + return -EFAULT; + } + + return 0; +} + +static void copy_page_backup_list(struct pbe *dst, struct pbe *src) +{ + /* We assume both lists contain the same number of elements */ + while (src) { + dst->orig_address = src->orig_address; + dst = dst->next; + src = src->next; + } +} + +static int check_header(struct swsusp_info *info) +{ + char *reason = NULL; + + if (info->version_code != LINUX_VERSION_CODE) + reason = "kernel version"; + if (info->num_physpages != num_physpages) + reason = "memory size"; + if (strcmp(info->uts.sysname,system_utsname.sysname)) + reason = "system type"; + if (strcmp(info->uts.release,system_utsname.release)) + reason = "kernel release"; + if (strcmp(info->uts.version,system_utsname.version)) + reason = "version"; + if (strcmp(info->uts.machine,system_utsname.machine)) + reason = "machine"; + if (reason) { + printk(KERN_ERR "swsusp: Resume mismatch: %s\n", reason); + return -EPERM; + } + return 0; +} + +/** + * load header - check the image header and copy data from it + */ + +static int load_header(struct snapshot_handle *handle, + struct swsusp_info *info) +{ + int error; + struct pbe *pblist; + + error = check_header(info); + if (!error) { + pblist = alloc_pagedir(info->image_pages, GFP_ATOMIC, 0); + if (!pblist) + return -ENOMEM; + pagedir_nosave = pblist; + handle->pbe = pblist; + nr_copy_pages = info->image_pages; + nr_meta_pages = info->pages - info->image_pages - 1; + } + return error; +} + +/** + * unpack_orig_addresses - copy the elements of @buf[] (1 page) to + * the PBEs in the list starting at @pbe + */ + +static inline struct pbe *unpack_orig_addresses(unsigned long *buf, + struct pbe *pbe) +{ + int j; + + for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) { + pbe->orig_address = buf[j]; + 
pbe = pbe->next; + } + return pbe; +} + +/** + * create_image - use metadata contained in the PBE list + * pointed to by pagedir_nosave to mark the pages that will + * be overwritten in the process of restoring the system + * memory state from the image and allocate memory for + * the image avoiding these pages + */ + +static int create_image(struct snapshot_handle *handle) +{ + int error = 0; + struct pbe *p, *pblist; + + p = pagedir_nosave; + error = mark_unsafe_pages(p); + if (!error) { + pblist = alloc_pagedir(nr_copy_pages, GFP_ATOMIC, 1); + if (pblist) + copy_page_backup_list(pblist, p); + free_pagedir(p); + if (!pblist) + error = -ENOMEM; + } + if (!error) + error = alloc_data_pages(pblist, GFP_ATOMIC, 1); + if (!error) { + release_eaten_pages(); + pagedir_nosave = pblist; + } else { + pagedir_nosave = NULL; + handle->pbe = NULL; + nr_copy_pages = 0; + nr_meta_pages = 0; + } + return error; +} + +/** + * snapshot_write_next - used for writing the system memory snapshot. + * + * On the first call to it @handle should point to a zeroed + * snapshot_handle structure. The structure gets updated and a pointer + * to it should be passed to this function every next time. + * + * The @count parameter should contain the number of bytes the caller + * wants to write to the image. It must not be zero. + * + * On success the function returns a positive number. Then, the caller + * is allowed to write up to the returned number of bytes to the memory + * location computed by the data_of() macro. The number returned + * may be smaller than @count, but this only happens if the write would + * cross a page boundary otherwise. + * + * The function returns 0 to indicate the "end of file" condition, + * and a negative number is returned on error. In such cases the + * structure pointed to by @handle is not updated and should not be used + * any more. 
+ */ + +int snapshot_write_next(struct snapshot_handle *handle, size_t count) +{ + int error = 0; + + if (handle->prev && handle->page > nr_meta_pages + nr_copy_pages) + return 0; + if (!buffer) { + /* This makes the buffer be freed by swsusp_free() */ + buffer = alloc_image_page(GFP_ATOMIC, 0); + if (!buffer) + return -ENOMEM; + } + if (!handle->offset) + handle->buffer = buffer; + if (handle->prev < handle->page) { + if (!handle->prev) { + error = load_header(handle, (struct swsusp_info *)buffer); + if (error) + return error; + } else if (handle->prev <= nr_meta_pages) { + handle->pbe = unpack_orig_addresses(buffer, handle->pbe); + if (!handle->pbe) { + error = create_image(handle); + if (error) + return error; + handle->pbe = pagedir_nosave; + handle->buffer = (void *)handle->pbe->address; + } + } else { + handle->pbe = handle->pbe->next; + handle->buffer = (void *)handle->pbe->address; + } + handle->prev = handle->page; + } + handle->buf_offset = handle->page_offset; + if (handle->page_offset + count >= PAGE_SIZE) { + count = PAGE_SIZE - handle->page_offset; + handle->page_offset = 0; + handle->page++; + } else { + handle->page_offset += count; + } + handle->offset += count; + return count; +} + +int snapshot_image_loaded(struct snapshot_handle *handle) +{ + return !(!handle->pbe || handle->pbe->next || !nr_copy_pages || + handle->page <= nr_meta_pages + nr_copy_pages); +} diff --git a/kernel/power/swap.c b/kernel/power/swap.c new file mode 100644 index 00000000000..044b8e0c102 --- /dev/null +++ b/kernel/power/swap.c @@ -0,0 +1,545 @@ +/* + * linux/kernel/power/swap.c + * + * This file provides functions for reading the suspend image from + * and writing it to a swap partition. + * + * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@suse.cz> + * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> + * + * This file is released under the GPLv2. + * + */ + +#include <linux/module.h> +#include <linux/smp_lock.h> +#include <linux/file.h> +#include <linux/utsname.h> +#include <linux/version.h> +#include <linux/delay.h> +#include <linux/bitops.h> +#include <linux/genhd.h> +#include <linux/device.h> +#include <linux/buffer_head.h> +#include <linux/bio.h> +#include <linux/swap.h> +#include <linux/swapops.h> +#include <linux/pm.h> + +#include "power.h" + +extern char resume_file[]; + +#define SWSUSP_SIG "S1SUSPEND" + +static struct swsusp_header { + char reserved[PAGE_SIZE - 20 - sizeof(swp_entry_t)]; + swp_entry_t image; + char orig_sig[10]; + char sig[10]; +} __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header; + +/* + * Saving part... 
+ */ + +static unsigned short root_swap = 0xffff; + +static int mark_swapfiles(swp_entry_t start) +{ + int error; + + rw_swap_page_sync(READ, + swp_entry(root_swap, 0), + virt_to_page((unsigned long)&swsusp_header)); + if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) || + !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) { + memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10); + memcpy(swsusp_header.sig,SWSUSP_SIG, 10); + swsusp_header.image = start; + error = rw_swap_page_sync(WRITE, + swp_entry(root_swap, 0), + virt_to_page((unsigned long) + &swsusp_header)); + } else { + pr_debug("swsusp: Partition is not swap space.\n"); + error = -ENODEV; + } + return error; +} + +/** + * swsusp_swap_check - check if the resume device is a swap device + * and get its index (if so) + */ + +static int swsusp_swap_check(void) /* This is called before saving image */ +{ + int res = swap_type_of(swsusp_resume_device); + + if (res >= 0) { + root_swap = res; + return 0; + } + return res; +} + +/** + * write_page - Write one page to given swap location. + * @buf: Address we're writing. + * @offset: Offset of the swap page we're writing to. + */ + +static int write_page(void *buf, unsigned long offset) +{ + swp_entry_t entry; + int error = -ENOSPC; + + if (offset) { + entry = swp_entry(root_swap, offset); + error = rw_swap_page_sync(WRITE, entry, virt_to_page(buf)); + } + return error; +} + +/* + * The swap map is a data structure used for keeping track of each page + * written to a swap partition. It consists of many swap_map_page + * structures that contain each an array of MAP_PAGE_SIZE swap entries. + * These structures are stored on the swap and linked together with the + * help of the .next_swap member. + * + * The swap map is created during suspend. The swap map pages are + * allocated and populated one at a time, so we only need one memory + * page to set up the entire structure. + * + * During resume we also only need to use one swap_map_page structure + * at a time. 
+ */ + +#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(long) - 1) + +struct swap_map_page { + unsigned long entries[MAP_PAGE_ENTRIES]; + unsigned long next_swap; +}; + +/** + * The swap_map_handle structure is used for handling swap in + * a file-alike way + */ + +struct swap_map_handle { + struct swap_map_page *cur; + unsigned long cur_swap; + struct bitmap_page *bitmap; + unsigned int k; +}; + +static void release_swap_writer(struct swap_map_handle *handle) +{ + if (handle->cur) + free_page((unsigned long)handle->cur); + handle->cur = NULL; + if (handle->bitmap) + free_bitmap(handle->bitmap); + handle->bitmap = NULL; +} + +static int get_swap_writer(struct swap_map_handle *handle) +{ + handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL); + if (!handle->cur) + return -ENOMEM; + handle->bitmap = alloc_bitmap(count_swap_pages(root_swap, 0)); + if (!handle->bitmap) { + release_swap_writer(handle); + return -ENOMEM; + } + handle->cur_swap = alloc_swap_page(root_swap, handle->bitmap); + if (!handle->cur_swap) { + release_swap_writer(handle); + return -ENOSPC; + } + handle->k = 0; + return 0; +} + +static int swap_write_page(struct swap_map_handle *handle, void *buf) +{ + int error; + unsigned long offset; + + if (!handle->cur) + return -EINVAL; + offset = alloc_swap_page(root_swap, handle->bitmap); + error = write_page(buf, offset); + if (error) + return error; + handle->cur->entries[handle->k++] = offset; + if (handle->k >= MAP_PAGE_ENTRIES) { + offset = alloc_swap_page(root_swap, handle->bitmap); + if (!offset) + return -ENOSPC; + handle->cur->next_swap = offset; + error = write_page(handle->cur, handle->cur_swap); + if (error) + return error; + memset(handle->cur, 0, PAGE_SIZE); + handle->cur_swap = offset; + handle->k = 0; + } + return 0; +} + +static int flush_swap_writer(struct swap_map_handle *handle) +{ + if (handle->cur && handle->cur_swap) + return write_page(handle->cur, handle->cur_swap); + else + return -EINVAL; +} + +/** + * save_image - save the suspend image data + */ + +static int save_image(struct swap_map_handle *handle, + struct snapshot_handle *snapshot, + unsigned int nr_pages) +{ + unsigned int m; + int ret; + int error = 0; + + printk("Saving image data pages (%u pages) ... ", nr_pages); + m = nr_pages / 100; + if (!m) + m = 1; + nr_pages = 0; + do { + ret = snapshot_read_next(snapshot, PAGE_SIZE); + if (ret > 0) { + error = swap_write_page(handle, data_of(*snapshot)); + if (error) + break; + if (!(nr_pages % m)) + printk("\b\b\b\b%3d%%", nr_pages / m); + nr_pages++; + } + } while (ret > 0); + if (!error) + printk("\b\b\b\bdone\n"); + return error; +} + +/** + * enough_swap - Make sure we have enough swap to save the image. + * + * Returns TRUE or FALSE after checking the total amount of swap + * space avaiable from the resume partition. + */ + +static int enough_swap(unsigned int nr_pages) +{ + unsigned int free_swap = count_swap_pages(root_swap, 1); + + pr_debug("swsusp: free swap pages: %u\n", free_swap); + return free_swap > (nr_pages + PAGES_FOR_IO + + (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE); +} + +/** + * swsusp_write - Write entire image and metadata. + * + * It is important _NOT_ to umount filesystems at this point. We want + * them synced (in case something goes wrong) but we DO not want to mark + * filesystem clean: it is not. (And it does not matter, if we resume + * correctly, we'll mark system clean, anyway.) 
+ */ + +int swsusp_write(void) +{ + struct swap_map_handle handle; + struct snapshot_handle snapshot; + struct swsusp_info *header; + unsigned long start; + int error; + + if ((error = swsusp_swap_check())) { + printk(KERN_ERR "swsusp: Cannot find swap device, try swapon -a.\n"); + return error; + } + memset(&snapshot, 0, sizeof(struct snapshot_handle)); + error = snapshot_read_next(&snapshot, PAGE_SIZE); + if (error < PAGE_SIZE) + return error < 0 ? error : -EFAULT; + header = (struct swsusp_info *)data_of(snapshot); + if (!enough_swap(header->pages)) { + printk(KERN_ERR "swsusp: Not enough free swap\n"); + return -ENOSPC; + } + error = get_swap_writer(&handle); + if (!error) { + start = handle.cur_swap; + error = swap_write_page(&handle, header); + } + if (!error) + error = save_image(&handle, &snapshot, header->pages - 1); + if (!error) { + flush_swap_writer(&handle); + printk("S"); + error = mark_swapfiles(swp_entry(root_swap, start)); + printk("|\n"); + } + if (error) + free_all_swap_pages(root_swap, handle.bitmap); + release_swap_writer(&handle); + return error; +} + +/* + * Using bio to read from swap. + * This code requires a bit more work than just using buffer heads + * but, it is the recommended way for 2.5/2.6. + * The following are to signal the beginning and end of I/O. Bios + * finish asynchronously, while we want them to happen synchronously. + * A simple atomic_t, and a wait loop take care of this problem. + */ + +static atomic_t io_done = ATOMIC_INIT(0); + +static int end_io(struct bio *bio, unsigned int num, int err) +{ + if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) + panic("I/O error reading memory image"); + atomic_set(&io_done, 0); + return 0; +} + +static struct block_device *resume_bdev; + +/** + * submit - submit BIO request. + * @rw: READ or WRITE. + * @off physical offset of page. + * @page: page we're reading or writing. + * + * Straight from the textbook - allocate and initialize the bio. + * If we're writing, make sure the page is marked as dirty. + * Then submit it and wait. 
+ */ + +static int submit(int rw, pgoff_t page_off, void *page) +{ + int error = 0; + struct bio *bio; + + bio = bio_alloc(GFP_ATOMIC, 1); + if (!bio) + return -ENOMEM; + bio->bi_sector = page_off * (PAGE_SIZE >> 9); + bio->bi_bdev = resume_bdev; + bio->bi_end_io = end_io; + + if (bio_add_page(bio, virt_to_page(page), PAGE_SIZE, 0) < PAGE_SIZE) { + printk("swsusp: ERROR: adding page to bio at %ld\n",page_off); + error = -EFAULT; + goto Done; + } + + atomic_set(&io_done, 1); + submit_bio(rw | (1 << BIO_RW_SYNC), bio); + while (atomic_read(&io_done)) + yield(); + if (rw == READ) + bio_set_pages_dirty(bio); + Done: + bio_put(bio); + return error; +} + +static int bio_read_page(pgoff_t page_off, void *page) +{ + return submit(READ, page_off, page); +} + +static int bio_write_page(pgoff_t page_off, void *page) +{ + return submit(WRITE, page_off, page); +} + +/** + * The following functions allow us to read data using a swap map + * in a file-alike way + */ + +static void release_swap_reader(struct swap_map_handle *handle) +{ + if (handle->cur) + free_page((unsigned long)handle->cur); + handle->cur = NULL; +} + +static int get_swap_reader(struct swap_map_handle *handle, + swp_entry_t start) +{ + int error; + + if (!swp_offset(start)) + return -EINVAL; + handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC); + if (!handle->cur) + return -ENOMEM; + error = bio_read_page(swp_offset(start), handle->cur); + if (error) { + release_swap_reader(handle); + return error; + } + handle->k = 0; + return 0; +} + +static int swap_read_page(struct swap_map_handle *handle, void *buf) +{ + unsigned long offset; + int error; + + if (!handle->cur) + return -EINVAL; + offset = handle->cur->entries[handle->k]; + if (!offset) + return -EFAULT; + error = bio_read_page(offset, buf); + if (error) + return error; + if (++handle->k >= MAP_PAGE_ENTRIES) { + handle->k = 0; + offset = handle->cur->next_swap; + if (!offset) + release_swap_reader(handle); + else + error = bio_read_page(offset, handle->cur); + } + return error; +} + +/** + * load_image - load the image using the swap map handle + * @handle and the snapshot handle @snapshot + * (assume there are @nr_pages pages to load) + */ + +static int load_image(struct swap_map_handle *handle, + struct snapshot_handle *snapshot, + unsigned int nr_pages) +{ + unsigned int m; + int ret; + int error = 0; + + printk("Loading image data pages (%u pages) ... ", nr_pages); + m = nr_pages / 100; + if (!m) + m = 1; + nr_pages = 0; + do { + ret = snapshot_write_next(snapshot, PAGE_SIZE); + if (ret > 0) { + error = swap_read_page(handle, data_of(*snapshot)); + if (error) + break; + if (!(nr_pages % m)) + printk("\b\b\b\b%3d%%", nr_pages / m); + nr_pages++; + } + } while (ret > 0); + if (!error) { + printk("\b\b\b\bdone\n"); + if (!snapshot_image_loaded(snapshot)) + error = -ENODATA; + } + return error; +} + +int swsusp_read(void) +{ + int error; + struct swap_map_handle handle; + struct snapshot_handle snapshot; + struct swsusp_info *header; + + if (IS_ERR(resume_bdev)) { + pr_debug("swsusp: block device not initialised\n"); + return PTR_ERR(resume_bdev); + } + + memset(&snapshot, 0, sizeof(struct snapshot_handle)); + error = snapshot_write_next(&snapshot, PAGE_SIZE); + if (error < PAGE_SIZE) + return error < 0 ? 
error : -EFAULT; + header = (struct swsusp_info *)data_of(snapshot); + error = get_swap_reader(&handle, swsusp_header.image); + if (!error) + error = swap_read_page(&handle, header); + if (!error) + error = load_image(&handle, &snapshot, header->pages - 1); + release_swap_reader(&handle); + + blkdev_put(resume_bdev); + + if (!error) + pr_debug("swsusp: Reading resume file was successful\n"); + else + pr_debug("swsusp: Error %d resuming\n", error); + return error; +} + +/** + * swsusp_check - Check for swsusp signature in the resume device + */ + +int swsusp_check(void) +{ + int error; + + resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); + if (!IS_ERR(resume_bdev)) { + set_blocksize(resume_bdev, PAGE_SIZE); + memset(&swsusp_header, 0, sizeof(swsusp_header)); + if ((error = bio_read_page(0, &swsusp_header))) + return error; + if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) { + memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10); + /* Reset swap signature now */ + error = bio_write_page(0, &swsusp_header); + } else { + return -EINVAL; + } + if (error) + blkdev_put(resume_bdev); + else + pr_debug("swsusp: Signature found, resuming\n"); + } else { + error = PTR_ERR(resume_bdev); + } + + if (error) + pr_debug("swsusp: Error %d check for resume file\n", error); + + return error; +} + +/** + * swsusp_close - close swap device. + */ + +void swsusp_close(void) +{ + if (IS_ERR(resume_bdev)) { + pr_debug("swsusp: block device not initialised\n"); + return; + } + + blkdev_put(resume_bdev); +} diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index 2d9d08f72f7..c4016cbbd3e 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -31,41 +31,24 @@ * Fixed runaway init * * Rafael J. Wysocki <rjw@sisk.pl> - * Added the swap map data structure and reworked the handling of swap + * Reworked the freeing of memory and the handling of swap * * More state savers are welcome. Especially for the scsi layer... * * For TODOs,FIXMEs also look in Documentation/power/swsusp.txt */ -#include <linux/module.h> #include <linux/mm.h> #include <linux/suspend.h> -#include <linux/smp_lock.h> -#include <linux/file.h> -#include <linux/utsname.h> -#include <linux/version.h> -#include <linux/delay.h> -#include <linux/bitops.h> #include <linux/spinlock.h> -#include <linux/genhd.h> #include <linux/kernel.h> #include <linux/major.h> #include <linux/swap.h> #include <linux/pm.h> -#include <linux/device.h> -#include <linux/buffer_head.h> #include <linux/swapops.h> #include <linux/bootmem.h> #include <linux/syscalls.h> #include <linux/highmem.h> -#include <linux/bio.h> - -#include <asm/uaccess.h> -#include <asm/mmu_context.h> -#include <asm/pgtable.h> -#include <asm/tlbflush.h> -#include <asm/io.h> #include "power.h" @@ -77,6 +60,8 @@ */ unsigned long image_size = 500 * 1024 * 1024; +int in_suspend __nosavedata = 0; + #ifdef CONFIG_HIGHMEM unsigned int count_highmem_pages(void); int save_highmem(void); @@ -87,471 +72,97 @@ static int restore_highmem(void) { return 0; } static unsigned int count_highmem_pages(void) { return 0; } #endif -extern char resume_file[]; - -#define SWSUSP_SIG "S1SUSPEND" - -static struct swsusp_header { - char reserved[PAGE_SIZE - 20 - sizeof(swp_entry_t)]; - swp_entry_t image; - char orig_sig[10]; - char sig[10]; -} __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header; - -static struct swsusp_info swsusp_info; - -/* - * Saving part... 
- */ - -static unsigned short root_swap = 0xffff; - -static int mark_swapfiles(swp_entry_t start) -{ - int error; - - rw_swap_page_sync(READ, - swp_entry(root_swap, 0), - virt_to_page((unsigned long)&swsusp_header)); - if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) || - !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) { - memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10); - memcpy(swsusp_header.sig,SWSUSP_SIG, 10); - swsusp_header.image = start; - error = rw_swap_page_sync(WRITE, - swp_entry(root_swap, 0), - virt_to_page((unsigned long) - &swsusp_header)); - } else { - pr_debug("swsusp: Partition is not swap space.\n"); - error = -ENODEV; - } - return error; -} - -/* - * Check whether the swap device is the specified resume - * device, irrespective of whether they are specified by - * identical names. - * - * (Thus, device inode aliasing is allowed. You can say /dev/hda4 - * instead of /dev/ide/host0/bus0/target0/lun0/part4 [if using devfs] - * and they'll be considered the same device. This is *necessary* for - * devfs, since the resume code can only recognize the form /dev/hda4, - * but the suspend code would see the long name.) - */ -static inline int is_resume_device(const struct swap_info_struct *swap_info) -{ - struct file *file = swap_info->swap_file; - struct inode *inode = file->f_dentry->d_inode; - - return S_ISBLK(inode->i_mode) && - swsusp_resume_device == MKDEV(imajor(inode), iminor(inode)); -} - -static int swsusp_swap_check(void) /* This is called before saving image */ -{ - int i; - - spin_lock(&swap_lock); - for (i = 0; i < MAX_SWAPFILES; i++) { - if (!(swap_info[i].flags & SWP_WRITEOK)) - continue; - if (!swsusp_resume_device || is_resume_device(swap_info + i)) { - spin_unlock(&swap_lock); - root_swap = i; - return 0; - } - } - spin_unlock(&swap_lock); - return -ENODEV; -} - -/** - * write_page - Write one page to a fresh swap location. - * @addr: Address we're writing. - * @loc: Place to store the entry we used. - * - * Allocate a new swap entry and 'sync' it. Note we discard -EIO - * errors. That is an artifact left over from swsusp. It did not - * check the return of rw_swap_page_sync() at all, since most pages - * written back to swap would return -EIO. - * This is a partial improvement, since we will at least return other - * errors, though we need to eventually fix the damn code. - */ -static int write_page(unsigned long addr, swp_entry_t *loc) -{ - swp_entry_t entry; - int error = -ENOSPC; - - entry = get_swap_page_of_type(root_swap); - if (swp_offset(entry)) { - error = rw_swap_page_sync(WRITE, entry, virt_to_page(addr)); - if (!error || error == -EIO) - *loc = entry; - } - return error; -} - /** - * Swap map-handling functions - * - * The swap map is a data structure used for keeping track of each page - * written to the swap. It consists of many swap_map_page structures - * that contain each an array of MAP_PAGE_SIZE swap entries. - * These structures are linked together with the help of either the - * .next (in memory) or the .next_swap (in swap) member. + * The following functions are used for tracing the allocated + * swap pages, so that they can be freed in case of an error. * - * The swap map is created during suspend. At that time we need to keep - * it in memory, because we have to free all of the allocated swap - * entries if an error occurs. The memory needed is preallocated - * so that we know in advance if there's enough of it. 
- * - * The first swap_map_page structure is filled with the swap entries that - * correspond to the first MAP_PAGE_SIZE data pages written to swap and - * so on. After the all of the data pages have been written, the order - * of the swap_map_page structures in the map is reversed so that they - * can be read from swap in the original order. This causes the data - * pages to be loaded in exactly the same order in which they have been - * saved. - * - * During resume we only need to use one swap_map_page structure - * at a time, which means that we only need to use two memory pages for - * reading the image - one for reading the swap_map_page structures - * and the second for reading the data pages from swap. + * The functions operate on a linked bitmap structure defined + * in power.h */ -#define MAP_PAGE_SIZE ((PAGE_SIZE - sizeof(swp_entry_t) - sizeof(void *)) \ - / sizeof(swp_entry_t)) - -struct swap_map_page { - swp_entry_t entries[MAP_PAGE_SIZE]; - swp_entry_t next_swap; - struct swap_map_page *next; -}; - -static inline void free_swap_map(struct swap_map_page *swap_map) +void free_bitmap(struct bitmap_page *bitmap) { - struct swap_map_page *swp; + struct bitmap_page *bp; - while (swap_map) { - swp = swap_map->next; - free_page((unsigned long)swap_map); - swap_map = swp; + while (bitmap) { + bp = bitmap->next; + free_page((unsigned long)bitmap); + bitmap = bp; } } -static struct swap_map_page *alloc_swap_map(unsigned int nr_pages) +struct bitmap_page *alloc_bitmap(unsigned int nr_bits) { - struct swap_map_page *swap_map, *swp; - unsigned n = 0; + struct bitmap_page *bitmap, *bp; + unsigned int n; - if (!nr_pages) + if (!nr_bits) return NULL; - pr_debug("alloc_swap_map(): nr_pages = %d\n", nr_pages); - swap_map = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC); - swp = swap_map; - for (n = MAP_PAGE_SIZE; n < nr_pages; n += MAP_PAGE_SIZE) { - swp->next = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC); - swp = swp->next; - if (!swp) { - free_swap_map(swap_map); + bitmap = (struct bitmap_page *)get_zeroed_page(GFP_KERNEL); + bp = bitmap; + for (n = BITMAP_PAGE_BITS; n < nr_bits; n += BITMAP_PAGE_BITS) { + bp->next = (struct bitmap_page *)get_zeroed_page(GFP_KERNEL); + bp = bp->next; + if (!bp) { + free_bitmap(bitmap); return NULL; } } - return swap_map; + return bitmap; } -/** - * reverse_swap_map - reverse the order of pages in the swap map - * @swap_map - */ - -static inline struct swap_map_page *reverse_swap_map(struct swap_map_page *swap_map) -{ - struct swap_map_page *prev, *next; - - prev = NULL; - while (swap_map) { - next = swap_map->next; - swap_map->next = prev; - prev = swap_map; - swap_map = next; - } - return prev; -} - -/** - * free_swap_map_entries - free the swap entries allocated to store - * the swap map @swap_map (this is only called in case of an error) - */ -static inline void free_swap_map_entries(struct swap_map_page *swap_map) -{ - while (swap_map) { - if (swap_map->next_swap.val) - swap_free(swap_map->next_swap); - swap_map = swap_map->next; - } -} - -/** - * save_swap_map - save the swap map used for tracing the data pages - * stored in the swap - */ - -static int save_swap_map(struct swap_map_page *swap_map, swp_entry_t *start) -{ - swp_entry_t entry = (swp_entry_t){0}; - int error; - - while (swap_map) { - swap_map->next_swap = entry; - if ((error = write_page((unsigned long)swap_map, &entry))) - return error; - swap_map = swap_map->next; - } - *start = entry; - return 0; -} - -/** - * free_image_entries - free the swap entries allocated to store - * 
the image data pages (this is only called in case of an error) - */ - -static inline void free_image_entries(struct swap_map_page *swp) +static int bitmap_set(struct bitmap_page *bitmap, unsigned long bit) { - unsigned k; + unsigned int n; - while (swp) { - for (k = 0; k < MAP_PAGE_SIZE; k++) - if (swp->entries[k].val) - swap_free(swp->entries[k]); - swp = swp->next; + n = BITMAP_PAGE_BITS; + while (bitmap && n <= bit) { + n += BITMAP_PAGE_BITS; + bitmap = bitmap->next; } -} - -/** - * The swap_map_handle structure is used for handling the swap map in - * a file-alike way - */ - -struct swap_map_handle { - struct swap_map_page *cur; - unsigned int k; -}; - -static inline void init_swap_map_handle(struct swap_map_handle *handle, - struct swap_map_page *map) -{ - handle->cur = map; - handle->k = 0; -} - -static inline int swap_map_write_page(struct swap_map_handle *handle, - unsigned long addr) -{ - int error; - - error = write_page(addr, handle->cur->entries + handle->k); - if (error) - return error; - if (++handle->k >= MAP_PAGE_SIZE) { - handle->cur = handle->cur->next; - handle->k = 0; + if (!bitmap) + return -EINVAL; + n -= BITMAP_PAGE_BITS; + bit -= n; + n = 0; + while (bit >= BITS_PER_CHUNK) { + bit -= BITS_PER_CHUNK; + n++; } + bitmap->chunks[n] |= (1UL << bit); return 0; } -/** - * save_image_data - save the data pages pointed to by the PBEs - * from the list @pblist using the swap map handle @handle - * (assume there are @nr_pages data pages to save) - */ - -static int save_image_data(struct pbe *pblist, - struct swap_map_handle *handle, - unsigned int nr_pages) -{ - unsigned int m; - struct pbe *p; - int error = 0; - - printk("Saving image data pages (%u pages) ... ", nr_pages); - m = nr_pages / 100; - if (!m) - m = 1; - nr_pages = 0; - for_each_pbe (p, pblist) { - error = swap_map_write_page(handle, p->address); - if (error) - break; - if (!(nr_pages % m)) - printk("\b\b\b\b%3d%%", nr_pages / m); - nr_pages++; - } - if (!error) - printk("\b\b\b\bdone\n"); - return error; -} - -static void dump_info(void) -{ - pr_debug(" swsusp: Version: %u\n",swsusp_info.version_code); - pr_debug(" swsusp: Num Pages: %ld\n",swsusp_info.num_physpages); - pr_debug(" swsusp: UTS Sys: %s\n",swsusp_info.uts.sysname); - pr_debug(" swsusp: UTS Node: %s\n",swsusp_info.uts.nodename); - pr_debug(" swsusp: UTS Release: %s\n",swsusp_info.uts.release); - pr_debug(" swsusp: UTS Version: %s\n",swsusp_info.uts.version); - pr_debug(" swsusp: UTS Machine: %s\n",swsusp_info.uts.machine); - pr_debug(" swsusp: UTS Domain: %s\n",swsusp_info.uts.domainname); - pr_debug(" swsusp: CPUs: %d\n",swsusp_info.cpus); - pr_debug(" swsusp: Image: %ld Pages\n",swsusp_info.image_pages); - pr_debug(" swsusp: Total: %ld Pages\n", swsusp_info.pages); -} - -static void init_header(unsigned int nr_pages) -{ - memset(&swsusp_info, 0, sizeof(swsusp_info)); - swsusp_info.version_code = LINUX_VERSION_CODE; - swsusp_info.num_physpages = num_physpages; - memcpy(&swsusp_info.uts, &system_utsname, sizeof(system_utsname)); - - swsusp_info.cpus = num_online_cpus(); - swsusp_info.image_pages = nr_pages; - swsusp_info.pages = nr_pages + - ((nr_pages * sizeof(long) + PAGE_SIZE - 1) >> PAGE_SHIFT) + 1; -} - -/** - * pack_orig_addresses - the .orig_address fields of the PBEs from the - * list starting at @pbe are stored in the array @buf[] (1 page) - */ - -static inline struct pbe *pack_orig_addresses(unsigned long *buf, - struct pbe *pbe) -{ - int j; - - for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) { - buf[j] = pbe->orig_address; - pbe = 
pbe->next; - } - if (!pbe) - for (; j < PAGE_SIZE / sizeof(long); j++) - buf[j] = 0; - return pbe; -} - -/** - * save_image_metadata - save the .orig_address fields of the PBEs - * from the list @pblist using the swap map handle @handle - */ - -static int save_image_metadata(struct pbe *pblist, - struct swap_map_handle *handle) +unsigned long alloc_swap_page(int swap, struct bitmap_page *bitmap) { - unsigned long *buf; - unsigned int n = 0; - struct pbe *p; - int error = 0; + unsigned long offset; - printk("Saving image metadata ... "); - buf = (unsigned long *)get_zeroed_page(GFP_ATOMIC); - if (!buf) - return -ENOMEM; - p = pblist; - while (p) { - p = pack_orig_addresses(buf, p); - error = swap_map_write_page(handle, (unsigned long)buf); - if (error) - break; - n++; + offset = swp_offset(get_swap_page_of_type(swap)); + if (offset) { + if (bitmap_set(bitmap, offset)) { + swap_free(swp_entry(swap, offset)); + offset = 0; + } } - free_page((unsigned long)buf); - if (!error) - printk("done (%u pages saved)\n", n); - return error; + return offset; } -/** - * enough_swap - Make sure we have enough swap to save the image. - * - * Returns TRUE or FALSE after checking the total amount of swap - * space avaiable from the resume partition. - */ - -static int enough_swap(unsigned int nr_pages) +void free_all_swap_pages(int swap, struct bitmap_page *bitmap) { - unsigned int free_swap = swap_info[root_swap].pages - - swap_info[root_swap].inuse_pages; - - pr_debug("swsusp: free swap pages: %u\n", free_swap); - return free_swap > (nr_pages + PAGES_FOR_IO + - (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE); -} + unsigned int bit, n; + unsigned long test; -/** - * swsusp_write - Write entire image and metadata. - * - * It is important _NOT_ to umount filesystems at this point. We want - * them synced (in case something goes wrong) but we DO not want to mark - * filesystem clean: it is not. (And it does not matter, if we resume - * correctly, we'll mark system clean, anyway.) 
- */ - -int swsusp_write(struct pbe *pblist, unsigned int nr_pages) -{ - struct swap_map_page *swap_map; - struct swap_map_handle handle; - swp_entry_t start; - int error; - - if ((error = swsusp_swap_check())) { - printk(KERN_ERR "swsusp: Cannot find swap device, try swapon -a.\n"); - return error; - } - if (!enough_swap(nr_pages)) { - printk(KERN_ERR "swsusp: Not enough free swap\n"); - return -ENOSPC; + bit = 0; + while (bitmap) { + for (n = 0; n < BITMAP_PAGE_CHUNKS; n++) + for (test = 1UL; test; test <<= 1) { + if (bitmap->chunks[n] & test) + swap_free(swp_entry(swap, bit)); + bit++; + } + bitmap = bitmap->next; } - - init_header(nr_pages); - swap_map = alloc_swap_map(swsusp_info.pages); - if (!swap_map) - return -ENOMEM; - init_swap_map_handle(&handle, swap_map); - - error = swap_map_write_page(&handle, (unsigned long)&swsusp_info); - if (!error) - error = save_image_metadata(pblist, &handle); - if (!error) - error = save_image_data(pblist, &handle, nr_pages); - if (error) - goto Free_image_entries; - - swap_map = reverse_swap_map(swap_map); - error = save_swap_map(swap_map, &start); - if (error) - goto Free_map_entries; - - dump_info(); - printk( "S" ); - error = mark_swapfiles(start); - printk( "|\n" ); - if (error) - goto Free_map_entries; - -Free_swap_map: - free_swap_map(swap_map); - return error; - -Free_map_entries: - free_swap_map_entries(swap_map); -Free_image_entries: - free_image_entries(swap_map); - goto Free_swap_map; } /** @@ -660,379 +271,3 @@ int swsusp_resume(void) local_irq_enable(); return error; } - -/** - * mark_unsafe_pages - mark the pages that cannot be used for storing - * the image during resume, because they conflict with the pages that - * had been used before suspend - */ - -static void mark_unsafe_pages(struct pbe *pblist) -{ - struct zone *zone; - unsigned long zone_pfn; - struct pbe *p; - - if (!pblist) /* a sanity check */ - return; - - /* Clear page flags */ - for_each_zone (zone) { - for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) - if (pfn_valid(zone_pfn + zone->zone_start_pfn)) - ClearPageNosaveFree(pfn_to_page(zone_pfn + - zone->zone_start_pfn)); - } - - /* Mark orig addresses */ - for_each_pbe (p, pblist) - SetPageNosaveFree(virt_to_page(p->orig_address)); - -} - -static void copy_page_backup_list(struct pbe *dst, struct pbe *src) -{ - /* We assume both lists contain the same number of elements */ - while (src) { - dst->orig_address = src->orig_address; - dst = dst->next; - src = src->next; - } -} - -/* - * Using bio to read from swap. - * This code requires a bit more work than just using buffer heads - * but, it is the recommended way for 2.5/2.6. - * The following are to signal the beginning and end of I/O. Bios - * finish asynchronously, while we want them to happen synchronously. - * A simple atomic_t, and a wait loop take care of this problem. - */ - -static atomic_t io_done = ATOMIC_INIT(0); - -static int end_io(struct bio *bio, unsigned int num, int err) -{ - if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) - panic("I/O error reading memory image"); - atomic_set(&io_done, 0); - return 0; -} - -static struct block_device *resume_bdev; - -/** - * submit - submit BIO request. - * @rw: READ or WRITE. - * @off physical offset of page. - * @page: page we're reading or writing. - * - * Straight from the textbook - allocate and initialize the bio. - * If we're writing, make sure the page is marked as dirty. - * Then submit it and wait. 
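The bitmap helpers added above (alloc_bitmap(), bitmap_set(), alloc_swap_page(), free_all_swap_pages()) replace the old swap-map chaining with a linked bitmap of allocated swap offsets, so every offset handed out by get_swap_page_of_type() can be released in a single pass if writing the image fails. A minimal sketch of the intended call pattern, with a hypothetical caller write_image_pages() and the actual page I/O left as a placeholder; the swap type would normally come from swap_type_of(), and count_swap_pages() is used the same way kernel/power/user.c below uses it:

static int write_image_pages(int swap, unsigned int nr_pages)
{
	struct bitmap_page *bitmap;
	unsigned long offset;
	unsigned int n;

	/* one bit per page of the target swap device */
	bitmap = alloc_bitmap(count_swap_pages(swap, 0));
	if (!bitmap)
		return -ENOMEM;

	for (n = 0; n < nr_pages; n++) {
		/* allocates a swap page and records its offset in the bitmap */
		offset = alloc_swap_page(swap, bitmap);
		if (!offset)
			goto out_free_all;
		/* ... write data page n to 'offset' here ... */
	}
	free_bitmap(bitmap);
	return 0;

out_free_all:
	/* error path: release every swap page recorded in the bitmap */
	free_all_swap_pages(swap, bitmap);
	free_bitmap(bitmap);
	return -ENOSPC;
}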
- */ - -static int submit(int rw, pgoff_t page_off, void *page) -{ - int error = 0; - struct bio *bio; - - bio = bio_alloc(GFP_ATOMIC, 1); - if (!bio) - return -ENOMEM; - bio->bi_sector = page_off * (PAGE_SIZE >> 9); - bio->bi_bdev = resume_bdev; - bio->bi_end_io = end_io; - - if (bio_add_page(bio, virt_to_page(page), PAGE_SIZE, 0) < PAGE_SIZE) { - printk("swsusp: ERROR: adding page to bio at %ld\n",page_off); - error = -EFAULT; - goto Done; - } - - - atomic_set(&io_done, 1); - submit_bio(rw | (1 << BIO_RW_SYNC), bio); - while (atomic_read(&io_done)) - yield(); - if (rw == READ) - bio_set_pages_dirty(bio); - Done: - bio_put(bio); - return error; -} - -static int bio_read_page(pgoff_t page_off, void *page) -{ - return submit(READ, page_off, page); -} - -static int bio_write_page(pgoff_t page_off, void *page) -{ - return submit(WRITE, page_off, page); -} - -/** - * The following functions allow us to read data using a swap map - * in a file-alike way - */ - -static inline void release_swap_map_reader(struct swap_map_handle *handle) -{ - if (handle->cur) - free_page((unsigned long)handle->cur); - handle->cur = NULL; -} - -static inline int get_swap_map_reader(struct swap_map_handle *handle, - swp_entry_t start) -{ - int error; - - if (!swp_offset(start)) - return -EINVAL; - handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC); - if (!handle->cur) - return -ENOMEM; - error = bio_read_page(swp_offset(start), handle->cur); - if (error) { - release_swap_map_reader(handle); - return error; - } - handle->k = 0; - return 0; -} - -static inline int swap_map_read_page(struct swap_map_handle *handle, void *buf) -{ - unsigned long offset; - int error; - - if (!handle->cur) - return -EINVAL; - offset = swp_offset(handle->cur->entries[handle->k]); - if (!offset) - return -EINVAL; - error = bio_read_page(offset, buf); - if (error) - return error; - if (++handle->k >= MAP_PAGE_SIZE) { - handle->k = 0; - offset = swp_offset(handle->cur->next_swap); - if (!offset) - release_swap_map_reader(handle); - else - error = bio_read_page(offset, handle->cur); - } - return error; -} - -static int check_header(void) -{ - char *reason = NULL; - - dump_info(); - if (swsusp_info.version_code != LINUX_VERSION_CODE) - reason = "kernel version"; - if (swsusp_info.num_physpages != num_physpages) - reason = "memory size"; - if (strcmp(swsusp_info.uts.sysname,system_utsname.sysname)) - reason = "system type"; - if (strcmp(swsusp_info.uts.release,system_utsname.release)) - reason = "kernel release"; - if (strcmp(swsusp_info.uts.version,system_utsname.version)) - reason = "version"; - if (strcmp(swsusp_info.uts.machine,system_utsname.machine)) - reason = "machine"; - if (reason) { - printk(KERN_ERR "swsusp: Resume mismatch: %s\n", reason); - return -EPERM; - } - return 0; -} - -/** - * load_image_data - load the image data using the swap map handle - * @handle and store them using the page backup list @pblist - * (assume there are @nr_pages pages to load) - */ - -static int load_image_data(struct pbe *pblist, - struct swap_map_handle *handle, - unsigned int nr_pages) -{ - int error; - unsigned int m; - struct pbe *p; - - if (!pblist) - return -EINVAL; - printk("Loading image data pages (%u pages) ... 
", nr_pages); - m = nr_pages / 100; - if (!m) - m = 1; - nr_pages = 0; - p = pblist; - while (p) { - error = swap_map_read_page(handle, (void *)p->address); - if (error) - break; - p = p->next; - if (!(nr_pages % m)) - printk("\b\b\b\b%3d%%", nr_pages / m); - nr_pages++; - } - if (!error) - printk("\b\b\b\bdone\n"); - return error; -} - -/** - * unpack_orig_addresses - copy the elements of @buf[] (1 page) to - * the PBEs in the list starting at @pbe - */ - -static inline struct pbe *unpack_orig_addresses(unsigned long *buf, - struct pbe *pbe) -{ - int j; - - for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) { - pbe->orig_address = buf[j]; - pbe = pbe->next; - } - return pbe; -} - -/** - * load_image_metadata - load the image metadata using the swap map - * handle @handle and put them into the PBEs in the list @pblist - */ - -static int load_image_metadata(struct pbe *pblist, struct swap_map_handle *handle) -{ - struct pbe *p; - unsigned long *buf; - unsigned int n = 0; - int error = 0; - - printk("Loading image metadata ... "); - buf = (unsigned long *)get_zeroed_page(GFP_ATOMIC); - if (!buf) - return -ENOMEM; - p = pblist; - while (p) { - error = swap_map_read_page(handle, buf); - if (error) - break; - p = unpack_orig_addresses(buf, p); - n++; - } - free_page((unsigned long)buf); - if (!error) - printk("done (%u pages loaded)\n", n); - return error; -} - -int swsusp_read(struct pbe **pblist_ptr) -{ - int error; - struct pbe *p, *pblist; - struct swap_map_handle handle; - unsigned int nr_pages; - - if (IS_ERR(resume_bdev)) { - pr_debug("swsusp: block device not initialised\n"); - return PTR_ERR(resume_bdev); - } - - error = get_swap_map_reader(&handle, swsusp_header.image); - if (!error) - error = swap_map_read_page(&handle, &swsusp_info); - if (!error) - error = check_header(); - if (error) - return error; - nr_pages = swsusp_info.image_pages; - p = alloc_pagedir(nr_pages, GFP_ATOMIC, 0); - if (!p) - return -ENOMEM; - error = load_image_metadata(p, &handle); - if (!error) { - mark_unsafe_pages(p); - pblist = alloc_pagedir(nr_pages, GFP_ATOMIC, 1); - if (pblist) - copy_page_backup_list(pblist, p); - free_pagedir(p); - if (!pblist) - error = -ENOMEM; - - /* Allocate memory for the image and read the data from swap */ - if (!error) - error = alloc_data_pages(pblist, GFP_ATOMIC, 1); - if (!error) { - release_eaten_pages(); - error = load_image_data(pblist, &handle, nr_pages); - } - if (!error) - *pblist_ptr = pblist; - } - release_swap_map_reader(&handle); - - blkdev_put(resume_bdev); - - if (!error) - pr_debug("swsusp: Reading resume file was successful\n"); - else - pr_debug("swsusp: Error %d resuming\n", error); - return error; -} - -/** - * swsusp_check - Check for swsusp signature in the resume device - */ - -int swsusp_check(void) -{ - int error; - - resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); - if (!IS_ERR(resume_bdev)) { - set_blocksize(resume_bdev, PAGE_SIZE); - memset(&swsusp_header, 0, sizeof(swsusp_header)); - if ((error = bio_read_page(0, &swsusp_header))) - return error; - if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) { - memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10); - /* Reset swap signature now */ - error = bio_write_page(0, &swsusp_header); - } else { - return -EINVAL; - } - if (error) - blkdev_put(resume_bdev); - else - pr_debug("swsusp: Signature found, resuming\n"); - } else { - error = PTR_ERR(resume_bdev); - } - - if (error) - pr_debug("swsusp: Error %d check for resume file\n", error); - - return error; -} - -/** - * swsusp_close - 
close swap device. - */ - -void swsusp_close(void) -{ - if (IS_ERR(resume_bdev)) { - pr_debug("swsusp: block device not initialised\n"); - return; - } - - blkdev_put(resume_bdev); -} diff --git a/kernel/power/user.c b/kernel/power/user.c new file mode 100644 index 00000000000..3f1539fbe48 --- /dev/null +++ b/kernel/power/user.c @@ -0,0 +1,333 @@ +/* + * linux/kernel/power/user.c + * + * This file provides the user space interface for software suspend/resume. + * + * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> + * + * This file is released under the GPLv2. + * + */ + +#include <linux/suspend.h> +#include <linux/syscalls.h> +#include <linux/string.h> +#include <linux/device.h> +#include <linux/miscdevice.h> +#include <linux/mm.h> +#include <linux/swap.h> +#include <linux/swapops.h> +#include <linux/pm.h> +#include <linux/fs.h> + +#include <asm/uaccess.h> + +#include "power.h" + +#define SNAPSHOT_MINOR 231 + +static struct snapshot_data { + struct snapshot_handle handle; + int swap; + struct bitmap_page *bitmap; + int mode; + char frozen; + char ready; +} snapshot_state; + +static atomic_t device_available = ATOMIC_INIT(1); + +static int snapshot_open(struct inode *inode, struct file *filp) +{ + struct snapshot_data *data; + + if (!atomic_add_unless(&device_available, -1, 0)) + return -EBUSY; + + if ((filp->f_flags & O_ACCMODE) == O_RDWR) + return -ENOSYS; + + nonseekable_open(inode, filp); + data = &snapshot_state; + filp->private_data = data; + memset(&data->handle, 0, sizeof(struct snapshot_handle)); + if ((filp->f_flags & O_ACCMODE) == O_RDONLY) { + data->swap = swsusp_resume_device ? swap_type_of(swsusp_resume_device) : -1; + data->mode = O_RDONLY; + } else { + data->swap = -1; + data->mode = O_WRONLY; + } + data->bitmap = NULL; + data->frozen = 0; + data->ready = 0; + + return 0; +} + +static int snapshot_release(struct inode *inode, struct file *filp) +{ + struct snapshot_data *data; + + swsusp_free(); + data = filp->private_data; + free_all_swap_pages(data->swap, data->bitmap); + free_bitmap(data->bitmap); + if (data->frozen) { + down(&pm_sem); + thaw_processes(); + enable_nonboot_cpus(); + up(&pm_sem); + } + atomic_inc(&device_available); + return 0; +} + +static ssize_t snapshot_read(struct file *filp, char __user *buf, + size_t count, loff_t *offp) +{ + struct snapshot_data *data; + ssize_t res; + + data = filp->private_data; + res = snapshot_read_next(&data->handle, count); + if (res > 0) { + if (copy_to_user(buf, data_of(data->handle), res)) + res = -EFAULT; + else + *offp = data->handle.offset; + } + return res; +} + +static ssize_t snapshot_write(struct file *filp, const char __user *buf, + size_t count, loff_t *offp) +{ + struct snapshot_data *data; + ssize_t res; + + data = filp->private_data; + res = snapshot_write_next(&data->handle, count); + if (res > 0) { + if (copy_from_user(data_of(data->handle), buf, res)) + res = -EFAULT; + else + *offp = data->handle.offset; + } + return res; +} + +static int snapshot_ioctl(struct inode *inode, struct file *filp, + unsigned int cmd, unsigned long arg) +{ + int error = 0; + struct snapshot_data *data; + loff_t offset, avail; + + if (_IOC_TYPE(cmd) != SNAPSHOT_IOC_MAGIC) + return -ENOTTY; + if (_IOC_NR(cmd) > SNAPSHOT_IOC_MAXNR) + return -ENOTTY; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + data = filp->private_data; + + switch (cmd) { + + case SNAPSHOT_FREEZE: + if (data->frozen) + break; + down(&pm_sem); + disable_nonboot_cpus(); + if (freeze_processes()) { + thaw_processes(); + enable_nonboot_cpus(); + error = -EBUSY; 
+ } + up(&pm_sem); + if (!error) + data->frozen = 1; + break; + + case SNAPSHOT_UNFREEZE: + if (!data->frozen) + break; + down(&pm_sem); + thaw_processes(); + enable_nonboot_cpus(); + up(&pm_sem); + data->frozen = 0; + break; + + case SNAPSHOT_ATOMIC_SNAPSHOT: + if (data->mode != O_RDONLY || !data->frozen || data->ready) { + error = -EPERM; + break; + } + down(&pm_sem); + /* Free memory before shutting down devices. */ + error = swsusp_shrink_memory(); + if (!error) { + error = device_suspend(PMSG_FREEZE); + if (!error) { + in_suspend = 1; + error = swsusp_suspend(); + device_resume(); + } + } + up(&pm_sem); + if (!error) + error = put_user(in_suspend, (unsigned int __user *)arg); + if (!error) + data->ready = 1; + break; + + case SNAPSHOT_ATOMIC_RESTORE: + if (data->mode != O_WRONLY || !data->frozen || + !snapshot_image_loaded(&data->handle)) { + error = -EPERM; + break; + } + down(&pm_sem); + pm_prepare_console(); + error = device_suspend(PMSG_FREEZE); + if (!error) { + error = swsusp_resume(); + device_resume(); + } + pm_restore_console(); + up(&pm_sem); + break; + + case SNAPSHOT_FREE: + swsusp_free(); + memset(&data->handle, 0, sizeof(struct snapshot_handle)); + data->ready = 0; + break; + + case SNAPSHOT_SET_IMAGE_SIZE: + image_size = arg; + break; + + case SNAPSHOT_AVAIL_SWAP: + avail = count_swap_pages(data->swap, 1); + avail <<= PAGE_SHIFT; + error = put_user(avail, (loff_t __user *)arg); + break; + + case SNAPSHOT_GET_SWAP_PAGE: + if (data->swap < 0 || data->swap >= MAX_SWAPFILES) { + error = -ENODEV; + break; + } + if (!data->bitmap) { + data->bitmap = alloc_bitmap(count_swap_pages(data->swap, 0)); + if (!data->bitmap) { + error = -ENOMEM; + break; + } + } + offset = alloc_swap_page(data->swap, data->bitmap); + if (offset) { + offset <<= PAGE_SHIFT; + error = put_user(offset, (loff_t __user *)arg); + } else { + error = -ENOSPC; + } + break; + + case SNAPSHOT_FREE_SWAP_PAGES: + if (data->swap < 0 || data->swap >= MAX_SWAPFILES) { + error = -ENODEV; + break; + } + free_all_swap_pages(data->swap, data->bitmap); + free_bitmap(data->bitmap); + data->bitmap = NULL; + break; + + case SNAPSHOT_SET_SWAP_FILE: + if (!data->bitmap) { + /* + * User space encodes device types as two-byte values, + * so we need to recode them + */ + if (old_decode_dev(arg)) { + data->swap = swap_type_of(old_decode_dev(arg)); + if (data->swap < 0) + error = -ENODEV; + } else { + data->swap = -1; + error = -EINVAL; + } + } else { + error = -EPERM; + } + break; + + case SNAPSHOT_S2RAM: + if (!data->frozen) { + error = -EPERM; + break; + } + + if (down_trylock(&pm_sem)) { + error = -EBUSY; + break; + } + + if (pm_ops->prepare) { + error = pm_ops->prepare(PM_SUSPEND_MEM); + if (error) + goto OutS3; + } + + /* Put devices to sleep */ + error = device_suspend(PMSG_SUSPEND); + if (error) { + printk(KERN_ERR "Failed to suspend some devices.\n"); + } else { + /* Enter S3, system is already frozen */ + suspend_enter(PM_SUSPEND_MEM); + + /* Wake up devices */ + device_resume(); + } + + if (pm_ops->finish) + pm_ops->finish(PM_SUSPEND_MEM); + +OutS3: + up(&pm_sem); + break; + + default: + error = -ENOTTY; + + } + + return error; +} + +static struct file_operations snapshot_fops = { + .open = snapshot_open, + .release = snapshot_release, + .read = snapshot_read, + .write = snapshot_write, + .llseek = no_llseek, + .ioctl = snapshot_ioctl, +}; + +static struct miscdevice snapshot_device = { + .minor = SNAPSHOT_MINOR, + .name = "snapshot", + .fops = &snapshot_fops, +}; + +static int __init snapshot_device_init(void) +{ + 
return misc_register(&snapshot_device); +}; + +device_initcall(snapshot_device_init); diff --git a/kernel/printk.c b/kernel/printk.c index 7ba79ad895e..891f7a71403 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -122,44 +122,6 @@ static char *log_buf = __log_buf; static int log_buf_len = __LOG_BUF_LEN; static unsigned long logged_chars; /* Number of chars produced since last read+clear operation */ -/* - * Setup a list of consoles. Called from init/main.c - */ -static int __init console_setup(char *str) -{ - char name[sizeof(console_cmdline[0].name)]; - char *s, *options; - int idx; - - /* - * Decode str into name, index, options. - */ - if (str[0] >= '0' && str[0] <= '9') { - strcpy(name, "ttyS"); - strncpy(name + 4, str, sizeof(name) - 5); - } else - strncpy(name, str, sizeof(name) - 1); - name[sizeof(name) - 1] = 0; - if ((options = strchr(str, ',')) != NULL) - *(options++) = 0; -#ifdef __sparc__ - if (!strcmp(str, "ttya")) - strcpy(name, "ttyS0"); - if (!strcmp(str, "ttyb")) - strcpy(name, "ttyS1"); -#endif - for (s = name; *s; s++) - if ((*s >= '0' && *s <= '9') || *s == ',') - break; - idx = simple_strtoul(s, NULL, 10); - *s = 0; - - add_preferred_console(name, idx, options); - return 1; -} - -__setup("console=", console_setup); - static int __init log_buf_len_setup(char *str) { unsigned long size = memparse(str, &str); @@ -659,6 +621,44 @@ static void call_console_drivers(unsigned long start, unsigned long end) #endif +/* + * Set up a list of consoles. Called from init/main.c + */ +static int __init console_setup(char *str) +{ + char name[sizeof(console_cmdline[0].name)]; + char *s, *options; + int idx; + + /* + * Decode str into name, index, options. + */ + if (str[0] >= '0' && str[0] <= '9') { + strcpy(name, "ttyS"); + strncpy(name + 4, str, sizeof(name) - 5); + } else { + strncpy(name, str, sizeof(name) - 1); + } + name[sizeof(name) - 1] = 0; + if ((options = strchr(str, ',')) != NULL) + *(options++) = 0; +#ifdef __sparc__ + if (!strcmp(str, "ttya")) + strcpy(name, "ttyS0"); + if (!strcmp(str, "ttyb")) + strcpy(name, "ttyS1"); +#endif + for (s = name; *s; s++) + if ((*s >= '0' && *s <= '9') || *s == ',') + break; + idx = simple_strtoul(s, NULL, 10); + *s = 0; + + add_preferred_console(name, idx, options); + return 1; +} +__setup("console=", console_setup); + /** * add_preferred_console - add a device to the list of preferred consoles. 
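The new kernel/power/user.c above exposes the snapshot machinery as a misc character device ("snapshot", minor 231): opening it read-only is the image-capture side, write-only the restore side, and the SNAPSHOT_* ioctls drive freezing and the atomic snapshot. A rough userspace sketch of the capture path, assuming a /dev/snapshot node and a hypothetical header carrying the SNAPSHOT_* ioctl definitions:

#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include "snapshot_ioctls.h"	/* hypothetical copy of the SNAPSHOT_* definitions */

int main(void)
{
	char page[4096];
	unsigned int in_suspend = 0;
	int fd = open("/dev/snapshot", O_RDONLY);	/* read-only = create an image */

	if (fd < 0)
		return 1;
	if (ioctl(fd, SNAPSHOT_FREEZE, 0))		/* freeze userspace first */
		goto out;
	if (ioctl(fd, SNAPSHOT_ATOMIC_SNAPSHOT, &in_suspend))
		goto thaw;
	if (in_suspend) {
		/* this instance created the image: read it out page by page */
		while (read(fd, page, sizeof(page)) > 0)
			;	/* ... store the data in swap or a file ... */
		ioctl(fd, SNAPSHOT_FREE, 0);
	}
thaw:
	ioctl(fd, SNAPSHOT_UNFREEZE, 0);
out:
	close(fd);
	return 0;
}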
* @name: device name diff --git a/kernel/profile.c b/kernel/profile.c index f89248e6d70..5a730fdb1a2 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -23,6 +23,7 @@ #include <linux/cpu.h> #include <linux/profile.h> #include <linux/highmem.h> +#include <linux/mutex.h> #include <asm/sections.h> #include <asm/semaphore.h> @@ -44,7 +45,7 @@ static cpumask_t prof_cpu_mask = CPU_MASK_ALL; #ifdef CONFIG_SMP static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits); static DEFINE_PER_CPU(int, cpu_profile_flip); -static DECLARE_MUTEX(profile_flip_mutex); +static DEFINE_MUTEX(profile_flip_mutex); #endif /* CONFIG_SMP */ static int __init profile_setup(char * str) @@ -86,72 +87,52 @@ void __init profile_init(void) #ifdef CONFIG_PROFILING -static DECLARE_RWSEM(profile_rwsem); -static DEFINE_RWLOCK(handoff_lock); -static struct notifier_block * task_exit_notifier; -static struct notifier_block * task_free_notifier; -static struct notifier_block * munmap_notifier; +static BLOCKING_NOTIFIER_HEAD(task_exit_notifier); +static ATOMIC_NOTIFIER_HEAD(task_free_notifier); +static BLOCKING_NOTIFIER_HEAD(munmap_notifier); void profile_task_exit(struct task_struct * task) { - down_read(&profile_rwsem); - notifier_call_chain(&task_exit_notifier, 0, task); - up_read(&profile_rwsem); + blocking_notifier_call_chain(&task_exit_notifier, 0, task); } int profile_handoff_task(struct task_struct * task) { int ret; - read_lock(&handoff_lock); - ret = notifier_call_chain(&task_free_notifier, 0, task); - read_unlock(&handoff_lock); + ret = atomic_notifier_call_chain(&task_free_notifier, 0, task); return (ret == NOTIFY_OK) ? 1 : 0; } void profile_munmap(unsigned long addr) { - down_read(&profile_rwsem); - notifier_call_chain(&munmap_notifier, 0, (void *)addr); - up_read(&profile_rwsem); + blocking_notifier_call_chain(&munmap_notifier, 0, (void *)addr); } int task_handoff_register(struct notifier_block * n) { - int err = -EINVAL; - - write_lock(&handoff_lock); - err = notifier_chain_register(&task_free_notifier, n); - write_unlock(&handoff_lock); - return err; + return atomic_notifier_chain_register(&task_free_notifier, n); } int task_handoff_unregister(struct notifier_block * n) { - int err = -EINVAL; - - write_lock(&handoff_lock); - err = notifier_chain_unregister(&task_free_notifier, n); - write_unlock(&handoff_lock); - return err; + return atomic_notifier_chain_unregister(&task_free_notifier, n); } int profile_event_register(enum profile_type type, struct notifier_block * n) { int err = -EINVAL; - down_write(&profile_rwsem); - switch (type) { case PROFILE_TASK_EXIT: - err = notifier_chain_register(&task_exit_notifier, n); + err = blocking_notifier_chain_register( + &task_exit_notifier, n); break; case PROFILE_MUNMAP: - err = notifier_chain_register(&munmap_notifier, n); + err = blocking_notifier_chain_register( + &munmap_notifier, n); break; } - up_write(&profile_rwsem); - return err; } @@ -160,18 +141,17 @@ int profile_event_unregister(enum profile_type type, struct notifier_block * n) { int err = -EINVAL; - down_write(&profile_rwsem); - switch (type) { case PROFILE_TASK_EXIT: - err = notifier_chain_unregister(&task_exit_notifier, n); + err = blocking_notifier_chain_unregister( + &task_exit_notifier, n); break; case PROFILE_MUNMAP: - err = notifier_chain_unregister(&munmap_notifier, n); + err = blocking_notifier_chain_unregister( + &munmap_notifier, n); break; } - up_write(&profile_rwsem); return err; } @@ -243,7 +223,7 @@ static void profile_flip_buffers(void) { int i, j, cpu; - down(&profile_flip_mutex); + 
mutex_lock(&profile_flip_mutex); j = per_cpu(cpu_profile_flip, get_cpu()); put_cpu(); on_each_cpu(__profile_flip_buffers, NULL, 0, 1); @@ -259,14 +239,14 @@ static void profile_flip_buffers(void) hits[i].hits = hits[i].pc = 0; } } - up(&profile_flip_mutex); + mutex_unlock(&profile_flip_mutex); } static void profile_discard_flip_buffers(void) { int i, cpu; - down(&profile_flip_mutex); + mutex_lock(&profile_flip_mutex); i = per_cpu(cpu_profile_flip, get_cpu()); put_cpu(); on_each_cpu(__profile_flip_buffers, NULL, 0, 1); @@ -274,7 +254,7 @@ static void profile_discard_flip_buffers(void) struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[i]; memset(hits, 0, NR_PROFILE_HIT*sizeof(struct profile_hit)); } - up(&profile_flip_mutex); + mutex_unlock(&profile_flip_mutex); } void profile_hit(int type, void *__pc) diff --git a/kernel/ptrace.c b/kernel/ptrace.c index d95a72c9279..86a7f6c60cb 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -35,9 +35,9 @@ void __ptrace_link(task_t *child, task_t *new_parent) if (child->parent == new_parent) return; list_add(&child->ptrace_list, &child->parent->ptrace_children); - REMOVE_LINKS(child); + remove_parent(child); child->parent = new_parent; - SET_LINKS(child); + add_parent(child); } /* @@ -77,9 +77,9 @@ void __ptrace_unlink(task_t *child) child->ptrace = 0; if (!list_empty(&child->ptrace_list)) { list_del_init(&child->ptrace_list); - REMOVE_LINKS(child); + remove_parent(child); child->parent = child->real_parent; - SET_LINKS(child); + add_parent(child); } ptrace_untrace(child); diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 8cf15a569fc..13458bbaa1b 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -47,15 +47,16 @@ #include <linux/notifier.h> #include <linux/rcupdate.h> #include <linux/cpu.h> +#include <linux/mutex.h> /* Definition for rcupdate control block. 
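With the conversion above, profile.c no longer needs its own rwsem/rwlock: callers just register notifier blocks and the blocking/atomic notifier chain primitives handle the locking. A short sketch of a client hooking task-exit events through the converted interface (module name and callback body are illustrative only):

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/profile.h>
#include <linux/notifier.h>
#include <linux/sched.h>

/* Illustrative callback: for this chain 'data' is the exiting task. */
static int my_task_exit_notify(struct notifier_block *nb,
			       unsigned long val, void *data)
{
	struct task_struct *task = data;

	printk(KERN_DEBUG "task %d is exiting\n", task->pid);
	return NOTIFY_OK;
}

static struct notifier_block my_task_exit_nb = {
	.notifier_call = my_task_exit_notify,
};

static int __init my_profile_client_init(void)
{
	/* ends up in blocking_notifier_chain_register() after this patch */
	return profile_event_register(PROFILE_TASK_EXIT, &my_task_exit_nb);
}

static void __exit my_profile_client_exit(void)
{
	profile_event_unregister(PROFILE_TASK_EXIT, &my_task_exit_nb);
}

module_init(my_profile_client_init);
module_exit(my_profile_client_exit);
MODULE_LICENSE("GPL");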
*/ -struct rcu_ctrlblk rcu_ctrlblk = { +static struct rcu_ctrlblk rcu_ctrlblk = { .cur = -300, .completed = -300, .lock = SPIN_LOCK_UNLOCKED, .cpumask = CPU_MASK_NONE, }; -struct rcu_ctrlblk rcu_bh_ctrlblk = { +static struct rcu_ctrlblk rcu_bh_ctrlblk = { .cur = -300, .completed = -300, .lock = SPIN_LOCK_UNLOCKED, @@ -75,7 +76,7 @@ static int rsinterval = 1000; #endif static atomic_t rcu_barrier_cpu_count; -static struct semaphore rcu_barrier_sema; +static DEFINE_MUTEX(rcu_barrier_mutex); static struct completion rcu_barrier_completion; #ifdef CONFIG_SMP @@ -207,13 +208,13 @@ static void rcu_barrier_func(void *notused) void rcu_barrier(void) { BUG_ON(in_interrupt()); - /* Take cpucontrol semaphore to protect against CPU hotplug */ - down(&rcu_barrier_sema); + /* Take cpucontrol mutex to protect against CPU hotplug */ + mutex_lock(&rcu_barrier_mutex); init_completion(&rcu_barrier_completion); atomic_set(&rcu_barrier_cpu_count, 0); on_each_cpu(rcu_barrier_func, NULL, 0, 1); wait_for_completion(&rcu_barrier_completion); - up(&rcu_barrier_sema); + mutex_unlock(&rcu_barrier_mutex); } EXPORT_SYMBOL_GPL(rcu_barrier); @@ -415,8 +416,8 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, rdp->curtail = &rdp->curlist; } - local_irq_disable(); if (rdp->nxtlist && !rdp->curlist) { + local_irq_disable(); rdp->curlist = rdp->nxtlist; rdp->curtail = rdp->nxttail; rdp->nxtlist = NULL; @@ -441,9 +442,8 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, rcu_start_batch(rcp); spin_unlock(&rcp->lock); } - } else { - local_irq_enable(); } + rcu_check_quiescent_state(rcp, rdp); if (rdp->donelist) rcu_do_batch(rdp); @@ -549,7 +549,6 @@ static struct notifier_block __devinitdata rcu_nb = { */ void __init rcu_init(void) { - sema_init(&rcu_barrier_sema, 1); rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long)smp_processor_id()); /* Register notifier for non-boot CPUs */ @@ -609,7 +608,7 @@ module_param(qlowmark, int, 0); module_param(rsinterval, int, 0); #endif EXPORT_SYMBOL_GPL(rcu_batches_completed); -EXPORT_SYMBOL(call_rcu); /* WARNING: GPL-only in April 2006. */ -EXPORT_SYMBOL(call_rcu_bh); /* WARNING: GPL-only in April 2006. */ +EXPORT_SYMBOL_GPL_FUTURE(call_rcu); /* WARNING: GPL-only in April 2006. */ +EXPORT_SYMBOL_GPL_FUTURE(call_rcu_bh); /* WARNING: GPL-only in April 2006. */ EXPORT_SYMBOL_GPL(synchronize_rcu); -EXPORT_SYMBOL(synchronize_kernel); /* WARNING: GPL-only in April 2006. */ +EXPORT_SYMBOL_GPL_FUTURE(synchronize_kernel); /* WARNING: GPL-only in April 2006. */ diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 7712912dbc8..8154e7589d1 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -54,15 +54,15 @@ static int verbose; /* Print more debug info. */ static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. 
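The rcupdate.c change above replaces rcu_barrier_sema with a mutex and moves call_rcu()/call_rcu_bh() to EXPORT_SYMBOL_GPL_FUTURE. The usual reason a module calls rcu_barrier() is to drain its own call_rcu() callbacks before unloading; a hedged sketch of that pattern (names are illustrative):

#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct my_item {
	int key;			/* illustrative payload */
	struct rcu_head rcu;
};

static void my_item_free_rcu(struct rcu_head *head)
{
	kfree(container_of(head, struct my_item, rcu));
}

static void my_item_release(struct my_item *item)
{
	/* defer the kfree() until all current RCU readers have finished */
	call_rcu(&item->rcu, my_item_free_rcu);
}

static void my_module_teardown(void)
{
	/*
	 * Wait for every callback queued above to run before the module
	 * text containing my_item_free_rcu() can go away.  rcu_barrier()
	 * serializes on rcu_barrier_mutex internally after this patch.
	 */
	rcu_barrier();
}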
*/ static int shuffle_interval = 5; /* Interval between shuffles (in sec)*/ -MODULE_PARM(nreaders, "i"); +module_param(nreaders, int, 0); MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); -MODULE_PARM(stat_interval, "i"); +module_param(stat_interval, int, 0); MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); -MODULE_PARM(verbose, "i"); +module_param(verbose, bool, 0); MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); -MODULE_PARM(test_no_idle_hz, "i"); +module_param(test_no_idle_hz, bool, 0); MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); -MODULE_PARM(shuffle_interval, "i"); +module_param(shuffle_interval, int, 0); MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); #define TORTURE_FLAG "rcutorture: " #define PRINTK_STRING(s) \ @@ -301,7 +301,7 @@ rcu_torture_printk(char *page) long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; - for_each_cpu(cpu) { + for_each_possible_cpu(cpu) { for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { pipesummary[i] += per_cpu(rcu_torture_count, cpu)[i]; batchsummary[i] += per_cpu(rcu_torture_batch, cpu)[i]; @@ -441,6 +441,16 @@ rcu_torture_shuffle(void *arg) return 0; } +static inline void +rcu_torture_print_module_parms(char *tag) +{ + printk(KERN_ALERT TORTURE_FLAG "--- %s: nreaders=%d " + "stat_interval=%d verbose=%d test_no_idle_hz=%d " + "shuffle_interval = %d\n", + tag, nrealreaders, stat_interval, verbose, test_no_idle_hz, + shuffle_interval); +} + static void rcu_torture_cleanup(void) { @@ -483,9 +493,10 @@ rcu_torture_cleanup(void) rcu_barrier(); rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ - printk(KERN_ALERT TORTURE_FLAG - "--- End of test: %s\n", - atomic_read(&n_rcu_torture_error) == 0 ? "SUCCESS" : "FAILURE"); + if (atomic_read(&n_rcu_torture_error)) + rcu_torture_print_module_parms("End of test: FAILURE"); + else + rcu_torture_print_module_parms("End of test: SUCCESS"); } static int @@ -501,11 +512,7 @@ rcu_torture_init(void) nrealreaders = nreaders; else nrealreaders = 2 * num_online_cpus(); - printk(KERN_ALERT TORTURE_FLAG "--- Start of test: nreaders=%d " - "stat_interval=%d verbose=%d test_no_idle_hz=%d " - "shuffle_interval = %d\n", - nrealreaders, stat_interval, verbose, test_no_idle_hz, - shuffle_interval); + rcu_torture_print_module_parms("Start of test"); fullstop = 0; /* Set up the freelist. */ @@ -528,7 +535,7 @@ rcu_torture_init(void) atomic_set(&n_rcu_torture_error, 0); for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) atomic_set(&rcu_torture_wcount[i], 0); - for_each_cpu(cpu) { + for_each_possible_cpu(cpu) { for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { per_cpu(rcu_torture_count, cpu)[i] = 0; per_cpu(rcu_torture_batch, cpu)[i] = 0; diff --git a/kernel/relay.c b/kernel/relay.c new file mode 100644 index 00000000000..33345e73485 --- /dev/null +++ b/kernel/relay.c @@ -0,0 +1,1012 @@ +/* + * Public API and common code for kernel->userspace relay file support. + * + * See Documentation/filesystems/relayfs.txt for an overview of relayfs. + * + * Copyright (C) 2002-2005 - Tom Zanussi (zanussi@us.ibm.com), IBM Corp + * Copyright (C) 1999-2005 - Karim Yaghmour (karim@opersys.com) + * + * Moved to kernel/relay.c by Paul Mundt, 2006. + * + * This file is released under the GPL. 
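The rcutorture.c hunk above is part of the tree-wide move from MODULE_PARM(var, "i") to module_param(name, type, perm); besides type checking, the third argument decides whether the parameter is published under /sys/module/<module>/parameters/. A small before/after sketch with a hypothetical parameter:

/* old style, as removed by this series:
 *	static int my_interval = 5;
 *	MODULE_PARM(my_interval, "i");
 */

/* new style: type-checked; 0444 makes it world-readable in sysfs
 * (rcutorture itself passes 0, i.e. not exported) */
static int my_interval = 5;
module_param(my_interval, int, 0444);
MODULE_PARM_DESC(my_interval, "Interval between events (seconds)");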
+ */ +#include <linux/errno.h> +#include <linux/stddef.h> +#include <linux/slab.h> +#include <linux/module.h> +#include <linux/string.h> +#include <linux/relay.h> +#include <linux/vmalloc.h> +#include <linux/mm.h> + +/* + * close() vm_op implementation for relay file mapping. + */ +static void relay_file_mmap_close(struct vm_area_struct *vma) +{ + struct rchan_buf *buf = vma->vm_private_data; + buf->chan->cb->buf_unmapped(buf, vma->vm_file); +} + +/* + * nopage() vm_op implementation for relay file mapping. + */ +static struct page *relay_buf_nopage(struct vm_area_struct *vma, + unsigned long address, + int *type) +{ + struct page *page; + struct rchan_buf *buf = vma->vm_private_data; + unsigned long offset = address - vma->vm_start; + + if (address > vma->vm_end) + return NOPAGE_SIGBUS; /* Disallow mremap */ + if (!buf) + return NOPAGE_OOM; + + page = vmalloc_to_page(buf->start + offset); + if (!page) + return NOPAGE_OOM; + get_page(page); + + if (type) + *type = VM_FAULT_MINOR; + + return page; +} + +/* + * vm_ops for relay file mappings. + */ +static struct vm_operations_struct relay_file_mmap_ops = { + .nopage = relay_buf_nopage, + .close = relay_file_mmap_close, +}; + +/** + * relay_mmap_buf: - mmap channel buffer to process address space + * @buf: relay channel buffer + * @vma: vm_area_struct describing memory to be mapped + * + * Returns 0 if ok, negative on error + * + * Caller should already have grabbed mmap_sem. + */ +int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma) +{ + unsigned long length = vma->vm_end - vma->vm_start; + struct file *filp = vma->vm_file; + + if (!buf) + return -EBADF; + + if (length != (unsigned long)buf->chan->alloc_size) + return -EINVAL; + + vma->vm_ops = &relay_file_mmap_ops; + vma->vm_private_data = buf; + buf->chan->cb->buf_mapped(buf, filp); + + return 0; +} + +/** + * relay_alloc_buf - allocate a channel buffer + * @buf: the buffer struct + * @size: total size of the buffer + * + * Returns a pointer to the resulting buffer, NULL if unsuccessful. The + * passed in size will get page aligned, if it isn't already. 
+ */ +static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size) +{ + void *mem; + unsigned int i, j, n_pages; + + *size = PAGE_ALIGN(*size); + n_pages = *size >> PAGE_SHIFT; + + buf->page_array = kcalloc(n_pages, sizeof(struct page *), GFP_KERNEL); + if (!buf->page_array) + return NULL; + + for (i = 0; i < n_pages; i++) { + buf->page_array[i] = alloc_page(GFP_KERNEL); + if (unlikely(!buf->page_array[i])) + goto depopulate; + } + mem = vmap(buf->page_array, n_pages, VM_MAP, PAGE_KERNEL); + if (!mem) + goto depopulate; + + memset(mem, 0, *size); + buf->page_count = n_pages; + return mem; + +depopulate: + for (j = 0; j < i; j++) + __free_page(buf->page_array[j]); + kfree(buf->page_array); + return NULL; +} + +/** + * relay_create_buf - allocate and initialize a channel buffer + * @alloc_size: size of the buffer to allocate + * @n_subbufs: number of sub-buffers in the channel + * + * Returns channel buffer if successful, NULL otherwise + */ +struct rchan_buf *relay_create_buf(struct rchan *chan) +{ + struct rchan_buf *buf = kcalloc(1, sizeof(struct rchan_buf), GFP_KERNEL); + if (!buf) + return NULL; + + buf->padding = kmalloc(chan->n_subbufs * sizeof(size_t *), GFP_KERNEL); + if (!buf->padding) + goto free_buf; + + buf->start = relay_alloc_buf(buf, &chan->alloc_size); + if (!buf->start) + goto free_buf; + + buf->chan = chan; + kref_get(&buf->chan->kref); + return buf; + +free_buf: + kfree(buf->padding); + kfree(buf); + return NULL; +} + +/** + * relay_destroy_channel - free the channel struct + * + * Should only be called from kref_put(). + */ +void relay_destroy_channel(struct kref *kref) +{ + struct rchan *chan = container_of(kref, struct rchan, kref); + kfree(chan); +} + +/** + * relay_destroy_buf - destroy an rchan_buf struct and associated buffer + * @buf: the buffer struct + */ +void relay_destroy_buf(struct rchan_buf *buf) +{ + struct rchan *chan = buf->chan; + unsigned int i; + + if (likely(buf->start)) { + vunmap(buf->start); + for (i = 0; i < buf->page_count; i++) + __free_page(buf->page_array[i]); + kfree(buf->page_array); + } + kfree(buf->padding); + kfree(buf); + kref_put(&chan->kref, relay_destroy_channel); +} + +/** + * relay_remove_buf - remove a channel buffer + * + * Removes the file from the fileystem, which also frees the + * rchan_buf_struct and the channel buffer. Should only be called from + * kref_put(). + */ +void relay_remove_buf(struct kref *kref) +{ + struct rchan_buf *buf = container_of(kref, struct rchan_buf, kref); + buf->chan->cb->remove_buf_file(buf->dentry); + relay_destroy_buf(buf); +} + +/** + * relay_buf_empty - boolean, is the channel buffer empty? + * @buf: channel buffer + * + * Returns 1 if the buffer is empty, 0 otherwise. + */ +int relay_buf_empty(struct rchan_buf *buf) +{ + return (buf->subbufs_produced - buf->subbufs_consumed) ? 0 : 1; +} +EXPORT_SYMBOL_GPL(relay_buf_empty); + +/** + * relay_buf_full - boolean, is the channel buffer full? + * @buf: channel buffer + * + * Returns 1 if the buffer is full, 0 otherwise. + */ +int relay_buf_full(struct rchan_buf *buf) +{ + size_t ready = buf->subbufs_produced - buf->subbufs_consumed; + return (ready >= buf->chan->n_subbufs) ? 1 : 0; +} +EXPORT_SYMBOL_GPL(relay_buf_full); + +/* + * High-level relay kernel API and associated functions. + */ + +/* + * rchan_callback implementations defining default channel behavior. Used + * in place of corresponding NULL values in client callback struct. + */ + +/* + * subbuf_start() default callback. Does nothing. 
+ */ +static int subbuf_start_default_callback (struct rchan_buf *buf, + void *subbuf, + void *prev_subbuf, + size_t prev_padding) +{ + if (relay_buf_full(buf)) + return 0; + + return 1; +} + +/* + * buf_mapped() default callback. Does nothing. + */ +static void buf_mapped_default_callback(struct rchan_buf *buf, + struct file *filp) +{ +} + +/* + * buf_unmapped() default callback. Does nothing. + */ +static void buf_unmapped_default_callback(struct rchan_buf *buf, + struct file *filp) +{ +} + +/* + * create_buf_file_create() default callback. Does nothing. + */ +static struct dentry *create_buf_file_default_callback(const char *filename, + struct dentry *parent, + int mode, + struct rchan_buf *buf, + int *is_global) +{ + return NULL; +} + +/* + * remove_buf_file() default callback. Does nothing. + */ +static int remove_buf_file_default_callback(struct dentry *dentry) +{ + return -EINVAL; +} + +/* relay channel default callbacks */ +static struct rchan_callbacks default_channel_callbacks = { + .subbuf_start = subbuf_start_default_callback, + .buf_mapped = buf_mapped_default_callback, + .buf_unmapped = buf_unmapped_default_callback, + .create_buf_file = create_buf_file_default_callback, + .remove_buf_file = remove_buf_file_default_callback, +}; + +/** + * wakeup_readers - wake up readers waiting on a channel + * @private: the channel buffer + * + * This is the work function used to defer reader waking. The + * reason waking is deferred is that calling directly from write + * causes problems if you're writing from say the scheduler. + */ +static void wakeup_readers(void *private) +{ + struct rchan_buf *buf = private; + wake_up_interruptible(&buf->read_wait); +} + +/** + * __relay_reset - reset a channel buffer + * @buf: the channel buffer + * @init: 1 if this is a first-time initialization + * + * See relay_reset for description of effect. + */ +static inline void __relay_reset(struct rchan_buf *buf, unsigned int init) +{ + size_t i; + + if (init) { + init_waitqueue_head(&buf->read_wait); + kref_init(&buf->kref); + INIT_WORK(&buf->wake_readers, NULL, NULL); + } else { + cancel_delayed_work(&buf->wake_readers); + flush_scheduled_work(); + } + + buf->subbufs_produced = 0; + buf->subbufs_consumed = 0; + buf->bytes_consumed = 0; + buf->finalized = 0; + buf->data = buf->start; + buf->offset = 0; + + for (i = 0; i < buf->chan->n_subbufs; i++) + buf->padding[i] = 0; + + buf->chan->cb->subbuf_start(buf, buf->data, NULL, 0); +} + +/** + * relay_reset - reset the channel + * @chan: the channel + * + * This has the effect of erasing all data from all channel buffers + * and restarting the channel in its initial state. The buffers + * are not freed, so any mappings are still in effect. + * + * NOTE: Care should be taken that the channel isn't actually + * being used by anything when this call is made. + */ +void relay_reset(struct rchan *chan) +{ + unsigned int i; + struct rchan_buf *prev = NULL; + + if (!chan) + return; + + for (i = 0; i < NR_CPUS; i++) { + if (!chan->buf[i] || chan->buf[i] == prev) + break; + __relay_reset(chan->buf[i], 0); + prev = chan->buf[i]; + } +} +EXPORT_SYMBOL_GPL(relay_reset); + +/** + * relay_open_buf - create a new relay channel buffer + * + * Internal - used by relay_open(). 
+ */ +static struct rchan_buf *relay_open_buf(struct rchan *chan, + const char *filename, + struct dentry *parent, + int *is_global) +{ + struct rchan_buf *buf; + struct dentry *dentry; + + if (*is_global) + return chan->buf[0]; + + buf = relay_create_buf(chan); + if (!buf) + return NULL; + + /* Create file in fs */ + dentry = chan->cb->create_buf_file(filename, parent, S_IRUSR, + buf, is_global); + if (!dentry) { + relay_destroy_buf(buf); + return NULL; + } + + buf->dentry = dentry; + __relay_reset(buf, 1); + + return buf; +} + +/** + * relay_close_buf - close a channel buffer + * @buf: channel buffer + * + * Marks the buffer finalized and restores the default callbacks. + * The channel buffer and channel buffer data structure are then freed + * automatically when the last reference is given up. + */ +static inline void relay_close_buf(struct rchan_buf *buf) +{ + buf->finalized = 1; + cancel_delayed_work(&buf->wake_readers); + flush_scheduled_work(); + kref_put(&buf->kref, relay_remove_buf); +} + +static inline void setup_callbacks(struct rchan *chan, + struct rchan_callbacks *cb) +{ + if (!cb) { + chan->cb = &default_channel_callbacks; + return; + } + + if (!cb->subbuf_start) + cb->subbuf_start = subbuf_start_default_callback; + if (!cb->buf_mapped) + cb->buf_mapped = buf_mapped_default_callback; + if (!cb->buf_unmapped) + cb->buf_unmapped = buf_unmapped_default_callback; + if (!cb->create_buf_file) + cb->create_buf_file = create_buf_file_default_callback; + if (!cb->remove_buf_file) + cb->remove_buf_file = remove_buf_file_default_callback; + chan->cb = cb; +} + +/** + * relay_open - create a new relay channel + * @base_filename: base name of files to create + * @parent: dentry of parent directory, NULL for root directory + * @subbuf_size: size of sub-buffers + * @n_subbufs: number of sub-buffers + * @cb: client callback functions + * + * Returns channel pointer if successful, NULL otherwise. + * + * Creates a channel buffer for each cpu using the sizes and + * attributes specified. The created channel buffer files + * will be named base_filename0...base_filenameN-1. File + * permissions will be S_IRUSR. 
+ */ +struct rchan *relay_open(const char *base_filename, + struct dentry *parent, + size_t subbuf_size, + size_t n_subbufs, + struct rchan_callbacks *cb) +{ + unsigned int i; + struct rchan *chan; + char *tmpname; + int is_global = 0; + + if (!base_filename) + return NULL; + + if (!(subbuf_size && n_subbufs)) + return NULL; + + chan = kcalloc(1, sizeof(struct rchan), GFP_KERNEL); + if (!chan) + return NULL; + + chan->version = RELAYFS_CHANNEL_VERSION; + chan->n_subbufs = n_subbufs; + chan->subbuf_size = subbuf_size; + chan->alloc_size = FIX_SIZE(subbuf_size * n_subbufs); + setup_callbacks(chan, cb); + kref_init(&chan->kref); + + tmpname = kmalloc(NAME_MAX + 1, GFP_KERNEL); + if (!tmpname) + goto free_chan; + + for_each_online_cpu(i) { + sprintf(tmpname, "%s%d", base_filename, i); + chan->buf[i] = relay_open_buf(chan, tmpname, parent, + &is_global); + if (!chan->buf[i]) + goto free_bufs; + + chan->buf[i]->cpu = i; + } + + kfree(tmpname); + return chan; + +free_bufs: + for (i = 0; i < NR_CPUS; i++) { + if (!chan->buf[i]) + break; + relay_close_buf(chan->buf[i]); + if (is_global) + break; + } + kfree(tmpname); + +free_chan: + kref_put(&chan->kref, relay_destroy_channel); + return NULL; +} +EXPORT_SYMBOL_GPL(relay_open); + +/** + * relay_switch_subbuf - switch to a new sub-buffer + * @buf: channel buffer + * @length: size of current event + * + * Returns either the length passed in or 0 if full. + * + * Performs sub-buffer-switch tasks such as invoking callbacks, + * updating padding counts, waking up readers, etc. + */ +size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length) +{ + void *old, *new; + size_t old_subbuf, new_subbuf; + + if (unlikely(length > buf->chan->subbuf_size)) + goto toobig; + + if (buf->offset != buf->chan->subbuf_size + 1) { + buf->prev_padding = buf->chan->subbuf_size - buf->offset; + old_subbuf = buf->subbufs_produced % buf->chan->n_subbufs; + buf->padding[old_subbuf] = buf->prev_padding; + buf->subbufs_produced++; + buf->dentry->d_inode->i_size += buf->chan->subbuf_size - + buf->padding[old_subbuf]; + smp_mb(); + if (waitqueue_active(&buf->read_wait)) { + PREPARE_WORK(&buf->wake_readers, wakeup_readers, buf); + schedule_delayed_work(&buf->wake_readers, 1); + } + } + + old = buf->data; + new_subbuf = buf->subbufs_produced % buf->chan->n_subbufs; + new = buf->start + new_subbuf * buf->chan->subbuf_size; + buf->offset = 0; + if (!buf->chan->cb->subbuf_start(buf, new, old, buf->prev_padding)) { + buf->offset = buf->chan->subbuf_size + 1; + return 0; + } + buf->data = new; + buf->padding[new_subbuf] = 0; + + if (unlikely(length + buf->offset > buf->chan->subbuf_size)) + goto toobig; + + return length; + +toobig: + buf->chan->last_toobig = length; + return 0; +} +EXPORT_SYMBOL_GPL(relay_switch_subbuf); + +/** + * relay_subbufs_consumed - update the buffer's sub-buffers-consumed count + * @chan: the channel + * @cpu: the cpu associated with the channel buffer to update + * @subbufs_consumed: number of sub-buffers to add to current buf's count + * + * Adds to the channel buffer's consumed sub-buffer count. + * subbufs_consumed should be the number of sub-buffers newly consumed, + * not the total consumed. + * + * NOTE: kernel clients don't need to call this function if the channel + * mode is 'overwrite'. 
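relay_open() above creates one buffer per online cpu and fills in any missing callbacks with the defaults, but the default create_buf_file callback returns NULL, so a usable client has to supply one. A minimal kernel-client sketch, assuming the client backs each buffer with a debugfs file using the relay_file_operations that linux/relay.h exposes for these buffers, and the relay_write() helper for logging (all my_* names are illustrative):

#include <linux/relay.h>
#include <linux/debugfs.h>

/* Required: give each per-cpu buffer a file the callbacks manage. */
static struct dentry *my_create_buf_file(const char *filename,
					 struct dentry *parent, int mode,
					 struct rchan_buf *buf,
					 int *is_global)
{
	return debugfs_create_file(filename, mode, parent, buf,
				   &relay_file_operations);
}

static int my_remove_buf_file(struct dentry *dentry)
{
	debugfs_remove(dentry);
	return 0;
}

static struct rchan_callbacks my_relay_callbacks = {
	.create_buf_file	= my_create_buf_file,
	.remove_buf_file	= my_remove_buf_file,
};

static struct rchan *my_chan;

static int my_trace_init(void)
{
	/* eight 16 KiB sub-buffers per cpu, files "trace0".."traceN-1" */
	my_chan = relay_open("trace", NULL, 16384, 8, &my_relay_callbacks);
	return my_chan ? 0 : -ENOMEM;
}

static void my_trace_event(const void *rec, size_t len)
{
	relay_write(my_chan, rec, len);	/* copy len bytes into the current sub-buffer */
}

static void my_trace_exit(void)
{
	relay_flush(my_chan);		/* force a final sub-buffer switch */
	relay_close(my_chan);
}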
+ */ +void relay_subbufs_consumed(struct rchan *chan, + unsigned int cpu, + size_t subbufs_consumed) +{ + struct rchan_buf *buf; + + if (!chan) + return; + + if (cpu >= NR_CPUS || !chan->buf[cpu]) + return; + + buf = chan->buf[cpu]; + buf->subbufs_consumed += subbufs_consumed; + if (buf->subbufs_consumed > buf->subbufs_produced) + buf->subbufs_consumed = buf->subbufs_produced; +} +EXPORT_SYMBOL_GPL(relay_subbufs_consumed); + +/** + * relay_close - close the channel + * @chan: the channel + * + * Closes all channel buffers and frees the channel. + */ +void relay_close(struct rchan *chan) +{ + unsigned int i; + struct rchan_buf *prev = NULL; + + if (!chan) + return; + + for (i = 0; i < NR_CPUS; i++) { + if (!chan->buf[i] || chan->buf[i] == prev) + break; + relay_close_buf(chan->buf[i]); + prev = chan->buf[i]; + } + + if (chan->last_toobig) + printk(KERN_WARNING "relay: one or more items not logged " + "[item size (%Zd) > sub-buffer size (%Zd)]\n", + chan->last_toobig, chan->subbuf_size); + + kref_put(&chan->kref, relay_destroy_channel); +} +EXPORT_SYMBOL_GPL(relay_close); + +/** + * relay_flush - close the channel + * @chan: the channel + * + * Flushes all channel buffers i.e. forces buffer switch. + */ +void relay_flush(struct rchan *chan) +{ + unsigned int i; + struct rchan_buf *prev = NULL; + + if (!chan) + return; + + for (i = 0; i < NR_CPUS; i++) { + if (!chan->buf[i] || chan->buf[i] == prev) + break; + relay_switch_subbuf(chan->buf[i], 0); + prev = chan->buf[i]; + } +} +EXPORT_SYMBOL_GPL(relay_flush); + +/** + * relay_file_open - open file op for relay files + * @inode: the inode + * @filp: the file + * + * Increments the channel buffer refcount. + */ +static int relay_file_open(struct inode *inode, struct file *filp) +{ + struct rchan_buf *buf = inode->u.generic_ip; + kref_get(&buf->kref); + filp->private_data = buf; + + return 0; +} + +/** + * relay_file_mmap - mmap file op for relay files + * @filp: the file + * @vma: the vma describing what to map + * + * Calls upon relay_mmap_buf to map the file into user space. + */ +static int relay_file_mmap(struct file *filp, struct vm_area_struct *vma) +{ + struct rchan_buf *buf = filp->private_data; + return relay_mmap_buf(buf, vma); +} + +/** + * relay_file_poll - poll file op for relay files + * @filp: the file + * @wait: poll table + * + * Poll implemention. + */ +static unsigned int relay_file_poll(struct file *filp, poll_table *wait) +{ + unsigned int mask = 0; + struct rchan_buf *buf = filp->private_data; + + if (buf->finalized) + return POLLERR; + + if (filp->f_mode & FMODE_READ) { + poll_wait(filp, &buf->read_wait, wait); + if (!relay_buf_empty(buf)) + mask |= POLLIN | POLLRDNORM; + } + + return mask; +} + +/** + * relay_file_release - release file op for relay files + * @inode: the inode + * @filp: the file + * + * Decrements the channel refcount, as the filesystem is + * no longer using it. 
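On the consumer side, each per-cpu buffer is simply the file created by the client's create_buf_file callback: relay_file_poll() above reports POLLIN once produced sub-buffers are waiting to be consumed, and the read path hands out the unconsumed bytes. A userspace sketch of a simple drain loop (the path depends entirely on where the kernel client created its files; a debugfs location is assumed here):

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <poll.h>

int main(void)
{
	char buf[8192];
	ssize_t n;
	int fd = open("/sys/kernel/debug/trace0", O_RDONLY);	/* assumed path */
	struct pollfd pfd;

	if (fd < 0)
		return 1;
	pfd.fd = fd;
	pfd.events = POLLIN;
	for (;;) {
		if (poll(&pfd, 1, -1) < 0)
			break;
		n = read(fd, buf, sizeof(buf));	/* consumes relay data */
		if (n < 0)
			break;
		if (n > 0)
			fwrite(buf, 1, n, stdout);	/* hand off to storage/processing */
	}
	close(fd);
	return 0;
}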
+ */ +static int relay_file_release(struct inode *inode, struct file *filp) +{ + struct rchan_buf *buf = filp->private_data; + kref_put(&buf->kref, relay_remove_buf); + + return 0; +} + +/** + * relay_file_read_consume - update the consumed count for the buffer + */ +static void relay_file_read_consume(struct rchan_buf *buf, + size_t read_pos, + size_t bytes_consumed) +{ + size_t subbuf_size = buf->chan->subbuf_size; + size_t n_subbufs = buf->chan->n_subbufs; + size_t read_subbuf; + + if (buf->bytes_consumed + bytes_consumed > subbuf_size) { + relay_subbufs_consumed(buf->chan, buf->cpu, 1); + buf->bytes_consumed = 0; + } + + buf->bytes_consumed += bytes_consumed; + read_subbuf = read_pos / buf->chan->subbuf_size; + if (buf->bytes_consumed + buf->padding[read_subbuf] == subbuf_size) { + if ((read_subbuf == buf->subbufs_produced % n_subbufs) && + (buf->offset == subbuf_size)) + return; + relay_subbufs_consumed(buf->chan, buf->cpu, 1); + buf->bytes_consumed = 0; + } +} + +/** + * relay_file_read_avail - boolean, are there unconsumed bytes available? + */ +static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos) +{ + size_t subbuf_size = buf->chan->subbuf_size; + size_t n_subbufs = buf->chan->n_subbufs; + size_t produced = buf->subbufs_produced; + size_t consumed = buf->subbufs_consumed; + + relay_file_read_consume(buf, read_pos, 0); + + if (unlikely(buf->offset > subbuf_size)) { + if (produced == consumed) + return 0; + return 1; + } + + if (unlikely(produced - consumed >= n_subbufs)) { + consumed = (produced / n_subbufs) * n_subbufs; + buf->subbufs_consumed = consumed; + } + + produced = (produced % n_subbufs) * subbuf_size + buf->offset; + consumed = (consumed % n_subbufs) * subbuf_size + buf->bytes_consumed; + + if (consumed > produced) + produced += n_subbufs * subbuf_size; + + if (consumed == produced) + return 0; + + return 1; +} + +/** + * relay_file_read_subbuf_avail - return bytes available in sub-buffer + */ +static size_t relay_file_read_subbuf_avail(size_t read_pos, + struct rchan_buf *buf) +{ + size_t padding, avail = 0; + size_t read_subbuf, read_offset, write_subbuf, write_offset; + size_t subbuf_size = buf->chan->subbuf_size; + + write_subbuf = (buf->data - buf->start) / subbuf_size; + write_offset = buf->offset > subbuf_size ? subbuf_size : buf->offset; + read_subbuf = read_pos / subbuf_size; + read_offset = read_pos % subbuf_size; + padding = buf->padding[read_subbuf]; + + if (read_subbuf == write_subbuf) { + if (read_offset + padding < write_offset) + avail = write_offset - (read_offset + padding); + } else + avail = (subbuf_size - padding) - read_offset; + + return avail; +} + +/** + * relay_file_read_start_pos - find the first available byte to read + * + * If the read_pos is in the middle of padding, return the + * position of the first actually available byte, otherwise + * return the original value. 
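To make the arithmetic in relay_file_read_avail() above concrete, a small worked case: with n_subbufs = 4 and subbuf_size = 4096, suppose subbufs_produced = 6, subbufs_consumed = 1, buf->offset = 100 and bytes_consumed = 0. Because produced - consumed = 5 >= n_subbufs, the reader has fallen a full cycle behind, so subbufs_consumed is first pulled forward to (6 / 4) * 4 = 4. The byte positions then work out to produced = (6 % 4) * 4096 + 100 = 8292 and consumed = (4 % 4) * 4096 + 0 = 0; since they differ, the function reports data available, and only when the two byte positions are equal does the reader see nothing to consume.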
+ */ +static size_t relay_file_read_start_pos(size_t read_pos, + struct rchan_buf *buf) +{ + size_t read_subbuf, padding, padding_start, padding_end; + size_t subbuf_size = buf->chan->subbuf_size; + size_t n_subbufs = buf->chan->n_subbufs; + + read_subbuf = read_pos / subbuf_size; + padding = buf->padding[read_subbuf]; + padding_start = (read_subbuf + 1) * subbuf_size - padding; + padding_end = (read_subbuf + 1) * subbuf_size; + if (read_pos >= padding_start && read_pos < padding_end) { + read_subbuf = (read_subbuf + 1) % n_subbufs; + read_pos = read_subbuf * subbuf_size; + } + + return read_pos; +} + +/** + * relay_file_read_end_pos - return the new read position + */ +static size_t relay_file_read_end_pos(struct rchan_buf *buf, + size_t read_pos, + size_t count) +{ + size_t read_subbuf, padding, end_pos; + size_t subbuf_size = buf->chan->subbuf_size; + size_t n_subbufs = buf->chan->n_subbufs; + + read_subbuf = read_pos / subbuf_size; + padding = buf->padding[read_subbuf]; + if (read_pos % subbuf_size + count + padding == subbuf_size) + end_pos = (read_subbuf + 1) * subbuf_size; + else + end_pos = read_pos + count; + if (end_pos >= subbuf_size * n_subbufs) + end_pos = 0; + + return end_pos; +} + +/** + * subbuf_read_actor - read up to one subbuf's worth of data + */ +static int subbuf_read_actor(size_t read_start, + struct rchan_buf *buf, + size_t avail, + read_descriptor_t *desc, + read_actor_t actor) +{ + void *from; + int ret = 0; + + from = buf->start + read_start; + ret = avail; + if (copy_to_user(desc->arg.data, from, avail)) { + desc->error = -EFAULT; + ret = 0; + } + desc->arg.data += ret; + desc->written += ret; + desc->count -= ret; + + return ret; +} + +/** + * subbuf_send_actor - send up to one subbuf's worth of data + */ +static int subbuf_send_actor(size_t read_start, + struct rchan_buf *buf, + size_t avail, + read_descriptor_t *desc, + read_actor_t actor) +{ + unsigned long pidx, poff; + unsigned int subbuf_pages; + int ret = 0; + + subbuf_pages = buf->chan->alloc_size >> PAGE_SHIFT; + pidx = (read_start / PAGE_SIZE) % subbuf_pages; + poff = read_start & ~PAGE_MASK; + while (avail) { + struct page *p = buf->page_array[pidx]; + unsigned int len; + + len = PAGE_SIZE - poff; + if (len > avail) + len = avail; + + len = actor(desc, p, poff, len); + if (desc->error) + break; + + avail -= len; + ret += len; + poff = 0; + pidx = (pidx + 1) % subbuf_pages; + } + + return ret; +} + +typedef int (*subbuf_actor_t) (size_t read_start, + struct rchan_buf *buf, + size_t avail, + read_descriptor_t *desc, + read_actor_t actor); + +/** + * relay_file_read_subbufs - read count bytes, bridging subbuf boundaries + */ +static inline ssize_t relay_file_read_subbufs(struct file *filp, + loff_t *ppos, + size_t count, + subbuf_actor_t subbuf_actor, + read_actor_t actor, + void *target) +{ + struct rchan_buf *buf = filp->private_data; + size_t read_start, avail; + read_descriptor_t desc; + int ret; + + if (!count) + return 0; + + desc.written = 0; + desc.count = count; + desc.arg.data = target; + desc.error = 0; + + mutex_lock(&filp->f_dentry->d_inode->i_mutex); + do { + if (!relay_file_read_avail(buf, *ppos)) + break; + + read_start = relay_file_read_start_pos(*ppos, buf); + avail = relay_file_read_subbuf_avail(read_start, buf); + if (!avail) + break; + + avail = min(desc.count, avail); + ret = subbuf_actor(read_start, buf, avail, &desc, actor); + if (desc.error < 0) + break; + + if (ret) { + relay_file_read_consume(buf, read_start, ret); + *ppos = relay_file_read_end_pos(buf, read_start, ret); + } 
+ } while (desc.count && ret); + mutex_unlock(&filp->f_dentry->d_inode->i_mutex); + + return desc.written; +} + +static ssize_t relay_file_read(struct file *filp, + char __user *buffer, + size_t count, + loff_t *ppos) +{ + return relay_file_read_subbufs(filp, ppos, count, subbuf_read_actor, + NULL, buffer); +} + +static ssize_t relay_file_sendfile(struct file *filp, + loff_t *ppos, + size_t count, + read_actor_t actor, + void *target) +{ + return relay_file_read_subbufs(filp, ppos, count, subbuf_send_actor, + actor, target); +} + +struct file_operations relay_file_operations = { + .open = relay_file_open, + .poll = relay_file_poll, + .mmap = relay_file_mmap, + .read = relay_file_read, + .llseek = no_llseek, + .release = relay_file_release, + .sendfile = relay_file_sendfile, +}; +EXPORT_SYMBOL_GPL(relay_file_operations); diff --git a/kernel/sched.c b/kernel/sched.c index 4d46e90f59c..a9ecac398bb 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -49,6 +49,7 @@ #include <linux/syscalls.h> #include <linux/times.h> #include <linux/acct.h> +#include <linux/kprobes.h> #include <asm/tlb.h> #include <asm/unistd.h> @@ -144,7 +145,8 @@ (v1) * (v2_max) / (v1_max) #define DELTA(p) \ - (SCALE(TASK_NICE(p), 40, MAX_BONUS) + INTERACTIVE_DELTA) + (SCALE(TASK_NICE(p) + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 + \ + INTERACTIVE_DELTA) #define TASK_INTERACTIVE(p) \ ((p)->prio <= (p)->static_prio - DELTA(p)) @@ -237,6 +239,7 @@ struct runqueue { task_t *migration_thread; struct list_head migration_queue; + int cpu; #endif #ifdef CONFIG_SCHEDSTATS @@ -707,12 +710,6 @@ static int recalc_task_prio(task_t *p, unsigned long long now) DEF_TIMESLICE); } else { /* - * The lower the sleep avg a task has the more - * rapidly it will rise with sleep time. - */ - sleep_time *= (MAX_BONUS - CURRENT_BONUS(p)) ? : 1; - - /* * Tasks waking from uninterruptible sleep are * limited in their sleep_avg rise as they * are likely to be waiting on I/O @@ -1551,8 +1548,14 @@ static inline void finish_task_switch(runqueue_t *rq, task_t *prev) finish_lock_switch(rq, prev); if (mm) mmdrop(mm); - if (unlikely(prev_task_flags & PF_DEAD)) + if (unlikely(prev_task_flags & PF_DEAD)) { + /* + * Remove function-return probe instances associated with this + * task and put them back on the free list. + */ + kprobe_flush_task(prev); put_task_struct(prev); + } } /** @@ -1622,7 +1625,7 @@ unsigned long nr_uninterruptible(void) { unsigned long i, sum = 0; - for_each_cpu(i) + for_each_possible_cpu(i) sum += cpu_rq(i)->nr_uninterruptible; /* @@ -1639,7 +1642,7 @@ unsigned long long nr_context_switches(void) { unsigned long long i, sum = 0; - for_each_cpu(i) + for_each_possible_cpu(i) sum += cpu_rq(i)->nr_switches; return sum; @@ -1649,7 +1652,7 @@ unsigned long nr_iowait(void) { unsigned long i, sum = 0; - for_each_cpu(i) + for_each_possible_cpu(i) sum += atomic_read(&cpu_rq(i)->nr_iowait); return sum; @@ -1660,6 +1663,9 @@ unsigned long nr_iowait(void) /* * double_rq_lock - safely lock two runqueues * + * We must take them in cpu order to match code in + * dependent_sleeper and wake_dependent_sleeper. + * * Note this does not disable interrupts like task_rq_lock, * you need to do so manually before calling. 
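(Aside on the double_rq_lock()/double_lock_balance() hunks nearby, illustration only: ordering the two runqueue locks by the stable rq->cpu index instead of by pointer value guarantees every path acquires them in one global order, which rules out ABBA deadlock. A generic user-space rendering of the same idiom, with invented names:)

/* "Lock both, lower index first": the kernel code uses rq->cpu as the key. */
#include <pthread.h>

struct indexed_lock {
	int index;			/* stable ordering key, like rq->cpu */
	pthread_mutex_t lock;
};

static void lock_pair(struct indexed_lock *a, struct indexed_lock *b)
{
	if (a == b) {
		pthread_mutex_lock(&a->lock);
		return;
	}
	/* Every caller takes the lower-indexed lock first, so no two
	 * threads can each hold one lock of the pair while waiting for
	 * the other (the classic ABBA deadlock). */
	if (a->index < b->index) {
		pthread_mutex_lock(&a->lock);
		pthread_mutex_lock(&b->lock);
	} else {
		pthread_mutex_lock(&b->lock);
		pthread_mutex_lock(&a->lock);
	}
}

static void unlock_pair(struct indexed_lock *a, struct indexed_lock *b)
{
	pthread_mutex_unlock(&a->lock);
	if (a != b)
		pthread_mutex_unlock(&b->lock);
}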
*/ @@ -1671,7 +1677,7 @@ static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2) spin_lock(&rq1->lock); __acquire(rq2->lock); /* Fake it out ;) */ } else { - if (rq1 < rq2) { + if (rq1->cpu < rq2->cpu) { spin_lock(&rq1->lock); spin_lock(&rq2->lock); } else { @@ -1707,7 +1713,7 @@ static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest) __acquires(this_rq->lock) { if (unlikely(!spin_trylock(&busiest->lock))) { - if (busiest < this_rq) { + if (busiest->cpu < this_rq->cpu) { spin_unlock(&this_rq->lock); spin_lock(&busiest->lock); spin_lock(&this_rq->lock); @@ -2873,13 +2879,11 @@ asmlinkage void __sched schedule(void) * schedule() atomically, we ignore that path for now. * Otherwise, whine if we are scheduling when we should not be. */ - if (likely(!current->exit_state)) { - if (unlikely(in_atomic())) { - printk(KERN_ERR "scheduling while atomic: " - "%s/0x%08x/%d\n", - current->comm, preempt_count(), current->pid); - dump_stack(); - } + if (unlikely(in_atomic() && !current->exit_state)) { + printk(KERN_ERR "BUG: scheduling while atomic: " + "%s/0x%08x/%d\n", + current->comm, preempt_count(), current->pid); + dump_stack(); } profile_hit(SCHED_PROFILING, __builtin_return_address(0)); @@ -5570,11 +5574,31 @@ static int cpu_to_cpu_group(int cpu) } #endif +#ifdef CONFIG_SCHED_MC +static DEFINE_PER_CPU(struct sched_domain, core_domains); +static struct sched_group sched_group_core[NR_CPUS]; +#endif + +#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) +static int cpu_to_core_group(int cpu) +{ + return first_cpu(cpu_sibling_map[cpu]); +} +#elif defined(CONFIG_SCHED_MC) +static int cpu_to_core_group(int cpu) +{ + return cpu; +} +#endif + static DEFINE_PER_CPU(struct sched_domain, phys_domains); static struct sched_group sched_group_phys[NR_CPUS]; static int cpu_to_phys_group(int cpu) { -#ifdef CONFIG_SCHED_SMT +#if defined(CONFIG_SCHED_MC) + cpumask_t mask = cpu_coregroup_map(cpu); + return first_cpu(mask); +#elif defined(CONFIG_SCHED_SMT) return first_cpu(cpu_sibling_map[cpu]); #else return cpu; @@ -5597,6 +5621,32 @@ static int cpu_to_allnodes_group(int cpu) { return cpu_to_node(cpu); } +static void init_numa_sched_groups_power(struct sched_group *group_head) +{ + struct sched_group *sg = group_head; + int j; + + if (!sg) + return; +next_sg: + for_each_cpu_mask(j, sg->cpumask) { + struct sched_domain *sd; + + sd = &per_cpu(phys_domains, j); + if (j != first_cpu(sd->groups->cpumask)) { + /* + * Only add "power" once for each + * physical package. 
+ */ + continue; + } + + sg->cpu_power += sd->groups->cpu_power; + } + sg = sg->next; + if (sg != group_head) + goto next_sg; +} #endif /* @@ -5672,6 +5722,17 @@ void build_sched_domains(const cpumask_t *cpu_map) sd->parent = p; sd->groups = &sched_group_phys[group]; +#ifdef CONFIG_SCHED_MC + p = sd; + sd = &per_cpu(core_domains, i); + group = cpu_to_core_group(i); + *sd = SD_MC_INIT; + sd->span = cpu_coregroup_map(i); + cpus_and(sd->span, sd->span, *cpu_map); + sd->parent = p; + sd->groups = &sched_group_core[group]; +#endif + #ifdef CONFIG_SCHED_SMT p = sd; sd = &per_cpu(cpu_domains, i); @@ -5697,6 +5758,19 @@ void build_sched_domains(const cpumask_t *cpu_map) } #endif +#ifdef CONFIG_SCHED_MC + /* Set up multi-core groups */ + for_each_cpu_mask(i, *cpu_map) { + cpumask_t this_core_map = cpu_coregroup_map(i); + cpus_and(this_core_map, this_core_map, *cpu_map); + if (i != first_cpu(this_core_map)) + continue; + init_sched_build_groups(sched_group_core, this_core_map, + &cpu_to_core_group); + } +#endif + + /* Set up physical groups */ for (i = 0; i < MAX_NUMNODES; i++) { cpumask_t nodemask = node_to_cpumask(i); @@ -5793,51 +5867,38 @@ void build_sched_domains(const cpumask_t *cpu_map) power = SCHED_LOAD_SCALE; sd->groups->cpu_power = power; #endif +#ifdef CONFIG_SCHED_MC + sd = &per_cpu(core_domains, i); + power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1) + * SCHED_LOAD_SCALE / 10; + sd->groups->cpu_power = power; + + sd = &per_cpu(phys_domains, i); + /* + * This has to be < 2 * SCHED_LOAD_SCALE + * Lets keep it SCHED_LOAD_SCALE, so that + * while calculating NUMA group's cpu_power + * we can simply do + * numa_group->cpu_power += phys_group->cpu_power; + * + * See "only add power once for each physical pkg" + * comment below + */ + sd->groups->cpu_power = SCHED_LOAD_SCALE; +#else sd = &per_cpu(phys_domains, i); power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * (cpus_weight(sd->groups->cpumask)-1) / 10; sd->groups->cpu_power = power; - -#ifdef CONFIG_NUMA - sd = &per_cpu(allnodes_domains, i); - if (sd->groups) { - power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * - (cpus_weight(sd->groups->cpumask)-1) / 10; - sd->groups->cpu_power = power; - } #endif } #ifdef CONFIG_NUMA - for (i = 0; i < MAX_NUMNODES; i++) { - struct sched_group *sg = sched_group_nodes[i]; - int j; - - if (sg == NULL) - continue; -next_sg: - for_each_cpu_mask(j, sg->cpumask) { - struct sched_domain *sd; - int power; + for (i = 0; i < MAX_NUMNODES; i++) + init_numa_sched_groups_power(sched_group_nodes[i]); - sd = &per_cpu(phys_domains, j); - if (j != first_cpu(sd->groups->cpumask)) { - /* - * Only add "power" once for each - * physical package. 
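(Worked example, not part of the patch: with SCHED_LOAD_SCALE at 128 in this tree, the group power formula above gives 128 + (N-1)*128/10 for an N-CPU group, while the physical level is pinned to exactly SCHED_LOAD_SCALE under CONFIG_SCHED_MC so the NUMA level can simply sum package powers. A tiny stand-alone check of the arithmetic:)

/* cpu_power as computed in the hunk above, assuming SCHED_LOAD_SCALE == 128. */
#include <stdio.h>

#define SCHED_LOAD_SCALE 128UL

static unsigned long group_power(unsigned long weight)
{
	return SCHED_LOAD_SCALE + (weight - 1) * SCHED_LOAD_SCALE / 10;
}

int main(void)
{
	/* 2 SMT siblings -> 140, 4 cores in a package -> 166. */
	printf("%lu %lu\n", group_power(2), group_power(4));
	return 0;
}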
- */ - continue; - } - power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * - (cpus_weight(sd->groups->cpumask)-1) / 10; - - sg->cpu_power += power; - } - sg = sg->next; - if (sg != sched_group_nodes[i]) - goto next_sg; - } + init_numa_sched_groups_power(sched_group_allnodes); #endif /* Attach the domains */ @@ -5845,6 +5906,8 @@ next_sg: struct sched_domain *sd; #ifdef CONFIG_SCHED_SMT sd = &per_cpu(cpu_domains, i); +#elif defined(CONFIG_SCHED_MC) + sd = &per_cpu(core_domains, i); #else sd = &per_cpu(phys_domains, i); #endif @@ -6017,7 +6080,7 @@ void __init sched_init(void) runqueue_t *rq; int i, j, k; - for_each_cpu(i) { + for_each_possible_cpu(i) { prio_array_t *array; rq = cpu_rq(i); @@ -6035,6 +6098,7 @@ void __init sched_init(void) rq->push_cpu = 0; rq->migration_thread = NULL; INIT_LIST_HEAD(&rq->migration_queue); + rq->cpu = i; #endif atomic_set(&rq->nr_iowait, 0); @@ -6075,7 +6139,7 @@ void __might_sleep(char *file, int line) if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) return; prev_jiffy = jiffies; - printk(KERN_ERR "Debug: sleeping function called from invalid" + printk(KERN_ERR "BUG: sleeping function called from invalid" " context at %s:%d\n", file, line); printk("in_atomic():%d, irqs_disabled():%d\n", in_atomic(), irqs_disabled()); diff --git a/kernel/signal.c b/kernel/signal.c index ea154104a00..4922928d91f 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -22,7 +22,6 @@ #include <linux/security.h> #include <linux/syscalls.h> #include <linux/ptrace.h> -#include <linux/posix-timers.h> #include <linux/signal.h> #include <linux/audit.h> #include <linux/capability.h> @@ -147,6 +146,8 @@ static kmem_cache_t *sigqueue_cachep; #define sig_kernel_stop(sig) \ (((sig) < SIGRTMIN) && T(sig, SIG_KERNEL_STOP_MASK)) +#define sig_needs_tasklist(sig) ((sig) == SIGCONT) + #define sig_user_defined(t, signr) \ (((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_DFL) && \ ((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_IGN)) @@ -292,7 +293,7 @@ static void __sigqueue_free(struct sigqueue *q) kmem_cache_free(sigqueue_cachep, q); } -static void flush_sigqueue(struct sigpending *queue) +void flush_sigqueue(struct sigpending *queue) { struct sigqueue *q; @@ -307,9 +308,7 @@ static void flush_sigqueue(struct sigpending *queue) /* * Flush all pending signals for a task. */ - -void -flush_signals(struct task_struct *t) +void flush_signals(struct task_struct *t) { unsigned long flags; @@ -321,109 +320,6 @@ flush_signals(struct task_struct *t) } /* - * This function expects the tasklist_lock write-locked. - */ -void __exit_sighand(struct task_struct *tsk) -{ - struct sighand_struct * sighand = tsk->sighand; - - /* Ok, we're done with the signal handlers */ - tsk->sighand = NULL; - if (atomic_dec_and_test(&sighand->count)) - sighand_free(sighand); -} - -void exit_sighand(struct task_struct *tsk) -{ - write_lock_irq(&tasklist_lock); - rcu_read_lock(); - if (tsk->sighand != NULL) { - struct sighand_struct *sighand = rcu_dereference(tsk->sighand); - spin_lock(&sighand->siglock); - __exit_sighand(tsk); - spin_unlock(&sighand->siglock); - } - rcu_read_unlock(); - write_unlock_irq(&tasklist_lock); -} - -/* - * This function expects the tasklist_lock write-locked. 
- */ -void __exit_signal(struct task_struct *tsk) -{ - struct signal_struct * sig = tsk->signal; - struct sighand_struct * sighand; - - if (!sig) - BUG(); - if (!atomic_read(&sig->count)) - BUG(); - rcu_read_lock(); - sighand = rcu_dereference(tsk->sighand); - spin_lock(&sighand->siglock); - posix_cpu_timers_exit(tsk); - if (atomic_dec_and_test(&sig->count)) { - posix_cpu_timers_exit_group(tsk); - tsk->signal = NULL; - __exit_sighand(tsk); - spin_unlock(&sighand->siglock); - flush_sigqueue(&sig->shared_pending); - } else { - /* - * If there is any task waiting for the group exit - * then notify it: - */ - if (sig->group_exit_task && atomic_read(&sig->count) == sig->notify_count) { - wake_up_process(sig->group_exit_task); - sig->group_exit_task = NULL; - } - if (tsk == sig->curr_target) - sig->curr_target = next_thread(tsk); - tsk->signal = NULL; - /* - * Accumulate here the counters for all threads but the - * group leader as they die, so they can be added into - * the process-wide totals when those are taken. - * The group leader stays around as a zombie as long - * as there are other threads. When it gets reaped, - * the exit.c code will add its counts into these totals. - * We won't ever get here for the group leader, since it - * will have been the last reference on the signal_struct. - */ - sig->utime = cputime_add(sig->utime, tsk->utime); - sig->stime = cputime_add(sig->stime, tsk->stime); - sig->min_flt += tsk->min_flt; - sig->maj_flt += tsk->maj_flt; - sig->nvcsw += tsk->nvcsw; - sig->nivcsw += tsk->nivcsw; - sig->sched_time += tsk->sched_time; - __exit_sighand(tsk); - spin_unlock(&sighand->siglock); - sig = NULL; /* Marker for below. */ - } - rcu_read_unlock(); - clear_tsk_thread_flag(tsk,TIF_SIGPENDING); - flush_sigqueue(&tsk->pending); - if (sig) { - /* - * We are cleaning up the signal_struct here. - */ - exit_thread_group_keys(sig); - kmem_cache_free(signal_cachep, sig); - } -} - -void exit_signal(struct task_struct *tsk) -{ - atomic_dec(&tsk->signal->live); - - write_lock_irq(&tasklist_lock); - __exit_signal(tsk); - write_unlock_irq(&tasklist_lock); -} - -/* * Flush all handlers for a task. */ @@ -695,9 +591,7 @@ static int check_kill_permission(int sig, struct siginfo *info, } /* forward decl */ -static void do_notify_parent_cldstop(struct task_struct *tsk, - int to_self, - int why); +static void do_notify_parent_cldstop(struct task_struct *tsk, int why); /* * Handle magic process-wide effects of stop/continue signals. @@ -747,7 +641,7 @@ static void handle_stop_signal(int sig, struct task_struct *p) p->signal->group_stop_count = 0; p->signal->flags = SIGNAL_STOP_CONTINUED; spin_unlock(&p->sighand->siglock); - do_notify_parent_cldstop(p, (p->ptrace & PT_PTRACED), CLD_STOPPED); + do_notify_parent_cldstop(p, CLD_STOPPED); spin_lock(&p->sighand->siglock); } rm_from_queue(SIG_KERNEL_STOP_MASK, &p->signal->shared_pending); @@ -788,7 +682,7 @@ static void handle_stop_signal(int sig, struct task_struct *p) p->signal->flags = SIGNAL_STOP_CONTINUED; p->signal->group_exit_code = 0; spin_unlock(&p->sighand->siglock); - do_notify_parent_cldstop(p, (p->ptrace & PT_PTRACED), CLD_CONTINUED); + do_notify_parent_cldstop(p, CLD_CONTINUED); spin_lock(&p->sighand->siglock); } else { /* @@ -1120,27 +1014,37 @@ void zap_other_threads(struct task_struct *p) /* * Must be called under rcu_read_lock() or with tasklist_lock read-held. 
*/ +struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags) +{ + struct sighand_struct *sighand; + + for (;;) { + sighand = rcu_dereference(tsk->sighand); + if (unlikely(sighand == NULL)) + break; + + spin_lock_irqsave(&sighand->siglock, *flags); + if (likely(sighand == tsk->sighand)) + break; + spin_unlock_irqrestore(&sighand->siglock, *flags); + } + + return sighand; +} + int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) { unsigned long flags; - struct sighand_struct *sp; int ret; -retry: ret = check_kill_permission(sig, info, p); - if (!ret && sig && (sp = rcu_dereference(p->sighand))) { - spin_lock_irqsave(&sp->siglock, flags); - if (p->sighand != sp) { - spin_unlock_irqrestore(&sp->siglock, flags); - goto retry; - } - if ((atomic_read(&sp->count) == 0) || - (atomic_read(&p->usage) == 0)) { - spin_unlock_irqrestore(&sp->siglock, flags); - return -ESRCH; + + if (!ret && sig) { + ret = -ESRCH; + if (lock_task_sighand(p, &flags)) { + ret = __group_send_sig_info(sig, info, p); + unlock_task_sighand(p, &flags); } - ret = __group_send_sig_info(sig, info, p); - spin_unlock_irqrestore(&sp->siglock, flags); } return ret; @@ -1189,7 +1093,7 @@ kill_proc_info(int sig, struct siginfo *info, pid_t pid) struct task_struct *p; rcu_read_lock(); - if (unlikely(sig_kernel_stop(sig) || sig == SIGCONT)) { + if (unlikely(sig_needs_tasklist(sig))) { read_lock(&tasklist_lock); acquired_tasklist_lock = 1; } @@ -1405,12 +1309,10 @@ void sigqueue_free(struct sigqueue *q) __sigqueue_free(q); } -int -send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) +int send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) { unsigned long flags; int ret = 0; - struct sighand_struct *sh; BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); @@ -1424,48 +1326,17 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) */ rcu_read_lock(); - if (unlikely(p->flags & PF_EXITING)) { + if (!likely(lock_task_sighand(p, &flags))) { ret = -1; goto out_err; } -retry: - sh = rcu_dereference(p->sighand); - - spin_lock_irqsave(&sh->siglock, flags); - if (p->sighand != sh) { - /* We raced with exec() in a multithreaded process... */ - spin_unlock_irqrestore(&sh->siglock, flags); - goto retry; - } - - /* - * We do the check here again to handle the following scenario: - * - * CPU 0 CPU 1 - * send_sigqueue - * check PF_EXITING - * interrupt exit code running - * __exit_signal - * lock sighand->siglock - * unlock sighand->siglock - * lock sh->siglock - * add(tsk->pending) flush_sigqueue(tsk->pending) - * - */ - - if (unlikely(p->flags & PF_EXITING)) { - ret = -1; - goto out; - } - if (unlikely(!list_empty(&q->list))) { /* * If an SI_TIMER entry is already queue just increment * the overrun count. 
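(Usage note, not part of the patch: lock_task_sighand() above is the pattern the reworked group_send_sig_info() and send_sigqueue() now rely on; the matching unlock_task_sighand() helper is assumed to be the header-side spin_unlock_irqrestore() wrapper. A minimal sketch of a caller, with an invented function name:)

/* Sketch only: pin another task's sighand before touching its signals. */
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/signal.h>

static int example_touch_sighand(struct task_struct *p)
{
	unsigned long flags;
	int ret = -ESRCH;

	if (lock_task_sighand(p, &flags)) {
		/* p->sighand is locked here and cannot be freed or swapped
		 * (e.g. by exec in another thread) until we unlock it. */
		ret = 0;
		unlock_task_sighand(p, &flags);
	}
	return ret;
}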
*/ - if (q->info.si_code != SI_TIMER) - BUG(); + BUG_ON(q->info.si_code != SI_TIMER); q->info.si_overrun++; goto out; } @@ -1481,7 +1352,7 @@ retry: signal_wake_up(p, sig == SIGKILL); out: - spin_unlock_irqrestore(&sh->siglock, flags); + unlock_task_sighand(p, &flags); out_err: rcu_read_unlock(); @@ -1613,14 +1484,14 @@ void do_notify_parent(struct task_struct *tsk, int sig) spin_unlock_irqrestore(&psig->siglock, flags); } -static void do_notify_parent_cldstop(struct task_struct *tsk, int to_self, int why) +static void do_notify_parent_cldstop(struct task_struct *tsk, int why) { struct siginfo info; unsigned long flags; struct task_struct *parent; struct sighand_struct *sighand; - if (to_self) + if (tsk->ptrace & PT_PTRACED) parent = tsk->parent; else { tsk = tsk->group_leader; @@ -1695,7 +1566,7 @@ static void ptrace_stop(int exit_code, int nostop_code, siginfo_t *info) !(current->ptrace & PT_ATTACHED)) && (likely(current->parent->signal != current->signal) || !unlikely(current->signal->flags & SIGNAL_GROUP_EXIT))) { - do_notify_parent_cldstop(current, 1, CLD_TRAPPED); + do_notify_parent_cldstop(current, CLD_TRAPPED); read_unlock(&tasklist_lock); schedule(); } else { @@ -1744,25 +1615,17 @@ void ptrace_notify(int exit_code) static void finish_stop(int stop_count) { - int to_self; - /* * If there are no other threads in the group, or if there is * a group stop in progress and we are the last to stop, * report to the parent. When ptraced, every thread reports itself. */ - if (stop_count < 0 || (current->ptrace & PT_PTRACED)) - to_self = 1; - else if (stop_count == 0) - to_self = 0; - else - goto out; - - read_lock(&tasklist_lock); - do_notify_parent_cldstop(current, to_self, CLD_STOPPED); - read_unlock(&tasklist_lock); + if (stop_count == 0 || (current->ptrace & PT_PTRACED)) { + read_lock(&tasklist_lock); + do_notify_parent_cldstop(current, CLD_STOPPED); + read_unlock(&tasklist_lock); + } -out: schedule(); /* * Now we don't run again until continued. @@ -1776,12 +1639,10 @@ out: * Returns nonzero if we've actually stopped and released the siglock. * Returns zero if we didn't stop and still hold the siglock. */ -static int -do_signal_stop(int signr) +static int do_signal_stop(int signr) { struct signal_struct *sig = current->signal; - struct sighand_struct *sighand = current->sighand; - int stop_count = -1; + int stop_count; if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED)) return 0; @@ -1791,86 +1652,37 @@ do_signal_stop(int signr) * There is a group stop in progress. We don't need to * start another one. */ - signr = sig->group_exit_code; stop_count = --sig->group_stop_count; - current->exit_code = signr; - set_current_state(TASK_STOPPED); - if (stop_count == 0) - sig->flags = SIGNAL_STOP_STOPPED; - spin_unlock_irq(&sighand->siglock); - } - else if (thread_group_empty(current)) { - /* - * Lock must be held through transition to stopped state. - */ - current->exit_code = current->signal->group_exit_code = signr; - set_current_state(TASK_STOPPED); - sig->flags = SIGNAL_STOP_STOPPED; - spin_unlock_irq(&sighand->siglock); - } - else { + } else { /* * There is no group stop already in progress. - * We must initiate one now, but that requires - * dropping siglock to get both the tasklist lock - * and siglock again in the proper order. Note that - * this allows an intervening SIGCONT to be posted. - * We need to check for that and bail out if necessary. + * We must initiate one now. 
*/ struct task_struct *t; - spin_unlock_irq(&sighand->siglock); - - /* signals can be posted during this window */ + sig->group_exit_code = signr; - read_lock(&tasklist_lock); - spin_lock_irq(&sighand->siglock); - - if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED)) { + stop_count = 0; + for (t = next_thread(current); t != current; t = next_thread(t)) /* - * Another stop or continue happened while we - * didn't have the lock. We can just swallow this - * signal now. If we raced with a SIGCONT, that - * should have just cleared it now. If we raced - * with another processor delivering a stop signal, - * then the SIGCONT that wakes us up should clear it. + * Setting state to TASK_STOPPED for a group + * stop is always done with the siglock held, + * so this check has no races. */ - read_unlock(&tasklist_lock); - return 0; - } - - if (sig->group_stop_count == 0) { - sig->group_exit_code = signr; - stop_count = 0; - for (t = next_thread(current); t != current; - t = next_thread(t)) - /* - * Setting state to TASK_STOPPED for a group - * stop is always done with the siglock held, - * so this check has no races. - */ - if (!t->exit_state && - !(t->state & (TASK_STOPPED|TASK_TRACED))) { - stop_count++; - signal_wake_up(t, 0); - } - sig->group_stop_count = stop_count; - } - else { - /* A race with another thread while unlocked. */ - signr = sig->group_exit_code; - stop_count = --sig->group_stop_count; - } - - current->exit_code = signr; - set_current_state(TASK_STOPPED); - if (stop_count == 0) - sig->flags = SIGNAL_STOP_STOPPED; - - spin_unlock_irq(&sighand->siglock); - read_unlock(&tasklist_lock); + if (!t->exit_state && + !(t->state & (TASK_STOPPED|TASK_TRACED))) { + stop_count++; + signal_wake_up(t, 0); + } + sig->group_stop_count = stop_count; } + if (stop_count == 0) + sig->flags = SIGNAL_STOP_STOPPED; + current->exit_code = sig->group_exit_code; + __set_current_state(TASK_STOPPED); + + spin_unlock_irq(¤t->sighand->siglock); finish_stop(stop_count); return 1; } @@ -1922,6 +1734,8 @@ int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, sigset_t *mask = ¤t->blocked; int signr = 0; + try_to_freeze(); + relock: spin_lock_irq(¤t->sighand->siglock); for (;;) { @@ -1988,7 +1802,7 @@ relock: continue; /* Init gets no signals it doesn't want. 
*/ - if (current->pid == 1) + if (current == child_reaper) continue; if (sig_kernel_stop(signr)) { @@ -2099,10 +1913,11 @@ long do_no_restart_syscall(struct restart_block *param) int sigprocmask(int how, sigset_t *set, sigset_t *oldset) { int error; - sigset_t old_block; spin_lock_irq(¤t->sighand->siglock); - old_block = current->blocked; + if (oldset) + *oldset = current->blocked; + error = 0; switch (how) { case SIG_BLOCK: @@ -2119,8 +1934,7 @@ int sigprocmask(int how, sigset_t *set, sigset_t *oldset) } recalc_sigpending(); spin_unlock_irq(¤t->sighand->siglock); - if (oldset) - *oldset = old_block; + return error; } @@ -2307,7 +2121,6 @@ sys_rt_sigtimedwait(const sigset_t __user *uthese, timeout = schedule_timeout_interruptible(timeout); - try_to_freeze(); spin_lock_irq(¤t->sighand->siglock); sig = dequeue_signal(current, &these, &info); current->blocked = current->real_blocked; @@ -2429,8 +2242,7 @@ sys_rt_sigqueueinfo(int pid, int sig, siginfo_t __user *uinfo) return kill_proc_info(sig, &info, pid); } -int -do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) +int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) { struct k_sigaction *k; sigset_t mask; @@ -2456,6 +2268,7 @@ do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) if (act) { sigdelsetmask(&act->sa.sa_mask, sigmask(SIGKILL) | sigmask(SIGSTOP)); + *k = *act; /* * POSIX 3.3.1.3: * "Setting a signal action to SIG_IGN for a signal that is @@ -2468,19 +2281,8 @@ do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) * be discarded, whether or not it is blocked" */ if (act->sa.sa_handler == SIG_IGN || - (act->sa.sa_handler == SIG_DFL && - sig_kernel_ignore(sig))) { - /* - * This is a fairly rare case, so we only take the - * tasklist_lock once we're sure we'll need it. - * Now we must do this little unlock and relock - * dance to maintain the lock hierarchy. 
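(Illustration, not part of the patch: the do_sigaction() rework in the surrounding hunk keeps the POSIX rule that installing SIG_IGN, or SIG_DFL for a default-ignored signal, discards pending instances even while they are blocked. A small user-space program that observes this:)

/* Prints 1 then 0: installing SIG_IGN flushes an already-pending,
 * blocked SIGUSR1. */
#include <signal.h>
#include <stdio.h>

int main(void)
{
	sigset_t set, pending;
	struct sigaction sa;

	sigemptyset(&set);
	sigaddset(&set, SIGUSR1);
	sigprocmask(SIG_BLOCK, &set, NULL);	/* block SIGUSR1 ... */
	raise(SIGUSR1);				/* ... so it stays pending */

	sigpending(&pending);
	printf("pending before SIG_IGN: %d\n", sigismember(&pending, SIGUSR1));

	sa.sa_handler = SIG_IGN;
	sigemptyset(&sa.sa_mask);
	sa.sa_flags = 0;
	sigaction(SIGUSR1, &sa, NULL);		/* kernel discards it here */

	sigpending(&pending);
	printf("pending after SIG_IGN:  %d\n", sigismember(&pending, SIGUSR1));
	return 0;
}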
- */ + (act->sa.sa_handler == SIG_DFL && sig_kernel_ignore(sig))) { struct task_struct *t = current; - spin_unlock_irq(&t->sighand->siglock); - read_lock(&tasklist_lock); - spin_lock_irq(&t->sighand->siglock); - *k = *act; sigemptyset(&mask); sigaddset(&mask, sig); rm_from_queue_full(&mask, &t->signal->shared_pending); @@ -2489,12 +2291,7 @@ do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) recalc_sigpending_tsk(t); t = next_thread(t); } while (t != current); - spin_unlock_irq(¤t->sighand->siglock); - read_unlock(&tasklist_lock); - return 0; } - - *k = *act; } spin_unlock_irq(¤t->sighand->siglock); diff --git a/kernel/softirq.c b/kernel/softirq.c index ad3295cdded..ec8fed42a86 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -16,6 +16,7 @@ #include <linux/cpu.h> #include <linux/kthread.h> #include <linux/rcupdate.h> +#include <linux/smp.h> #include <asm/irq.h> /* @@ -495,3 +496,22 @@ __init int spawn_ksoftirqd(void) register_cpu_notifier(&cpu_nfb); return 0; } + +#ifdef CONFIG_SMP +/* + * Call a function on all processors + */ +int on_each_cpu(void (*func) (void *info), void *info, int retry, int wait) +{ + int ret = 0; + + preempt_disable(); + ret = smp_call_function(func, info, retry, wait); + local_irq_disable(); + func(info); + local_irq_enable(); + preempt_enable(); + return ret; +} +EXPORT_SYMBOL(on_each_cpu); +#endif diff --git a/kernel/softlockup.c b/kernel/softlockup.c index c67189a25d5..ced91e1ff56 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -1,12 +1,11 @@ /* * Detect Soft Lockups * - * started by Ingo Molnar, (C) 2005, Red Hat + * started by Ingo Molnar, Copyright (C) 2005, 2006 Red Hat, Inc. * * this code detects soft lockups: incidents in where on a CPU * the kernel does not reschedule for 10 seconds or more. 
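(Usage sketch, not part of the patch: the on_each_cpu() helper added to softirq.c above runs a function on every CPU, remotely via smp_call_function() and locally with interrupts disabled. A module-context caller using the four-argument signature introduced here; the function names are invented:)

/* Sketch of an on_each_cpu() caller. */
#include <linux/kernel.h>
#include <linux/smp.h>

static void say_hello(void *info)
{
	/* Runs with interrupts disabled on each CPU in turn. */
	printk(KERN_INFO "hello from CPU %d\n", smp_processor_id());
}

static void greet_all_cpus(void)
{
	/* retry = 0, wait = 1: return only after every CPU has run it. */
	on_each_cpu(say_hello, NULL, 0, 1);
}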
*/ - #include <linux/mm.h> #include <linux/cpu.h> #include <linux/init.h> @@ -17,13 +16,14 @@ static DEFINE_SPINLOCK(print_lock); -static DEFINE_PER_CPU(unsigned long, timestamp) = 0; -static DEFINE_PER_CPU(unsigned long, print_timestamp) = 0; +static DEFINE_PER_CPU(unsigned long, touch_timestamp); +static DEFINE_PER_CPU(unsigned long, print_timestamp); static DEFINE_PER_CPU(struct task_struct *, watchdog_task); static int did_panic = 0; -static int softlock_panic(struct notifier_block *this, unsigned long event, - void *ptr) + +static int +softlock_panic(struct notifier_block *this, unsigned long event, void *ptr) { did_panic = 1; @@ -36,7 +36,7 @@ static struct notifier_block panic_block = { void touch_softlockup_watchdog(void) { - per_cpu(timestamp, raw_smp_processor_id()) = jiffies; + per_cpu(touch_timestamp, raw_smp_processor_id()) = jiffies; } EXPORT_SYMBOL(touch_softlockup_watchdog); @@ -44,25 +44,35 @@ EXPORT_SYMBOL(touch_softlockup_watchdog); * This callback runs from the timer interrupt, and checks * whether the watchdog thread has hung or not: */ -void softlockup_tick(struct pt_regs *regs) +void softlockup_tick(void) { int this_cpu = smp_processor_id(); - unsigned long timestamp = per_cpu(timestamp, this_cpu); + unsigned long touch_timestamp = per_cpu(touch_timestamp, this_cpu); - if (per_cpu(print_timestamp, this_cpu) == timestamp) + /* prevent double reports: */ + if (per_cpu(print_timestamp, this_cpu) == touch_timestamp || + did_panic || + !per_cpu(watchdog_task, this_cpu)) return; - /* Do not cause a second panic when there already was one */ - if (did_panic) + /* do not print during early bootup: */ + if (unlikely(system_state != SYSTEM_RUNNING)) { + touch_softlockup_watchdog(); return; + } - if (time_after(jiffies, timestamp + 10*HZ)) { - per_cpu(print_timestamp, this_cpu) = timestamp; + /* Wake up the high-prio watchdog task every second: */ + if (time_after(jiffies, touch_timestamp + HZ)) + wake_up_process(per_cpu(watchdog_task, this_cpu)); + + /* Warn about unreasonable 10+ seconds delays: */ + if (time_after(jiffies, touch_timestamp + 10*HZ)) { + per_cpu(print_timestamp, this_cpu) = touch_timestamp; spin_lock(&print_lock); printk(KERN_ERR "BUG: soft lockup detected on CPU#%d!\n", this_cpu); - show_regs(regs); + dump_stack(); spin_unlock(&print_lock); } } @@ -77,18 +87,16 @@ static int watchdog(void * __bind_cpu) sched_setscheduler(current, SCHED_FIFO, ¶m); current->flags |= PF_NOFREEZE; - set_current_state(TASK_INTERRUPTIBLE); - /* - * Run briefly once per second - if this gets delayed for - * more than 10 seconds then the debug-printout triggers - * in softlockup_tick(): + * Run briefly once per second to reset the softlockup timestamp. + * If this gets delayed for more than 10 seconds then the + * debug-printout triggers in softlockup_tick(). 
*/ while (!kthread_should_stop()) { - msleep_interruptible(1000); + set_current_state(TASK_INTERRUPTIBLE); touch_softlockup_watchdog(); + schedule(); } - __set_current_state(TASK_RUNNING); return 0; } @@ -110,11 +118,11 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) printk("watchdog for %i failed\n", hotcpu); return NOTIFY_BAD; } + per_cpu(touch_timestamp, hotcpu) = jiffies; per_cpu(watchdog_task, hotcpu) = p; kthread_bind(p, hotcpu); break; case CPU_ONLINE: - wake_up_process(per_cpu(watchdog_task, hotcpu)); break; #ifdef CONFIG_HOTPLUG_CPU @@ -144,6 +152,5 @@ __init void spawn_softlockup_task(void) cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); register_cpu_notifier(&cpu_nfb); - notifier_chain_register(&panic_notifier_list, &panic_block); + atomic_notifier_chain_register(&panic_notifier_list, &panic_block); } - diff --git a/kernel/spinlock.c b/kernel/spinlock.c index 0375fcd5921..d1b810782bc 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c @@ -179,16 +179,16 @@ EXPORT_SYMBOL(_write_lock); #define BUILD_LOCK_OPS(op, locktype) \ void __lockfunc _##op##_lock(locktype##_t *lock) \ { \ - preempt_disable(); \ for (;;) { \ + preempt_disable(); \ if (likely(_raw_##op##_trylock(lock))) \ break; \ preempt_enable(); \ + \ if (!(lock)->break_lock) \ (lock)->break_lock = 1; \ while (!op##_can_lock(lock) && (lock)->break_lock) \ cpu_relax(); \ - preempt_disable(); \ } \ (lock)->break_lock = 0; \ } \ @@ -199,19 +199,18 @@ unsigned long __lockfunc _##op##_lock_irqsave(locktype##_t *lock) \ { \ unsigned long flags; \ \ - preempt_disable(); \ for (;;) { \ + preempt_disable(); \ local_irq_save(flags); \ if (likely(_raw_##op##_trylock(lock))) \ break; \ local_irq_restore(flags); \ - \ preempt_enable(); \ + \ if (!(lock)->break_lock) \ (lock)->break_lock = 1; \ while (!op##_can_lock(lock) && (lock)->break_lock) \ cpu_relax(); \ - preempt_disable(); \ } \ (lock)->break_lock = 0; \ return flags; \ diff --git a/kernel/sys.c b/kernel/sys.c index f91218a5463..7ef7f6054c2 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -95,99 +95,304 @@ int cad_pid = 1; * and the like. */ -static struct notifier_block *reboot_notifier_list; -static DEFINE_RWLOCK(notifier_lock); +static BLOCKING_NOTIFIER_HEAD(reboot_notifier_list); + +/* + * Notifier chain core routines. The exported routines below + * are layered on top of these, with appropriate locking added. + */ + +static int notifier_chain_register(struct notifier_block **nl, + struct notifier_block *n) +{ + while ((*nl) != NULL) { + if (n->priority > (*nl)->priority) + break; + nl = &((*nl)->next); + } + n->next = *nl; + rcu_assign_pointer(*nl, n); + return 0; +} + +static int notifier_chain_unregister(struct notifier_block **nl, + struct notifier_block *n) +{ + while ((*nl) != NULL) { + if ((*nl) == n) { + rcu_assign_pointer(*nl, n->next); + return 0; + } + nl = &((*nl)->next); + } + return -ENOENT; +} + +static int __kprobes notifier_call_chain(struct notifier_block **nl, + unsigned long val, void *v) +{ + int ret = NOTIFY_DONE; + struct notifier_block *nb; + + nb = rcu_dereference(*nl); + while (nb) { + ret = nb->notifier_call(nb, val, v); + if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK) + break; + nb = rcu_dereference(nb->next); + } + return ret; +} + +/* + * Atomic notifier chain routines. Registration and unregistration + * use a mutex, and call_chain is synchronized by RCU (no locks). 
+ */ /** - * notifier_chain_register - Add notifier to a notifier chain - * @list: Pointer to root list pointer + * atomic_notifier_chain_register - Add notifier to an atomic notifier chain + * @nh: Pointer to head of the atomic notifier chain * @n: New entry in notifier chain * - * Adds a notifier to a notifier chain. + * Adds a notifier to an atomic notifier chain. * * Currently always returns zero. */ + +int atomic_notifier_chain_register(struct atomic_notifier_head *nh, + struct notifier_block *n) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&nh->lock, flags); + ret = notifier_chain_register(&nh->head, n); + spin_unlock_irqrestore(&nh->lock, flags); + return ret; +} + +EXPORT_SYMBOL_GPL(atomic_notifier_chain_register); + +/** + * atomic_notifier_chain_unregister - Remove notifier from an atomic notifier chain + * @nh: Pointer to head of the atomic notifier chain + * @n: Entry to remove from notifier chain + * + * Removes a notifier from an atomic notifier chain. + * + * Returns zero on success or %-ENOENT on failure. + */ +int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh, + struct notifier_block *n) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&nh->lock, flags); + ret = notifier_chain_unregister(&nh->head, n); + spin_unlock_irqrestore(&nh->lock, flags); + synchronize_rcu(); + return ret; +} + +EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister); + +/** + * atomic_notifier_call_chain - Call functions in an atomic notifier chain + * @nh: Pointer to head of the atomic notifier chain + * @val: Value passed unmodified to notifier function + * @v: Pointer passed unmodified to notifier function + * + * Calls each function in a notifier chain in turn. The functions + * run in an atomic context, so they must not block. + * This routine uses RCU to synchronize with changes to the chain. + * + * If the return value of the notifier can be and'ed + * with %NOTIFY_STOP_MASK then atomic_notifier_call_chain + * will return immediately, with the return value of + * the notifier function which halted execution. + * Otherwise the return value is the return value + * of the last notifier function called. + */ -int notifier_chain_register(struct notifier_block **list, struct notifier_block *n) +int atomic_notifier_call_chain(struct atomic_notifier_head *nh, + unsigned long val, void *v) { - write_lock(¬ifier_lock); - while(*list) - { - if(n->priority > (*list)->priority) - break; - list= &((*list)->next); - } - n->next = *list; - *list=n; - write_unlock(¬ifier_lock); - return 0; + int ret; + + rcu_read_lock(); + ret = notifier_call_chain(&nh->head, val, v); + rcu_read_unlock(); + return ret; } -EXPORT_SYMBOL(notifier_chain_register); +EXPORT_SYMBOL_GPL(atomic_notifier_call_chain); + +/* + * Blocking notifier chain routines. All access to the chain is + * synchronized by an rwsem. + */ /** - * notifier_chain_unregister - Remove notifier from a notifier chain - * @nl: Pointer to root list pointer + * blocking_notifier_chain_register - Add notifier to a blocking notifier chain + * @nh: Pointer to head of the blocking notifier chain * @n: New entry in notifier chain * - * Removes a notifier from a notifier chain. + * Adds a notifier to a blocking notifier chain. + * Must be called in process context. * - * Returns zero on success, or %-ENOENT on failure. + * Currently always returns zero. 
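(Usage note, not part of the patch: the atomic chain API above is what the softlockup.c hunk earlier in this diff uses for panic_notifier_list, whose declaration becomes a struct atomic_notifier_head in the headers. A stripped-down version of that registration pattern, with invented names:)

/* Registering on an atomic notifier chain; mirrors the softlockup.c hunk. */
#include <linux/kernel.h>
#include <linux/notifier.h>

static int example_panic_notify(struct notifier_block *this,
				unsigned long event, void *ptr)
{
	/* Atomic context: do not sleep here. */
	return NOTIFY_DONE;
}

static struct notifier_block example_panic_block = {
	.notifier_call = example_panic_notify,
};

static void example_hook_panic(void)
{
	atomic_notifier_chain_register(&panic_notifier_list,
				       &example_panic_block);
}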
*/ -int notifier_chain_unregister(struct notifier_block **nl, struct notifier_block *n) +int blocking_notifier_chain_register(struct blocking_notifier_head *nh, + struct notifier_block *n) { - write_lock(¬ifier_lock); - while((*nl)!=NULL) - { - if((*nl)==n) - { - *nl=n->next; - write_unlock(¬ifier_lock); - return 0; - } - nl=&((*nl)->next); - } - write_unlock(¬ifier_lock); - return -ENOENT; + int ret; + + /* + * This code gets used during boot-up, when task switching is + * not yet working and interrupts must remain disabled. At + * such times we must not call down_write(). + */ + if (unlikely(system_state == SYSTEM_BOOTING)) + return notifier_chain_register(&nh->head, n); + + down_write(&nh->rwsem); + ret = notifier_chain_register(&nh->head, n); + up_write(&nh->rwsem); + return ret; } -EXPORT_SYMBOL(notifier_chain_unregister); +EXPORT_SYMBOL_GPL(blocking_notifier_chain_register); /** - * notifier_call_chain - Call functions in a notifier chain - * @n: Pointer to root pointer of notifier chain + * blocking_notifier_chain_unregister - Remove notifier from a blocking notifier chain + * @nh: Pointer to head of the blocking notifier chain + * @n: Entry to remove from notifier chain + * + * Removes a notifier from a blocking notifier chain. + * Must be called from process context. + * + * Returns zero on success or %-ENOENT on failure. + */ +int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh, + struct notifier_block *n) +{ + int ret; + + /* + * This code gets used during boot-up, when task switching is + * not yet working and interrupts must remain disabled. At + * such times we must not call down_write(). + */ + if (unlikely(system_state == SYSTEM_BOOTING)) + return notifier_chain_unregister(&nh->head, n); + + down_write(&nh->rwsem); + ret = notifier_chain_unregister(&nh->head, n); + up_write(&nh->rwsem); + return ret; +} + +EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister); + +/** + * blocking_notifier_call_chain - Call functions in a blocking notifier chain + * @nh: Pointer to head of the blocking notifier chain * @val: Value passed unmodified to notifier function * @v: Pointer passed unmodified to notifier function * - * Calls each function in a notifier chain in turn. + * Calls each function in a notifier chain in turn. The functions + * run in a process context, so they are allowed to block. * - * If the return value of the notifier can be and'd - * with %NOTIFY_STOP_MASK, then notifier_call_chain + * If the return value of the notifier can be and'ed + * with %NOTIFY_STOP_MASK then blocking_notifier_call_chain * will return immediately, with the return value of * the notifier function which halted execution. - * Otherwise, the return value is the return value + * Otherwise the return value is the return value * of the last notifier function called. */ -int __kprobes notifier_call_chain(struct notifier_block **n, unsigned long val, void *v) +int blocking_notifier_call_chain(struct blocking_notifier_head *nh, + unsigned long val, void *v) { - int ret=NOTIFY_DONE; - struct notifier_block *nb = *n; + int ret; - while(nb) - { - ret=nb->notifier_call(nb,val,v); - if(ret&NOTIFY_STOP_MASK) - { - return ret; - } - nb=nb->next; - } + down_read(&nh->rwsem); + ret = notifier_call_chain(&nh->head, val, v); + up_read(&nh->rwsem); return ret; } -EXPORT_SYMBOL(notifier_call_chain); +EXPORT_SYMBOL_GPL(blocking_notifier_call_chain); + +/* + * Raw notifier chain routines. There is no protection; + * the caller must provide it. Use at your own risk! 
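(Usage sketch, not part of the patch: the blocking flavour shown here is what register_reboot_notifier() is rebased onto further down; callbacks run in process context via blocking_notifier_call_chain() and may sleep. A driver-style caller with invented names, assuming the usual module boilerplate:)

/* Sketch of a blocking-chain consumer via the reboot notifier list. */
#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/reboot.h>

static int example_reboot_notify(struct notifier_block *nb,
				 unsigned long code, void *cmd)
{
	/* Process context, so sleeping (e.g. flushing device state) is
	 * allowed here. */
	return NOTIFY_DONE;
}

static struct notifier_block example_reboot_nb = {
	.notifier_call	= example_reboot_notify,
	.priority	= 0,
};

static int __init reboot_example_init(void)
{
	return register_reboot_notifier(&example_reboot_nb);
}

static void __exit reboot_example_exit(void)
{
	unregister_reboot_notifier(&example_reboot_nb);
}

module_init(reboot_example_init);
module_exit(reboot_example_exit);
MODULE_LICENSE("GPL");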
+ */ + +/** + * raw_notifier_chain_register - Add notifier to a raw notifier chain + * @nh: Pointer to head of the raw notifier chain + * @n: New entry in notifier chain + * + * Adds a notifier to a raw notifier chain. + * All locking must be provided by the caller. + * + * Currently always returns zero. + */ + +int raw_notifier_chain_register(struct raw_notifier_head *nh, + struct notifier_block *n) +{ + return notifier_chain_register(&nh->head, n); +} + +EXPORT_SYMBOL_GPL(raw_notifier_chain_register); + +/** + * raw_notifier_chain_unregister - Remove notifier from a raw notifier chain + * @nh: Pointer to head of the raw notifier chain + * @n: Entry to remove from notifier chain + * + * Removes a notifier from a raw notifier chain. + * All locking must be provided by the caller. + * + * Returns zero on success or %-ENOENT on failure. + */ +int raw_notifier_chain_unregister(struct raw_notifier_head *nh, + struct notifier_block *n) +{ + return notifier_chain_unregister(&nh->head, n); +} + +EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister); + +/** + * raw_notifier_call_chain - Call functions in a raw notifier chain + * @nh: Pointer to head of the raw notifier chain + * @val: Value passed unmodified to notifier function + * @v: Pointer passed unmodified to notifier function + * + * Calls each function in a notifier chain in turn. The functions + * run in an undefined context. + * All locking must be provided by the caller. + * + * If the return value of the notifier can be and'ed + * with %NOTIFY_STOP_MASK then raw_notifier_call_chain + * will return immediately, with the return value of + * the notifier function which halted execution. + * Otherwise the return value is the return value + * of the last notifier function called. + */ + +int raw_notifier_call_chain(struct raw_notifier_head *nh, + unsigned long val, void *v) +{ + return notifier_call_chain(&nh->head, val, v); +} + +EXPORT_SYMBOL_GPL(raw_notifier_call_chain); /** * register_reboot_notifier - Register function to be called at reboot time @@ -196,13 +401,13 @@ EXPORT_SYMBOL(notifier_call_chain); * Registers a function with the list of functions * to be called at reboot time. * - * Currently always returns zero, as notifier_chain_register + * Currently always returns zero, as blocking_notifier_chain_register * always returns zero. 
*/ int register_reboot_notifier(struct notifier_block * nb) { - return notifier_chain_register(&reboot_notifier_list, nb); + return blocking_notifier_chain_register(&reboot_notifier_list, nb); } EXPORT_SYMBOL(register_reboot_notifier); @@ -219,23 +424,11 @@ EXPORT_SYMBOL(register_reboot_notifier); int unregister_reboot_notifier(struct notifier_block * nb) { - return notifier_chain_unregister(&reboot_notifier_list, nb); + return blocking_notifier_chain_unregister(&reboot_notifier_list, nb); } EXPORT_SYMBOL(unregister_reboot_notifier); -#ifndef CONFIG_SECURITY -int capable(int cap) -{ - if (cap_raised(current->cap_effective, cap)) { - current->flags |= PF_SUPERPRIV; - return 1; - } - return 0; -} -EXPORT_SYMBOL(capable); -#endif - static int set_one_prio(struct task_struct *p, int niceval, int error) { int no_nice; @@ -392,7 +585,7 @@ EXPORT_SYMBOL_GPL(emergency_restart); void kernel_restart_prepare(char *cmd) { - notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); + blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); system_state = SYSTEM_RESTART; device_shutdown(); } @@ -442,7 +635,7 @@ EXPORT_SYMBOL_GPL(kernel_kexec); void kernel_shutdown_prepare(enum system_states state) { - notifier_call_chain(&reboot_notifier_list, + blocking_notifier_call_chain(&reboot_notifier_list, (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL); system_state = state; device_shutdown(); @@ -1009,69 +1202,24 @@ asmlinkage long sys_times(struct tms __user * tbuf) */ if (tbuf) { struct tms tmp; + struct task_struct *tsk = current; + struct task_struct *t; cputime_t utime, stime, cutime, cstime; -#ifdef CONFIG_SMP - if (thread_group_empty(current)) { - /* - * Single thread case without the use of any locks. - * - * We may race with release_task if two threads are - * executing. However, release task first adds up the - * counters (__exit_signal) before removing the task - * from the process tasklist (__unhash_process). - * __exit_signal also acquires and releases the - * siglock which results in the proper memory ordering - * so that the list modifications are always visible - * after the counters have been updated. - * - * If the counters have been updated by the second thread - * but the thread has not yet been removed from the list - * then the other branch will be executing which will - * block on tasklist_lock until the exit handling of the - * other task is finished. - * - * This also implies that the sighand->siglock cannot - * be held by another processor. So we can also - * skip acquiring that lock. 
- */ - utime = cputime_add(current->signal->utime, current->utime); - stime = cputime_add(current->signal->utime, current->stime); - cutime = current->signal->cutime; - cstime = current->signal->cstime; - } else -#endif - { + spin_lock_irq(&tsk->sighand->siglock); + utime = tsk->signal->utime; + stime = tsk->signal->stime; + t = tsk; + do { + utime = cputime_add(utime, t->utime); + stime = cputime_add(stime, t->stime); + t = next_thread(t); + } while (t != tsk); - /* Process with multiple threads */ - struct task_struct *tsk = current; - struct task_struct *t; - - read_lock(&tasklist_lock); - utime = tsk->signal->utime; - stime = tsk->signal->stime; - t = tsk; - do { - utime = cputime_add(utime, t->utime); - stime = cputime_add(stime, t->stime); - t = next_thread(t); - } while (t != tsk); + cutime = tsk->signal->cutime; + cstime = tsk->signal->cstime; + spin_unlock_irq(&tsk->sighand->siglock); - /* - * While we have tasklist_lock read-locked, no dying thread - * can be updating current->signal->[us]time. Instead, - * we got their counts included in the live thread loop. - * However, another thread can come in right now and - * do a wait call that updates current->signal->c[us]time. - * To make sure we always see that pair updated atomically, - * we take the siglock around fetching them. - */ - spin_lock_irq(&tsk->sighand->siglock); - cutime = tsk->signal->cutime; - cstime = tsk->signal->cstime; - spin_unlock_irq(&tsk->sighand->siglock); - read_unlock(&tasklist_lock); - } tmp.tms_utime = cputime_to_clock_t(utime); tmp.tms_stime = cputime_to_clock_t(stime); tmp.tms_cutime = cputime_to_clock_t(cutime); @@ -1227,7 +1375,7 @@ asmlinkage long sys_setsid(void) struct pid *pid; int err = -EPERM; - down(&tty_sem); + mutex_lock(&tty_mutex); write_lock_irq(&tasklist_lock); pid = find_pid(PIDTYPE_PGID, group_leader->pid); @@ -1241,7 +1389,7 @@ asmlinkage long sys_setsid(void) err = process_group(group_leader); out: write_unlock_irq(&tasklist_lock); - up(&tty_sem); + mutex_unlock(&tty_mutex); return err; } @@ -1375,7 +1523,7 @@ static void groups_sort(struct group_info *group_info) /* a simple bsearch */ int groups_search(struct group_info *group_info, gid_t grp) { - int left, right; + unsigned int left, right; if (!group_info) return 0; @@ -1383,7 +1531,7 @@ int groups_search(struct group_info *group_info, gid_t grp) left = 0; right = group_info->ngroups; while (left < right) { - int mid = (left+right)/2; + unsigned int mid = (left+right)/2; int cmp = grp - GROUP_AT(group_info, mid); if (cmp > 0) left = mid + 1; @@ -1433,7 +1581,6 @@ asmlinkage long sys_getgroups(int gidsetsize, gid_t __user *grouplist) return -EINVAL; /* no need to grab task_lock here; it cannot change */ - get_group_info(current->group_info); i = current->group_info->ngroups; if (gidsetsize) { if (i > gidsetsize) { @@ -1446,7 +1593,6 @@ asmlinkage long sys_getgroups(int gidsetsize, gid_t __user *grouplist) } } out: - put_group_info(current->group_info); return i; } @@ -1487,9 +1633,7 @@ int in_group_p(gid_t grp) { int retval = 1; if (grp != current->fsgid) { - get_group_info(current->group_info); retval = groups_search(current->group_info, grp); - put_group_info(current->group_info); } return retval; } @@ -1500,9 +1644,7 @@ int in_egroup_p(gid_t grp) { int retval = 1; if (grp != current->egid) { - get_group_info(current->group_info); retval = groups_search(current->group_info, grp); - put_group_info(current->group_info); } return retval; } @@ -1630,20 +1772,21 @@ asmlinkage long sys_old_getrlimit(unsigned int resource, struct rlimit 
__user *r asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim) { struct rlimit new_rlim, *old_rlim; + unsigned long it_prof_secs; int retval; if (resource >= RLIM_NLIMITS) return -EINVAL; - if(copy_from_user(&new_rlim, rlim, sizeof(*rlim))) + if (copy_from_user(&new_rlim, rlim, sizeof(*rlim))) return -EFAULT; - if (new_rlim.rlim_cur > new_rlim.rlim_max) - return -EINVAL; + if (new_rlim.rlim_cur > new_rlim.rlim_max) + return -EINVAL; old_rlim = current->signal->rlim + resource; if ((new_rlim.rlim_max > old_rlim->rlim_max) && !capable(CAP_SYS_RESOURCE)) return -EPERM; if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > NR_OPEN) - return -EPERM; + return -EPERM; retval = security_task_setrlimit(resource, &new_rlim); if (retval) @@ -1653,19 +1796,40 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim) *old_rlim = new_rlim; task_unlock(current->group_leader); - if (resource == RLIMIT_CPU && new_rlim.rlim_cur != RLIM_INFINITY && - (cputime_eq(current->signal->it_prof_expires, cputime_zero) || - new_rlim.rlim_cur <= cputime_to_secs( - current->signal->it_prof_expires))) { - cputime_t cputime = secs_to_cputime(new_rlim.rlim_cur); + if (resource != RLIMIT_CPU) + goto out; + + /* + * RLIMIT_CPU handling. Note that the kernel fails to return an error + * code if it rejected the user's attempt to set RLIMIT_CPU. This is a + * very long-standing error, and fixing it now risks breakage of + * applications, so we live with it + */ + if (new_rlim.rlim_cur == RLIM_INFINITY) + goto out; + + it_prof_secs = cputime_to_secs(current->signal->it_prof_expires); + if (it_prof_secs == 0 || new_rlim.rlim_cur <= it_prof_secs) { + unsigned long rlim_cur = new_rlim.rlim_cur; + cputime_t cputime; + + if (rlim_cur == 0) { + /* + * The caller is asking for an immediate RLIMIT_CPU + * expiry. But we use the zero value to mean "it was + * never set". So let's cheat and make it one second + * instead + */ + rlim_cur = 1; + } + cputime = secs_to_cputime(rlim_cur); read_lock(&tasklist_lock); spin_lock_irq(¤t->sighand->siglock); - set_process_cpu_timer(current, CPUCLOCK_PROF, - &cputime, NULL); + set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL); spin_unlock_irq(¤t->sighand->siglock); read_unlock(&tasklist_lock); } - +out: return 0; } @@ -1677,9 +1841,6 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim) * a lot simpler! (Which we're not doing right now because we're not * measuring them yet). * - * This expects to be called with tasklist_lock read-locked or better, - * and the siglock not locked. It may momentarily take the siglock. - * * When sampling multiple threads for RUSAGE_SELF, under SMP we might have * races with threads incrementing their own counters. But since word * reads are atomic, we either get new values or old values and we don't @@ -1687,6 +1848,25 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim) * the c* fields from p->signal from races with exit.c updating those * fields when reaping, so a sample either gets all the additions of a * given child after it's reaped, or none so this sample is before reaping. + * + * tasklist_lock locking optimisation: + * If we are current and single threaded, we do not need to take the tasklist + * lock or the siglock. No one else can take our signal_struct away, + * no one else can reap the children to update signal->c* counters, and + * no one else can race with the signal-> fields. 
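(Illustration, not part of the patch: the RLIMIT_CPU handling above keeps the quirk that rlim_cur == 0 is treated as one second rather than an immediate expiry, and an over-limit process receives SIGXCPU. A user-space program that exercises it:)

/* SIGXCPU arrives once the process has used rlim_cur seconds of CPU time. */
#include <signal.h>
#include <stdio.h>
#include <sys/resource.h>
#include <unistd.h>

static void on_xcpu(int sig)
{
	_exit(0);		/* async-signal-safe exit */
}

int main(void)
{
	struct rlimit rl;
	volatile unsigned long spin = 0;

	signal(SIGXCPU, on_xcpu);
	getrlimit(RLIMIT_CPU, &rl);
	rl.rlim_cur = 1;	/* one second of CPU time */
	if (setrlimit(RLIMIT_CPU, &rl) != 0) {
		perror("setrlimit");
		return 1;
	}
	for (;;)		/* burn CPU until SIGXCPU fires */
		spin++;
}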
+ * If we do not take the tasklist_lock, the signal-> fields could be read + * out of order while another thread was just exiting. So we place a + * read memory barrier when we avoid the lock. On the writer side, + * write memory barrier is implied in __exit_signal as __exit_signal releases + * the siglock spinlock after updating the signal-> fields. + * + * We don't really need the siglock when we access the non c* fields + * of the signal_struct (for RUSAGE_SELF) even in multithreaded + * case, since we take the tasklist lock for read and the non c* signal-> + * fields are updated only in __exit_signal, which is called with + * tasklist_lock taken for write, hence these two threads cannot execute + * concurrently. + * */ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) @@ -1694,13 +1874,23 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) struct task_struct *t; unsigned long flags; cputime_t utime, stime; + int need_lock = 0; memset((char *) r, 0, sizeof *r); + utime = stime = cputime_zero; - if (unlikely(!p->signal)) - return; + if (p != current || !thread_group_empty(p)) + need_lock = 1; - utime = stime = cputime_zero; + if (need_lock) { + read_lock(&tasklist_lock); + if (unlikely(!p->signal)) { + read_unlock(&tasklist_lock); + return; + } + } else + /* See locking comments above */ + smp_rmb(); switch (who) { case RUSAGE_BOTH: @@ -1740,6 +1930,8 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) BUG(); } + if (need_lock) + read_unlock(&tasklist_lock); cputime_to_timeval(utime, &r->ru_utime); cputime_to_timeval(stime, &r->ru_stime); } @@ -1747,9 +1939,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) int getrusage(struct task_struct *p, int who, struct rusage __user *ru) { struct rusage r; - read_lock(&tasklist_lock); k_getrusage(p, who, &r); - read_unlock(&tasklist_lock); return copy_to_user(ru, &r, sizeof(r)) ? 
-EFAULT : 0; } diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 1067090db6b..d82864c4a61 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -42,6 +42,10 @@ cond_syscall(sys_recvmsg); cond_syscall(sys_socketcall); cond_syscall(sys_futex); cond_syscall(compat_sys_futex); +cond_syscall(sys_set_robust_list); +cond_syscall(compat_sys_set_robust_list); +cond_syscall(sys_get_robust_list); +cond_syscall(compat_sys_get_robust_list); cond_syscall(sys_epoll_create); cond_syscall(sys_epoll_ctl); cond_syscall(sys_epoll_wait); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 32b48e8ee36..e82726faeef 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -742,18 +742,18 @@ static ctl_table vm_table[] = { { .ctl_name = VM_DIRTY_WB_CS, .procname = "dirty_writeback_centisecs", - .data = &dirty_writeback_centisecs, - .maxlen = sizeof(dirty_writeback_centisecs), + .data = &dirty_writeback_interval, + .maxlen = sizeof(dirty_writeback_interval), .mode = 0644, .proc_handler = &dirty_writeback_centisecs_handler, }, { .ctl_name = VM_DIRTY_EXPIRE_CS, .procname = "dirty_expire_centisecs", - .data = &dirty_expire_centisecs, - .maxlen = sizeof(dirty_expire_centisecs), + .data = &dirty_expire_interval, + .maxlen = sizeof(dirty_expire_interval), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = &proc_dointvec_userhz_jiffies, }, { .ctl_name = VM_NR_PDFLUSH_THREADS, @@ -848,9 +848,8 @@ static ctl_table vm_table[] = { .data = &laptop_mode, .maxlen = sizeof(laptop_mode), .mode = 0644, - .proc_handler = &proc_dointvec, - .strategy = &sysctl_intvec, - .extra1 = &zero, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies, }, { .ctl_name = VM_BLOCK_DUMP, @@ -2054,6 +2053,8 @@ static int do_proc_dointvec_jiffies_conv(int *negp, unsigned long *lvalp, int write, void *data) { if (write) { + if (*lvalp > LONG_MAX / HZ) + return 1; *valp = *negp ? -(*lvalp*HZ) : (*lvalp*HZ); } else { int val = *valp; @@ -2075,6 +2076,8 @@ static int do_proc_dointvec_userhz_jiffies_conv(int *negp, unsigned long *lvalp, int write, void *data) { if (write) { + if (USER_HZ < HZ && *lvalp > (LONG_MAX / HZ) * USER_HZ) + return 1; *valp = clock_t_to_jiffies(*negp ? -*lvalp : *lvalp); } else { int val = *valp; diff --git a/kernel/time.c b/kernel/time.c index 804539165d8..ff8e7019c4c 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -202,24 +202,6 @@ asmlinkage long sys_settimeofday(struct timeval __user *tv, return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL); } -long pps_offset; /* pps time offset (us) */ -long pps_jitter = MAXTIME; /* time dispersion (jitter) (us) */ - -long pps_freq; /* frequency offset (scaled ppm) */ -long pps_stabil = MAXFREQ; /* frequency dispersion (scaled ppm) */ - -long pps_valid = PPS_VALID; /* pps signal watchdog counter */ - -int pps_shift = PPS_SHIFT; /* interval duration (s) (shift) */ - -long pps_jitcnt; /* jitter limit exceeded */ -long pps_calcnt; /* calibration intervals */ -long pps_errcnt; /* calibration errors */ -long pps_stbcnt; /* stability limit exceeded */ - -/* hook for a loadable hardpps kernel module */ -void (*hardpps_ptr)(struct timeval *); - /* we call this to notify the arch when the clock is being * controlled. If no such arch routine, do nothing. 
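The do_proc_dointvec_jiffies_conv and do_proc_dointvec_userhz_jiffies_conv hunks above reject sysctl writes whose conversion to jiffies would overflow a signed long, instead of silently storing a wrapped value. A standalone sketch of the same guard, assuming an illustrative HZ of 250 and an invented helper name:

/* jiffies_guard.c - illustrative sketch of the added overflow check */
#include <limits.h>
#include <stdio.h>

#define HZ 250	/* example tick rate; the real value is a kernel config option */

/* Convert seconds to jiffies, refusing values that would overflow a long. */
static int secs_to_jiffies_checked(unsigned long secs, long *jiffies)
{
	if (secs > LONG_MAX / HZ)
		return -1;	/* reject, as the sysctl handler now does */
	*jiffies = (long)(secs * HZ);
	return 0;
}

int main(void)
{
	long j;

	if (secs_to_jiffies_checked(60, &j) == 0)
		printf("60s -> %ld jiffies\n", j);
	if (secs_to_jiffies_checked(ULONG_MAX, &j) != 0)
		printf("over-large value rejected\n");
	return 0;
}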
*/ @@ -279,7 +261,7 @@ int do_adjtimex(struct timex *txc) result = -EINVAL; goto leave; } - time_freq = txc->freq - pps_freq; + time_freq = txc->freq; } if (txc->modes & ADJ_MAXERROR) { @@ -312,10 +294,8 @@ int do_adjtimex(struct timex *txc) if ((time_next_adjust = txc->offset) == 0) time_adjust = 0; } - else if ( time_status & (STA_PLL | STA_PPSTIME) ) { - ltemp = (time_status & (STA_PPSTIME | STA_PPSSIGNAL)) == - (STA_PPSTIME | STA_PPSSIGNAL) ? - pps_offset : txc->offset; + else if (time_status & STA_PLL) { + ltemp = txc->offset; /* * Scale the phase adjustment and @@ -356,23 +336,14 @@ int do_adjtimex(struct timex *txc) } time_freq = min(time_freq, time_tolerance); time_freq = max(time_freq, -time_tolerance); - } /* STA_PLL || STA_PPSTIME */ + } /* STA_PLL */ } /* txc->modes & ADJ_OFFSET */ if (txc->modes & ADJ_TICK) { tick_usec = txc->tick; tick_nsec = TICK_USEC_TO_NSEC(tick_usec); } } /* txc->modes */ -leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0 - || ((time_status & (STA_PPSFREQ|STA_PPSTIME)) != 0 - && (time_status & STA_PPSSIGNAL) == 0) - /* p. 24, (b) */ - || ((time_status & (STA_PPSTIME|STA_PPSJITTER)) - == (STA_PPSTIME|STA_PPSJITTER)) - /* p. 24, (c) */ - || ((time_status & STA_PPSFREQ) != 0 - && (time_status & (STA_PPSWANDER|STA_PPSERROR)) != 0)) - /* p. 24, (d) */ +leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0) result = TIME_ERROR; if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) @@ -380,7 +351,7 @@ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0 else { txc->offset = shift_right(time_offset, SHIFT_UPDATE); } - txc->freq = time_freq + pps_freq; + txc->freq = time_freq; txc->maxerror = time_maxerror; txc->esterror = time_esterror; txc->status = time_status; @@ -388,14 +359,16 @@ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0 txc->precision = time_precision; txc->tolerance = time_tolerance; txc->tick = tick_usec; - txc->ppsfreq = pps_freq; - txc->jitter = pps_jitter >> PPS_AVG; - txc->shift = pps_shift; - txc->stabil = pps_stabil; - txc->jitcnt = pps_jitcnt; - txc->calcnt = pps_calcnt; - txc->errcnt = pps_errcnt; - txc->stbcnt = pps_stbcnt; + + /* PPS is not implemented, so these are zero */ + txc->ppsfreq = 0; + txc->jitter = 0; + txc->shift = 0; + txc->stabil = 0; + txc->jitcnt = 0; + txc->calcnt = 0; + txc->errcnt = 0; + txc->stbcnt = 0; write_sequnlock_irq(&xtime_lock); do_gettimeofday(&txc->time); notify_arch_cmos_timer(); @@ -637,7 +610,7 @@ void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec) * * Returns the timespec representation of the nsec parameter. */ -struct timespec ns_to_timespec(const nsec_t nsec) +struct timespec ns_to_timespec(const s64 nsec) { struct timespec ts; @@ -657,7 +630,7 @@ struct timespec ns_to_timespec(const nsec_t nsec) * * Returns the timeval representation of the nsec parameter. 
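ns_to_timespec() and ns_to_timeval() now take a plain s64 nanosecond count rather than the nsec_t typedef. The conversion itself is the usual divide-and-normalise; a userspace approximation (struct and helper names are invented, and the kernel version uses its own 64-bit division helpers):

/* ns_to_ts_demo.c - sketch of a signed ns -> {sec, nsec} split */
#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000L

struct ts { int64_t sec; long nsec; };

static struct ts ns_to_ts(int64_t ns)
{
	struct ts t;

	t.sec  = ns / NSEC_PER_SEC;
	t.nsec = (long)(ns % NSEC_PER_SEC);
	if (t.nsec < 0) {		/* normalise so 0 <= nsec < NSEC_PER_SEC */
		t.nsec += NSEC_PER_SEC;
		t.sec  -= 1;
	}
	return t;
}

int main(void)
{
	struct ts t = ns_to_ts(-1500000000LL);	/* -1.5 s */

	printf("%lld sec, %ld nsec\n", (long long)t.sec, t.nsec);	/* -2 sec, 500000000 nsec */
	return 0;
}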
*/ -struct timeval ns_to_timeval(const nsec_t nsec) +struct timeval ns_to_timeval(const s64 nsec) { struct timespec ts = ns_to_timespec(nsec); struct timeval tv; diff --git a/kernel/timer.c b/kernel/timer.c index 2410c18dbeb..ab189dd187c 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -86,7 +86,8 @@ struct tvec_t_base_s { } ____cacheline_aligned_in_smp; typedef struct tvec_t_base_s tvec_base_t; -static DEFINE_PER_CPU(tvec_base_t, tvec_bases); +static DEFINE_PER_CPU(tvec_base_t *, tvec_bases); +static tvec_base_t boot_tvec_bases; static inline void set_running_timer(tvec_base_t *base, struct timer_list *timer) @@ -157,7 +158,7 @@ EXPORT_SYMBOL(__init_timer_base); void fastcall init_timer(struct timer_list *timer) { timer->entry.next = NULL; - timer->base = &per_cpu(tvec_bases, raw_smp_processor_id()).t_base; + timer->base = &per_cpu(tvec_bases, raw_smp_processor_id())->t_base; } EXPORT_SYMBOL(init_timer); @@ -218,7 +219,7 @@ int __mod_timer(struct timer_list *timer, unsigned long expires) ret = 1; } - new_base = &__get_cpu_var(tvec_bases); + new_base = __get_cpu_var(tvec_bases); if (base != &new_base->t_base) { /* @@ -258,7 +259,7 @@ EXPORT_SYMBOL(__mod_timer); */ void add_timer_on(struct timer_list *timer, int cpu) { - tvec_base_t *base = &per_cpu(tvec_bases, cpu); + tvec_base_t *base = per_cpu(tvec_bases, cpu); unsigned long flags; BUG_ON(timer_pending(timer) || !timer->function); @@ -504,7 +505,7 @@ unsigned long next_timer_interrupt(void) } hr_expires += jiffies; - base = &__get_cpu_var(tvec_bases); + base = __get_cpu_var(tvec_bases); spin_lock(&base->t_base.lock); expires = base->timer_jiffies + (LONG_MAX >> 1); list = NULL; @@ -696,18 +697,9 @@ static void second_overflow(void) /* * Compute the frequency estimate and additional phase adjustment due - * to frequency error for the next second. When the PPS signal is - * engaged, gnaw on the watchdog counter and update the frequency - * computed by the pll and the PPS signal. + * to frequency error for the next second. */ - pps_valid++; - if (pps_valid == PPS_VALID) { /* PPS signal lost */ - pps_jitter = MAXTIME; - pps_stabil = MAXFREQ; - time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER | - STA_PPSWANDER | STA_PPSERROR); - } - ltemp = time_freq + pps_freq; + ltemp = time_freq; time_adj += shift_right(ltemp,(SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE)); #if HZ == 100 @@ -901,7 +893,7 @@ EXPORT_SYMBOL(xtime_lock); */ static void run_timer_softirq(struct softirq_action *h) { - tvec_base_t *base = &__get_cpu_var(tvec_bases); + tvec_base_t *base = __get_cpu_var(tvec_bases); hrtimer_run_queues(); if (time_after_eq(jiffies, base->timer_jiffies)) @@ -914,6 +906,7 @@ static void run_timer_softirq(struct softirq_action *h) void run_local_timers(void) { raise_softirq(TIMER_SOFTIRQ); + softlockup_tick(); } /* @@ -944,7 +937,6 @@ void do_timer(struct pt_regs *regs) /* prevent loading jiffies before storing new jiffies_64 value. */ barrier(); update_times(); - softlockup_tick(regs); } #ifdef __ARCH_WANT_SYS_ALARM @@ -955,19 +947,7 @@ void do_timer(struct pt_regs *regs) */ asmlinkage unsigned long sys_alarm(unsigned int seconds) { - struct itimerval it_new, it_old; - unsigned int oldalarm; - - it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0; - it_new.it_value.tv_sec = seconds; - it_new.it_value.tv_usec = 0; - do_setitimer(ITIMER_REAL, &it_new, &it_old); - oldalarm = it_old.it_value.tv_sec; - /* ehhh.. We can't return 0 if we have an alarm pending.. 
*/ - /* And we'd better return too much than too little anyway */ - if ((!oldalarm && it_old.it_value.tv_usec) || it_old.it_value.tv_usec >= 500000) - oldalarm++; - return oldalarm; + return alarm_setitimer(seconds); } #endif @@ -1256,12 +1236,32 @@ asmlinkage long sys_sysinfo(struct sysinfo __user *info) return 0; } -static void __devinit init_timers_cpu(int cpu) +static int __devinit init_timers_cpu(int cpu) { int j; tvec_base_t *base; - base = &per_cpu(tvec_bases, cpu); + base = per_cpu(tvec_bases, cpu); + if (!base) { + static char boot_done; + + /* + * Cannot do allocation in init_timers as that runs before the + * allocator initializes (and would waste memory if there are + * more possible CPUs than will ever be installed/brought up). + */ + if (boot_done) { + base = kmalloc_node(sizeof(*base), GFP_KERNEL, + cpu_to_node(cpu)); + if (!base) + return -ENOMEM; + memset(base, 0, sizeof(*base)); + } else { + base = &boot_tvec_bases; + boot_done = 1; + } + per_cpu(tvec_bases, cpu) = base; + } spin_lock_init(&base->t_base.lock); for (j = 0; j < TVN_SIZE; j++) { INIT_LIST_HEAD(base->tv5.vec + j); @@ -1273,6 +1273,7 @@ static void __devinit init_timers_cpu(int cpu) INIT_LIST_HEAD(base->tv1.vec + j); base->timer_jiffies = jiffies; + return 0; } #ifdef CONFIG_HOTPLUG_CPU @@ -1295,8 +1296,8 @@ static void __devinit migrate_timers(int cpu) int i; BUG_ON(cpu_online(cpu)); - old_base = &per_cpu(tvec_bases, cpu); - new_base = &get_cpu_var(tvec_bases); + old_base = per_cpu(tvec_bases, cpu); + new_base = get_cpu_var(tvec_bases); local_irq_disable(); spin_lock(&new_base->t_base.lock); @@ -1326,7 +1327,8 @@ static int __devinit timer_cpu_notify(struct notifier_block *self, long cpu = (long)hcpu; switch(action) { case CPU_UP_PREPARE: - init_timers_cpu(cpu); + if (init_timers_cpu(cpu) < 0) + return NOTIFY_BAD; break; #ifdef CONFIG_HOTPLUG_CPU case CPU_DEAD: diff --git a/kernel/user.c b/kernel/user.c index d9deae43a9a..2116642f42c 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -105,15 +105,19 @@ void free_uid(struct user_struct *up) { unsigned long flags; + if (!up) + return; + local_irq_save(flags); - if (up && atomic_dec_and_lock(&up->__count, &uidhash_lock)) { + if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) { uid_hash_remove(up); + spin_unlock_irqrestore(&uidhash_lock, flags); key_put(up->uid_keyring); key_put(up->session_keyring); kmem_cache_free(uid_cachep, up); - spin_unlock(&uidhash_lock); + } else { + local_irq_restore(flags); } - local_irq_restore(flags); } struct user_struct * alloc_uid(uid_t uid) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index b052e2c4c71..e9e464a9037 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -27,6 +27,7 @@ #include <linux/cpu.h> #include <linux/notifier.h> #include <linux/kthread.h> +#include <linux/hardirq.h> /* * The per-CPU workqueue (if single thread, we always use the first @@ -476,6 +477,34 @@ void cancel_rearming_delayed_work(struct work_struct *work) } EXPORT_SYMBOL(cancel_rearming_delayed_work); +/** + * execute_in_process_context - reliably execute the routine with user context + * @fn: the function to execute + * @data: data to pass to the function + * @ew: guaranteed storage for the execute work structure (must + * be available when the work executes) + * + * Executes the function immediately if process context is available, + * otherwise schedules the function for delayed execution. 
+ * + * Returns: 0 - function was executed + * 1 - function was scheduled for execution + */ +int execute_in_process_context(void (*fn)(void *data), void *data, + struct execute_work *ew) +{ + if (!in_interrupt()) { + fn(data); + return 0; + } + + INIT_WORK(&ew->work, fn, data); + schedule_work(&ew->work); + + return 1; +} +EXPORT_SYMBOL_GPL(execute_in_process_context); + int keventd_up(void) { return keventd_wq != NULL;
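execute_in_process_context() gives code that may be reached from either interrupt or process context a single entry point: per the kernel-doc above, the callback runs immediately when process context is available and is otherwise queued via schedule_work(). A hedged usage sketch; the driver, structure, and function names below are invented for illustration.

/* Illustrative caller, not from the patch: a teardown path that may be
 * reached from an interrupt handler hands the final cleanup to
 * execute_in_process_context() so it never runs in interrupt context. */
#include <linux/slab.h>
#include <linux/workqueue.h>

struct foo_device {			/* hypothetical driver state */
	struct execute_work release_work;
	/* ... */
};

static void foo_device_release(void *data)
{
	struct foo_device *dev = data;

	/* cleanup this hypothetical driver must not do in interrupt context */
	kfree(dev);
}

static void foo_device_put(struct foo_device *dev)
{
	/* runs foo_device_release() now, or queues it on keventd */
	execute_in_process_context(foo_device_release, dev,
				   &dev->release_work);
}

Note the execute_work storage is embedded in the object being torn down; that is safe here because the callback is the last user of that memory and the work item has already been dequeued when it runs.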