diff options
Diffstat (limited to 'kernel')
185 files changed, 13707 insertions, 8797 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks index d2b32ac27a3..76768ee812b 100644 --- a/kernel/Kconfig.locks +++ b/kernel/Kconfig.locks @@ -220,6 +220,20 @@ config INLINE_WRITE_UNLOCK_IRQRESTORE endif +config ARCH_SUPPORTS_ATOMIC_RMW + bool + config MUTEX_SPIN_ON_OWNER def_bool y - depends on SMP && !DEBUG_MUTEXES + depends on SMP && !DEBUG_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW + +config RWSEM_SPIN_ON_OWNER + def_bool y + depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW + +config ARCH_USE_QUEUE_RWLOCK + bool + +config QUEUE_RWLOCK + def_bool y if ARCH_USE_QUEUE_RWLOCK + depends on SMP diff --git a/kernel/Makefile b/kernel/Makefile index bc010ee272b..f2a8b6246ce 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -18,11 +18,13 @@ CFLAGS_REMOVE_cgroup-debug.o = -pg CFLAGS_REMOVE_irq_work.o = -pg endif +# cond_syscall is currently not LTO compatible +CFLAGS_sys_ni.o = $(DISABLE_LTO) + obj-y += sched/ obj-y += locking/ obj-y += power/ obj-y += printk/ -obj-y += cpu/ obj-y += irq/ obj-y += rcu/ @@ -93,6 +95,7 @@ obj-$(CONFIG_PADATA) += padata.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_JUMP_LABEL) += jump_label.o obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o +obj-$(CONFIG_TORTURE_TEST) += torture.o $(obj)/configs.o: $(obj)/config_data.h diff --git a/kernel/acct.c b/kernel/acct.c index 8d6e145138b..808a86ff229 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -55,7 +55,7 @@ #include <linux/times.h> #include <linux/syscalls.h> #include <linux/mount.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/div64.h> #include <linux/blkdev.h> /* sector_div */ #include <linux/pid_namespace.h> @@ -134,7 +134,7 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file) spin_lock(&acct_lock); if (file != acct->file) { if (act) - res = act>0; + res = act > 0; goto out; } @@ -262,7 +262,7 @@ SYSCALL_DEFINE1(acct, const char __user *, name) if (name) { struct filename *tmp = getname(name); if (IS_ERR(tmp)) - return (PTR_ERR(tmp)); + return PTR_ERR(tmp); error = acct_on(tmp); putname(tmp); } else { diff --git a/kernel/audit.c b/kernel/audit.c index 34c5a2310fb..3ef2e0e797e 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -44,7 +44,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/init.h> -#include <asm/types.h> +#include <linux/types.h> #include <linux/atomic.h> #include <linux/mm.h> #include <linux/export.h> @@ -182,7 +182,7 @@ struct audit_buffer { struct audit_reply { __u32 portid; - pid_t pid; + struct net *net; struct sk_buff *skb; }; @@ -396,7 +396,7 @@ static void audit_printk_skb(struct sk_buff *skb) if (printk_ratelimit()) pr_notice("type=%d %s\n", nlh->nlmsg_type, data); else - audit_log_lost("printk limit exceeded\n"); + audit_log_lost("printk limit exceeded"); } audit_hold_skb(skb); @@ -412,7 +412,7 @@ static void kauditd_send_skb(struct sk_buff *skb) BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */ if (audit_pid) { pr_err("*NO* daemon at audit_pid=%d\n", audit_pid); - audit_log_lost("auditd disappeared\n"); + audit_log_lost("auditd disappeared"); audit_pid = 0; audit_sock = NULL; } @@ -424,6 +424,38 @@ static void kauditd_send_skb(struct sk_buff *skb) } /* + * kauditd_send_multicast_skb - send the skb to multicast userspace listeners + * + * This function doesn't consume an skb as might be expected since it has to + * copy it anyways. + */ +static void kauditd_send_multicast_skb(struct sk_buff *skb) +{ + struct sk_buff *copy; + struct audit_net *aunet = net_generic(&init_net, audit_net_id); + struct sock *sock = aunet->nlsk; + + if (!netlink_has_listeners(sock, AUDIT_NLGRP_READLOG)) + return; + + /* + * The seemingly wasteful skb_copy() rather than bumping the refcount + * using skb_get() is necessary because non-standard mods are made to + * the skb by the original kaudit unicast socket send routine. The + * existing auditd daemon assumes this breakage. Fixing this would + * require co-ordinating a change in the established protocol between + * the kaudit kernel subsystem and the auditd userspace code. There is + * no reason for new multicast clients to continue with this + * non-compliance. + */ + copy = skb_copy(skb, GFP_KERNEL); + if (!copy) + return; + + nlmsg_multicast(sock, copy, 0, AUDIT_NLGRP_READLOG, GFP_KERNEL); +} + +/* * flush_hold_queue - empty the hold queue if auditd appears * * If auditd just started, drain the queue of messages already @@ -500,7 +532,7 @@ int audit_send_list(void *_dest) { struct audit_netlink_list *dest = _dest; struct sk_buff *skb; - struct net *net = get_net_ns_by_pid(dest->pid); + struct net *net = dest->net; struct audit_net *aunet = net_generic(net, audit_net_id); /* wait for parent to finish and send an ACK */ @@ -510,6 +542,7 @@ int audit_send_list(void *_dest) while ((skb = __skb_dequeue(&dest->q)) != NULL) netlink_unicast(aunet->nlsk, skb, dest->portid, 0); + put_net(net); kfree(dest); return 0; @@ -543,7 +576,7 @@ out_kfree_skb: static int audit_send_reply_thread(void *arg) { struct audit_reply *reply = (struct audit_reply *)arg; - struct net *net = get_net_ns_by_pid(reply->pid); + struct net *net = reply->net; struct audit_net *aunet = net_generic(net, audit_net_id); mutex_lock(&audit_cmd_mutex); @@ -552,12 +585,13 @@ static int audit_send_reply_thread(void *arg) /* Ignore failure. It'll only happen if the sender goes away, because our timeout is set to infinite. */ netlink_unicast(aunet->nlsk , reply->skb, reply->portid, 0); + put_net(net); kfree(reply); return 0; } /** * audit_send_reply - send an audit reply message via netlink - * @portid: netlink port to which to send reply + * @request_skb: skb of request we are replying to (used to target the reply) * @seq: sequence number * @type: audit message type * @done: done (last) flag @@ -568,9 +602,11 @@ static int audit_send_reply_thread(void *arg) * Allocates an skb, builds the netlink message, and sends it to the port id. * No failure notifications. */ -static void audit_send_reply(__u32 portid, int seq, int type, int done, +static void audit_send_reply(struct sk_buff *request_skb, int seq, int type, int done, int multi, const void *payload, int size) { + u32 portid = NETLINK_CB(request_skb).portid; + struct net *net = sock_net(NETLINK_CB(request_skb).sk); struct sk_buff *skb; struct task_struct *tsk; struct audit_reply *reply = kmalloc(sizeof(struct audit_reply), @@ -583,8 +619,8 @@ static void audit_send_reply(__u32 portid, int seq, int type, int done, if (!skb) goto out; + reply->net = get_net(net); reply->portid = portid; - reply->pid = task_pid_vnr(current); reply->skb = skb; tsk = kthread_run(audit_send_reply_thread, reply, "audit_send_reply"); @@ -603,10 +639,19 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) { int err = 0; - /* Only support the initial namespaces for now. */ - if ((current_user_ns() != &init_user_ns) || - (task_active_pid_ns(current) != &init_pid_ns)) - return -EPERM; + /* Only support initial user namespace for now. */ + /* + * We return ECONNREFUSED because it tricks userspace into thinking + * that audit was not configured into the kernel. Lots of users + * configure their PAM stack (because that's what the distro does) + * to reject login if unable to send messages to audit. If we return + * ECONNREFUSED the PAM stack thinks the kernel does not have audit + * configured in and will let login proceed. If we return EPERM + * userspace will reject all logins. This should be removed when we + * support non init namespaces!! + */ + if (current_user_ns() != &init_user_ns) + return -ECONNREFUSED; switch (msg_type) { case AUDIT_LIST: @@ -625,13 +670,18 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) case AUDIT_TTY_SET: case AUDIT_TRIM: case AUDIT_MAKE_EQUIV: - if (!capable(CAP_AUDIT_CONTROL)) + /* Only support auditd and auditctl in initial pid namespace + * for now. */ + if ((task_active_pid_ns(current) != &init_pid_ns)) + return -EPERM; + + if (!netlink_capable(skb, CAP_AUDIT_CONTROL)) err = -EPERM; break; case AUDIT_USER: case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG: case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2: - if (!capable(CAP_AUDIT_WRITE)) + if (!netlink_capable(skb, CAP_AUDIT_WRITE)) err = -EPERM; break; default: /* bad msg */ @@ -645,6 +695,7 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type) { int rc = 0; uid_t uid = from_kuid(&init_user_ns, current_uid()); + pid_t pid = task_tgid_nr(current); if (!audit_enabled && msg_type != AUDIT_USER_AVC) { *ab = NULL; @@ -654,7 +705,7 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type) *ab = audit_log_start(NULL, GFP_KERNEL, msg_type); if (unlikely(!*ab)) return rc; - audit_log_format(*ab, "pid=%d uid=%u", task_tgid_vnr(current), uid); + audit_log_format(*ab, "pid=%d uid=%u", pid, uid); audit_log_session_info(*ab); audit_log_task_context(*ab); @@ -673,8 +724,7 @@ static int audit_get_feature(struct sk_buff *skb) seq = nlmsg_hdr(skb)->nlmsg_seq; - audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_GET, 0, 0, - &af, sizeof(af)); + audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &af, sizeof(af)); return 0; } @@ -794,8 +844,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) s.backlog = skb_queue_len(&audit_skb_queue); s.version = AUDIT_VERSION_LATEST; s.backlog_wait_time = audit_backlog_wait_time; - audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_GET, 0, 0, - &s, sizeof(s)); + audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s)); break; } case AUDIT_SET: { @@ -905,7 +954,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) seq, data, nlmsg_len(nlh)); break; case AUDIT_LIST_RULES: - err = audit_list_rules_send(NETLINK_CB(skb).portid, seq); + err = audit_list_rules_send(skb, seq); break; case AUDIT_TRIM: audit_trim_trees(); @@ -970,8 +1019,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) memcpy(sig_data->ctx, ctx, len); security_release_secctx(ctx, len); } - audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_SIGNAL_INFO, - 0, 0, sig_data, sizeof(*sig_data) + len); + audit_send_reply(skb, seq, AUDIT_SIGNAL_INFO, 0, 0, + sig_data, sizeof(*sig_data) + len); kfree(sig_data); break; case AUDIT_TTY_GET: { @@ -983,8 +1032,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) s.log_passwd = tsk->signal->audit_tty_log_passwd; spin_unlock(&tsk->sighand->siglock); - audit_send_reply(NETLINK_CB(skb).portid, seq, - AUDIT_TTY_GET, 0, 0, &s, sizeof(s)); + audit_send_reply(skb, seq, AUDIT_TTY_GET, 0, 0, &s, sizeof(s)); break; } case AUDIT_TTY_SET: { @@ -1060,10 +1108,22 @@ static void audit_receive(struct sk_buff *skb) mutex_unlock(&audit_cmd_mutex); } +/* Run custom bind function on netlink socket group connect or bind requests. */ +static int audit_bind(int group) +{ + if (!capable(CAP_AUDIT_READ)) + return -EPERM; + + return 0; +} + static int __net_init audit_net_init(struct net *net) { struct netlink_kernel_cfg cfg = { .input = audit_receive, + .bind = audit_bind, + .flags = NL_CFG_F_NONROOT_RECV, + .groups = AUDIT_NLGRP_MAX, }; struct audit_net *aunet = net_generic(net, audit_net_id); @@ -1086,7 +1146,7 @@ static void __net_exit audit_net_exit(struct net *net) audit_sock = NULL; } - rcu_assign_pointer(aunet->nlsk, NULL); + RCU_INIT_POINTER(aunet->nlsk, NULL); synchronize_net(); netlink_kernel_release(sock); } @@ -1818,11 +1878,11 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) spin_unlock_irq(&tsk->sighand->siglock); audit_log_format(ab, - " ppid=%ld pid=%d auid=%u uid=%u gid=%u" + " ppid=%d pid=%d auid=%u uid=%u gid=%u" " euid=%u suid=%u fsuid=%u" " egid=%u sgid=%u fsgid=%u tty=%s ses=%u", - sys_getppid(), - tsk->pid, + task_ppid_nr(tsk), + task_pid_nr(tsk), from_kuid(&init_user_ns, audit_get_loginuid(tsk)), from_kuid(&init_user_ns, cred->uid), from_kgid(&init_user_ns, cred->gid), @@ -1885,10 +1945,10 @@ out: * audit_log_end - end one audit record * @ab: the audit_buffer * - * The netlink_* functions cannot be called inside an irq context, so - * the audit buffer is placed on a queue and a tasklet is scheduled to - * remove them from the queue outside the irq context. May be called in - * any context. + * netlink_unicast() cannot be called inside an irq context because it blocks + * (last arg, flags, is not set to MSG_DONTWAIT), so the audit buffer is placed + * on a queue and a tasklet is scheduled to remove them from the queue outside + * the irq context. May be called in any context. */ void audit_log_end(struct audit_buffer *ab) { @@ -1898,6 +1958,18 @@ void audit_log_end(struct audit_buffer *ab) audit_log_lost("rate limit exceeded"); } else { struct nlmsghdr *nlh = nlmsg_hdr(ab->skb); + + kauditd_send_multicast_skb(ab->skb); + + /* + * The original kaudit unicast socket sends up messages with + * nlmsg_len set to the payload length rather than the entire + * message length. This breaks the standard set by netlink. + * The existing auditd daemon assumes this breakage. Fixing + * this would require co-ordinating a change in the established + * protocol between the kaudit kernel subsystem and the auditd + * userspace code. + */ nlh->nlmsg_len = ab->skb->len - NLMSG_HDRLEN; if (audit_pid) { diff --git a/kernel/audit.h b/kernel/audit.h index 57cc64d6771..7bb65730c89 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -106,6 +106,11 @@ struct audit_names { bool should_free; }; +struct audit_proctitle { + int len; /* length of the cmdline field. */ + char *value; /* the cmdline field */ +}; + /* The per-task audit context. */ struct audit_context { int dummy; /* must be the first element */ @@ -202,6 +207,7 @@ struct audit_context { } execve; }; int fds[2]; + struct audit_proctitle proctitle; #if AUDIT_DEBUG int put_count; @@ -247,7 +253,7 @@ extern void audit_panic(const char *message); struct audit_netlink_list { __u32 portid; - pid_t pid; + struct net *net; struct sk_buff_head q; }; diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 67ccf0e7cca..135944a7b28 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -916,7 +916,7 @@ static int audit_tree_handle_event(struct fsnotify_group *group, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, u32 mask, void *data, int data_type, - const unsigned char *file_name) + const unsigned char *file_name, u32 cookie) { return 0; } diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 2596fac5dcb..70b4554d2fb 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -471,7 +471,7 @@ static int audit_watch_handle_event(struct fsnotify_group *group, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, u32 mask, void *data, int data_type, - const unsigned char *dname) + const unsigned char *dname, u32 cookie) { struct inode *inode; struct audit_parent *parent; diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 14a78cca384..8e9bc9c3dbb 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -19,6 +19,8 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/kernel.h> #include <linux/audit.h> #include <linux/kthread.h> @@ -29,6 +31,8 @@ #include <linux/sched.h> #include <linux/slab.h> #include <linux/security.h> +#include <net/net_namespace.h> +#include <net/sock.h> #include "audit.h" /* @@ -224,7 +228,7 @@ static int audit_match_signal(struct audit_entry *entry) #endif /* Common user-space to kernel rule translation. */ -static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule) +static inline struct audit_entry *audit_to_entry_common(struct audit_rule_data *rule) { unsigned listnr; struct audit_entry *entry; @@ -247,7 +251,7 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule) ; } if (unlikely(rule->action == AUDIT_POSSIBLE)) { - printk(KERN_ERR "AUDIT_POSSIBLE is deprecated\n"); + pr_err("AUDIT_POSSIBLE is deprecated\n"); goto exit_err; } if (rule->action != AUDIT_NEVER && rule->action != AUDIT_ALWAYS) @@ -401,7 +405,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, int i; char *str; - entry = audit_to_entry_common((struct audit_rule *)data); + entry = audit_to_entry_common(data); if (IS_ERR(entry)) goto exit_nofree; @@ -429,6 +433,19 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, f->val = 0; } + if ((f->type == AUDIT_PID) || (f->type == AUDIT_PPID)) { + struct pid *pid; + rcu_read_lock(); + pid = find_vpid(f->val); + if (!pid) { + rcu_read_unlock(); + err = -ESRCH; + goto exit_free; + } + f->val = pid_nr(pid); + rcu_read_unlock(); + } + err = audit_field_valid(entry, f); if (err) goto exit_free; @@ -477,8 +494,8 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, /* Keep currently invalid fields around in case they * become valid after a policy reload. */ if (err == -EINVAL) { - printk(KERN_WARNING "audit rule for LSM " - "\'%s\' is invalid\n", str); + pr_warn("audit rule for LSM \'%s\' is invalid\n", + str); err = 0; } if (err) { @@ -707,8 +724,8 @@ static inline int audit_dupe_lsm_field(struct audit_field *df, /* Keep currently invalid fields around in case they * become valid after a policy reload. */ if (ret == -EINVAL) { - printk(KERN_WARNING "audit rule for LSM \'%s\' is " - "invalid\n", df->lsm_str); + pr_warn("audit rule for LSM \'%s\' is invalid\n", + df->lsm_str); ret = 0; } @@ -1065,11 +1082,13 @@ int audit_rule_change(int type, __u32 portid, int seq, void *data, /** * audit_list_rules_send - list the audit rules - * @portid: target portid for netlink audit messages + * @request_skb: skb of request we are replying to (used to target the reply) * @seq: netlink audit message sequence (serial) number */ -int audit_list_rules_send(__u32 portid, int seq) +int audit_list_rules_send(struct sk_buff *request_skb, int seq) { + u32 portid = NETLINK_CB(request_skb).portid; + struct net *net = sock_net(NETLINK_CB(request_skb).sk); struct task_struct *tsk; struct audit_netlink_list *dest; int err = 0; @@ -1083,8 +1102,8 @@ int audit_list_rules_send(__u32 portid, int seq) dest = kmalloc(sizeof(struct audit_netlink_list), GFP_KERNEL); if (!dest) return -ENOMEM; + dest->net = get_net(net); dest->portid = portid; - dest->pid = task_pid_vnr(current); skb_queue_head_init(&dest->q); mutex_lock(&audit_filter_mutex); @@ -1236,12 +1255,14 @@ static int audit_filter_user_rules(struct audit_krule *rule, int type, for (i = 0; i < rule->field_count; i++) { struct audit_field *f = &rule->fields[i]; + pid_t pid; int result = 0; u32 sid; switch (f->type) { case AUDIT_PID: - result = audit_comparator(task_pid_vnr(current), f->op, f->val); + pid = task_pid_nr(current); + result = audit_comparator(pid, f->op, f->val); break; case AUDIT_UID: result = audit_uid_comparator(current_uid(), f->op, f->uid); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 10176cd5956..21eae3c05ec 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -42,6 +42,8 @@ * and <dustin.kirkland@us.ibm.com> for LSPP certification compliance. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/init.h> #include <asm/types.h> #include <linux/atomic.h> @@ -68,6 +70,7 @@ #include <linux/capability.h> #include <linux/fs_struct.h> #include <linux/compat.h> +#include <linux/ctype.h> #include "audit.h" @@ -79,6 +82,9 @@ /* no execve audit message should be longer than this (userspace limits) */ #define MAX_EXECVE_AUDIT_LEN 7500 +/* max length to print of cmdline/proctitle value during audit */ +#define MAX_PROCTITLE_AUDIT_LEN 128 + /* number of audit rules */ int audit_n_rules; @@ -451,15 +457,17 @@ static int audit_filter_rules(struct task_struct *tsk, struct audit_field *f = &rule->fields[i]; struct audit_names *n; int result = 0; + pid_t pid; switch (f->type) { case AUDIT_PID: - result = audit_comparator(tsk->pid, f->op, f->val); + pid = task_pid_nr(tsk); + result = audit_comparator(pid, f->op, f->val); break; case AUDIT_PPID: if (ctx) { if (!ctx->ppid) - ctx->ppid = sys_getppid(); + ctx->ppid = task_ppid_nr(tsk); result = audit_comparator(ctx->ppid, f->op, f->val); } break; @@ -720,6 +728,22 @@ static enum audit_state audit_filter_task(struct task_struct *tsk, char **key) return AUDIT_BUILD_CONTEXT; } +static int audit_in_mask(const struct audit_krule *rule, unsigned long val) +{ + int word, bit; + + if (val > 0xffffffff) + return false; + + word = AUDIT_WORD(val); + if (word >= AUDIT_BITMASK_SIZE) + return false; + + bit = AUDIT_BIT(val); + + return rule->mask[word] & bit; +} + /* At syscall entry and exit time, this filter is called if the * audit_state is not low enough that auditing cannot take place, but is * also not high enough that we already know we have to write an audit @@ -737,11 +761,8 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk, rcu_read_lock(); if (!list_empty(list)) { - int word = AUDIT_WORD(ctx->major); - int bit = AUDIT_BIT(ctx->major); - list_for_each_entry_rcu(e, list, list) { - if ((e->rule.mask[word] & bit) == bit && + if (audit_in_mask(&e->rule, ctx->major) && audit_filter_rules(tsk, &e->rule, ctx, NULL, &state, false)) { rcu_read_unlock(); @@ -761,20 +782,16 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk, static int audit_filter_inode_name(struct task_struct *tsk, struct audit_names *n, struct audit_context *ctx) { - int word, bit; int h = audit_hash_ino((u32)n->ino); struct list_head *list = &audit_inode_hash[h]; struct audit_entry *e; enum audit_state state; - word = AUDIT_WORD(ctx->major); - bit = AUDIT_BIT(ctx->major); - if (list_empty(list)) return 0; list_for_each_entry_rcu(e, list, list) { - if ((e->rule.mask[word] & bit) == bit && + if (audit_in_mask(&e->rule, ctx->major) && audit_filter_rules(tsk, &e->rule, ctx, n, &state, false)) { ctx->current_state = state; return 1; @@ -805,7 +822,8 @@ void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx) rcu_read_unlock(); } -static inline struct audit_context *audit_get_context(struct task_struct *tsk, +/* Transfer the audit context pointer to the caller, clearing it in the tsk's struct */ +static inline struct audit_context *audit_take_context(struct task_struct *tsk, int return_valid, long return_code) { @@ -842,6 +860,13 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk, return context; } +static inline void audit_proctitle_free(struct audit_context *context) +{ + kfree(context->proctitle.value); + context->proctitle.value = NULL; + context->proctitle.len = 0; +} + static inline void audit_free_names(struct audit_context *context) { struct audit_names *n, *next; @@ -850,16 +875,15 @@ static inline void audit_free_names(struct audit_context *context) if (context->put_count + context->ino_count != context->name_count) { int i = 0; - printk(KERN_ERR "%s:%d(:%d): major=%d in_syscall=%d" - " name_count=%d put_count=%d" - " ino_count=%d [NOT freeing]\n", - __FILE__, __LINE__, + pr_err("%s:%d(:%d): major=%d in_syscall=%d" + " name_count=%d put_count=%d ino_count=%d" + " [NOT freeing]\n", __FILE__, __LINE__, context->serial, context->major, context->in_syscall, context->name_count, context->put_count, context->ino_count); list_for_each_entry(n, &context->names_list, list) { - printk(KERN_ERR "names[%d] = %p = %s\n", i++, - n->name, n->name->name ?: "(null)"); + pr_err("names[%d] = %p = %s\n", i++, n->name, + n->name->name ?: "(null)"); } dump_stack(); return; @@ -955,6 +979,7 @@ static inline void audit_free_context(struct audit_context *context) audit_free_aux(context); kfree(context->filterkey); kfree(context->sockaddr); + audit_proctitle_free(context); kfree(context); } @@ -1157,7 +1182,7 @@ static void audit_log_execve_info(struct audit_context *context, */ buf = kmalloc(MAX_EXECVE_AUDIT_LEN + 1, GFP_KERNEL); if (!buf) { - audit_panic("out of memory for argv string\n"); + audit_panic("out of memory for argv string"); return; } @@ -1271,6 +1296,59 @@ static void show_special(struct audit_context *context, int *call_panic) audit_log_end(ab); } +static inline int audit_proctitle_rtrim(char *proctitle, int len) +{ + char *end = proctitle + len - 1; + while (end > proctitle && !isprint(*end)) + end--; + + /* catch the case where proctitle is only 1 non-print character */ + len = end - proctitle + 1; + len -= isprint(proctitle[len-1]) == 0; + return len; +} + +static void audit_log_proctitle(struct task_struct *tsk, + struct audit_context *context) +{ + int res; + char *buf; + char *msg = "(null)"; + int len = strlen(msg); + struct audit_buffer *ab; + + ab = audit_log_start(context, GFP_KERNEL, AUDIT_PROCTITLE); + if (!ab) + return; /* audit_panic or being filtered */ + + audit_log_format(ab, "proctitle="); + + /* Not cached */ + if (!context->proctitle.value) { + buf = kmalloc(MAX_PROCTITLE_AUDIT_LEN, GFP_KERNEL); + if (!buf) + goto out; + /* Historically called this from procfs naming */ + res = get_cmdline(tsk, buf, MAX_PROCTITLE_AUDIT_LEN); + if (res == 0) { + kfree(buf); + goto out; + } + res = audit_proctitle_rtrim(buf, res); + if (res == 0) { + kfree(buf); + goto out; + } + context->proctitle.value = buf; + context->proctitle.len = res; + } + msg = context->proctitle.value; + len = context->proctitle.len; +out: + audit_log_n_untrustedstring(ab, msg, len); + audit_log_end(ab); +} + static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) { int i, call_panic = 0; @@ -1388,6 +1466,8 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts audit_log_name(context, n, NULL, i++, &call_panic); } + audit_log_proctitle(tsk, context); + /* Send end of event record to help user space know we are finished */ ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE); if (ab) @@ -1406,7 +1486,7 @@ void __audit_free(struct task_struct *tsk) { struct audit_context *context; - context = audit_get_context(tsk, 0, 0); + context = audit_take_context(tsk, 0, 0); if (!context) return; @@ -1500,7 +1580,7 @@ void __audit_syscall_exit(int success, long return_code) else success = AUDITSC_FAILURE; - context = audit_get_context(tsk, success, return_code); + context = audit_take_context(tsk, success, return_code); if (!context) return; @@ -1550,7 +1630,7 @@ static inline void handle_one(const struct inode *inode) if (likely(put_tree_ref(context, chunk))) return; if (unlikely(!grow_tree_refs(context))) { - printk(KERN_WARNING "out of memory, audit has lost a tree reference\n"); + pr_warn("out of memory, audit has lost a tree reference\n"); audit_set_auditable(context); audit_put_chunk(chunk); unroll_tree_refs(context, p, count); @@ -1609,8 +1689,7 @@ retry: goto retry; } /* too bad */ - printk(KERN_WARNING - "out of memory, audit has lost a tree reference\n"); + pr_warn("out of memory, audit has lost a tree reference\n"); unroll_tree_refs(context, p, count); audit_set_auditable(context); return; @@ -1682,7 +1761,7 @@ void __audit_getname(struct filename *name) if (!context->in_syscall) { #if AUDIT_DEBUG == 2 - printk(KERN_ERR "%s:%d(:%d): ignoring getname(%p)\n", + pr_err("%s:%d(:%d): ignoring getname(%p)\n", __FILE__, __LINE__, context->serial, name); dump_stack(); #endif @@ -1719,17 +1798,17 @@ void audit_putname(struct filename *name) struct audit_context *context = current->audit_context; BUG_ON(!context); - if (!context->in_syscall) { + if (!name->aname || !context->in_syscall) { #if AUDIT_DEBUG == 2 - printk(KERN_ERR "%s:%d(:%d): final_putname(%p)\n", + pr_err("%s:%d(:%d): final_putname(%p)\n", __FILE__, __LINE__, context->serial, name); if (context->name_count) { struct audit_names *n; int i = 0; list_for_each_entry(n, &context->names_list, list) - printk(KERN_ERR "name[%d] = %p = %s\n", i++, - n->name, n->name->name ?: "(null)"); + pr_err("name[%d] = %p = %s\n", i++, n->name, + n->name->name ?: "(null)"); } #endif final_putname(name); @@ -1738,9 +1817,8 @@ void audit_putname(struct filename *name) else { ++context->put_count; if (context->put_count > context->name_count) { - printk(KERN_ERR "%s:%d(:%d): major=%d" - " in_syscall=%d putname(%p) name_count=%d" - " put_count=%d\n", + pr_err("%s:%d(:%d): major=%d in_syscall=%d putname(%p)" + " name_count=%d put_count=%d\n", __FILE__, __LINE__, context->serial, context->major, context->in_syscall, name->name, @@ -1981,12 +2059,10 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid, ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); if (!ab) return; - audit_log_format(ab, "pid=%d uid=%u" - " old-auid=%u new-auid=%u old-ses=%u new-ses=%u" - " res=%d", - current->pid, uid, - oldloginuid, loginuid, oldsessionid, sessionid, - !rc); + audit_log_format(ab, "pid=%d uid=%u", task_pid_nr(current), uid); + audit_log_task_context(ab); + audit_log_format(ab, " old-auid=%u auid=%u old-ses=%u ses=%u res=%d", + oldloginuid, loginuid, oldsessionid, sessionid, !rc); audit_log_end(ab); } @@ -2208,7 +2284,7 @@ void __audit_ptrace(struct task_struct *t) { struct audit_context *context = current->audit_context; - context->target_pid = t->pid; + context->target_pid = task_pid_nr(t); context->target_auid = audit_get_loginuid(t); context->target_uid = task_uid(t); context->target_sessionid = audit_get_sessionid(t); @@ -2233,7 +2309,7 @@ int __audit_signal_info(int sig, struct task_struct *t) if (audit_pid && t->tgid == audit_pid) { if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) { - audit_sig_pid = tsk->pid; + audit_sig_pid = task_pid_nr(tsk); if (uid_valid(tsk->loginuid)) audit_sig_uid = tsk->loginuid; else @@ -2247,7 +2323,7 @@ int __audit_signal_info(int sig, struct task_struct *t) /* optimize the common case by putting first signal recipient directly * in audit_context */ if (!ctx->target_pid) { - ctx->target_pid = t->tgid; + ctx->target_pid = task_tgid_nr(t); ctx->target_auid = audit_get_loginuid(t); ctx->target_uid = t_uid; ctx->target_sessionid = audit_get_sessionid(t); @@ -2268,7 +2344,7 @@ int __audit_signal_info(int sig, struct task_struct *t) } BUG_ON(axp->pid_count >= AUDIT_AUX_PIDS); - axp->target_pid[axp->pid_count] = t->tgid; + axp->target_pid[axp->pid_count] = task_tgid_nr(t); axp->target_auid[axp->pid_count] = audit_get_loginuid(t); axp->target_uid[axp->pid_count] = t_uid; axp->target_sessionid[axp->pid_count] = audit_get_sessionid(t); @@ -2368,7 +2444,7 @@ static void audit_log_task(struct audit_buffer *ab) from_kgid(&init_user_ns, gid), sessionid); audit_log_task_context(ab); - audit_log_format(ab, " pid=%d comm=", current->pid); + audit_log_format(ab, " pid=%d comm=", task_pid_nr(current)); audit_log_untrustedstring(ab, current->comm); if (mm) { down_read(&mm->mmap_sem); diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c index a5e026bc45c..1323360d90e 100644 --- a/kernel/backtracetest.c +++ b/kernel/backtracetest.c @@ -19,8 +19,8 @@ static void backtrace_test_normal(void) { - printk("Testing a backtrace from process context.\n"); - printk("The following trace is a kernel self test and not a bug!\n"); + pr_info("Testing a backtrace from process context.\n"); + pr_info("The following trace is a kernel self test and not a bug!\n"); dump_stack(); } @@ -37,8 +37,8 @@ static DECLARE_TASKLET(backtrace_tasklet, &backtrace_test_irq_callback, 0); static void backtrace_test_irq(void) { - printk("Testing a backtrace from irq context.\n"); - printk("The following trace is a kernel self test and not a bug!\n"); + pr_info("Testing a backtrace from irq context.\n"); + pr_info("The following trace is a kernel self test and not a bug!\n"); init_completion(&backtrace_work); tasklet_schedule(&backtrace_tasklet); @@ -51,8 +51,8 @@ static void backtrace_test_saved(void) struct stack_trace trace; unsigned long entries[8]; - printk("Testing a saved backtrace.\n"); - printk("The following trace is a kernel self test and not a bug!\n"); + pr_info("Testing a saved backtrace.\n"); + pr_info("The following trace is a kernel self test and not a bug!\n"); trace.nr_entries = 0; trace.max_entries = ARRAY_SIZE(entries); @@ -65,19 +65,19 @@ static void backtrace_test_saved(void) #else static void backtrace_test_saved(void) { - printk("Saved backtrace test skipped.\n"); + pr_info("Saved backtrace test skipped.\n"); } #endif static int backtrace_regression_test(void) { - printk("====[ backtrace testing ]===========\n"); + pr_info("====[ backtrace testing ]===========\n"); backtrace_test_normal(); backtrace_test_irq(); backtrace_test_saved(); - printk("====[ end of backtrace testing ]====\n"); + pr_info("====[ end of backtrace testing ]====\n"); return 0; } diff --git a/kernel/capability.c b/kernel/capability.c index 34019c57888..a5cf13c018c 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -7,6 +7,8 @@ * 30 May 2002: Cleanup, Robert M. Love <rml@tech9.net> */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/audit.h> #include <linux/capability.h> #include <linux/mm.h> @@ -22,7 +24,6 @@ */ const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET; - EXPORT_SYMBOL(__cap_empty_set); int file_caps_enabled = 1; @@ -42,15 +43,10 @@ __setup("no_file_caps", file_caps_disable); static void warn_legacy_capability_use(void) { - static int warned; - if (!warned) { - char name[sizeof(current->comm)]; - - printk(KERN_INFO "warning: `%s' uses 32-bit capabilities" - " (legacy support in use)\n", - get_task_comm(name, current)); - warned = 1; - } + char name[sizeof(current->comm)]; + + pr_info_once("warning: `%s' uses 32-bit capabilities (legacy support in use)\n", + get_task_comm(name, current)); } /* @@ -71,16 +67,10 @@ static void warn_legacy_capability_use(void) static void warn_deprecated_v2(void) { - static int warned; + char name[sizeof(current->comm)]; - if (!warned) { - char name[sizeof(current->comm)]; - - printk(KERN_INFO "warning: `%s' uses deprecated v2" - " capabilities in a way that may be insecure.\n", - get_task_comm(name, current)); - warned = 1; - } + pr_info_once("warning: `%s' uses deprecated v2 capabilities in a way that may be insecure\n", + get_task_comm(name, current)); } /* @@ -198,7 +188,7 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr) * * An alternative would be to return an error here * (-ERANGE), but that causes legacy applications to - * unexpectidly fail; the capget/modify/capset aborts + * unexpectedly fail; the capget/modify/capset aborts * before modification is attempted and the application * fails. */ @@ -380,7 +370,7 @@ bool has_capability_noaudit(struct task_struct *t, int cap) bool ns_capable(struct user_namespace *ns, int cap) { if (unlikely(!cap_valid(cap))) { - printk(KERN_CRIT "capable() called with invalid cap=%u\n", cap); + pr_crit("capable() called with invalid cap=%u\n", cap); BUG(); } @@ -404,7 +394,8 @@ EXPORT_SYMBOL(ns_capable); * This does not set PF_SUPERPRIV because the caller may not * actually be privileged. */ -bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap) +bool file_ns_capable(const struct file *file, struct user_namespace *ns, + int cap) { if (WARN_ON_ONCE(!cap_valid(cap))) return false; @@ -433,23 +424,19 @@ bool capable(int cap) EXPORT_SYMBOL(capable); /** - * inode_capable - Check superior capability over inode + * capable_wrt_inode_uidgid - Check nsown_capable and uid and gid mapped * @inode: The inode in question * @cap: The capability in question * - * Return true if the current task has the given superior capability - * targeted at it's own user namespace and that the given inode is owned - * by the current user namespace or a child namespace. - * - * Currently we check to see if an inode is owned by the current - * user namespace by seeing if the inode's owner maps into the - * current user namespace. - * + * Return true if the current task has the given capability targeted at + * its own user namespace and that the given inode's uid and gid are + * mapped into the current user namespace. */ -bool inode_capable(const struct inode *inode, int cap) +bool capable_wrt_inode_uidgid(const struct inode *inode, int cap) { struct user_namespace *ns = current_user_ns(); - return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid); + return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid) && + kgid_has_mapping(ns, inode->i_gid); } -EXPORT_SYMBOL(inode_capable); +EXPORT_SYMBOL(capable_wrt_inode_uidgid); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e2f46ba37f7..70776aec256 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -26,6 +26,8 @@ * distribution for more details. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/cgroup.h> #include <linux/cred.h> #include <linux/ctype.h> @@ -33,6 +35,7 @@ #include <linux/init_task.h> #include <linux/kernel.h> #include <linux/list.h> +#include <linux/magic.h> #include <linux/mm.h> #include <linux/mutex.h> #include <linux/mount.h> @@ -40,23 +43,20 @@ #include <linux/proc_fs.h> #include <linux/rcupdate.h> #include <linux/sched.h> -#include <linux/backing-dev.h> #include <linux/slab.h> -#include <linux/magic.h> #include <linux/spinlock.h> +#include <linux/rwsem.h> #include <linux/string.h> #include <linux/sort.h> #include <linux/kmod.h> -#include <linux/module.h> #include <linux/delayacct.h> #include <linux/cgroupstats.h> #include <linux/hashtable.h> -#include <linux/namei.h> #include <linux/pid_namespace.h> #include <linux/idr.h> #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ -#include <linux/flex_array.h> /* used in cgroup_attach_task */ #include <linux/kthread.h> +#include <linux/delay.h> #include <linux/atomic.h> @@ -68,44 +68,46 @@ */ #define CGROUP_PIDLIST_DESTROY_DELAY HZ +#define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \ + MAX_CFTYPE_NAME + 2) + /* * cgroup_mutex is the master lock. Any modification to cgroup or its * hierarchy must be performed while holding it. * - * cgroup_root_mutex nests inside cgroup_mutex and should be held to modify - * cgroupfs_root of any cgroup hierarchy - subsys list, flags, - * release_agent_path and so on. Modifying requires both cgroup_mutex and - * cgroup_root_mutex. Readers can acquire either of the two. This is to - * break the following locking order cycle. + * css_set_rwsem protects task->cgroups pointer, the list of css_set + * objects, and the chain of tasks off each css_set. * - * A. cgroup_mutex -> cred_guard_mutex -> s_type->i_mutex_key -> namespace_sem - * B. namespace_sem -> cgroup_mutex - * - * B happens only through cgroup_show_options() and using cgroup_root_mutex - * breaks it. + * These locks are exported if CONFIG_PROVE_RCU so that accessors in + * cgroup.h can use them for lockdep annotations. */ #ifdef CONFIG_PROVE_RCU DEFINE_MUTEX(cgroup_mutex); -EXPORT_SYMBOL_GPL(cgroup_mutex); /* only for lockdep */ +DECLARE_RWSEM(css_set_rwsem); +EXPORT_SYMBOL_GPL(cgroup_mutex); +EXPORT_SYMBOL_GPL(css_set_rwsem); #else static DEFINE_MUTEX(cgroup_mutex); +static DECLARE_RWSEM(css_set_rwsem); #endif -static DEFINE_MUTEX(cgroup_root_mutex); +/* + * Protects cgroup_idr and css_idr so that IDs can be released without + * grabbing cgroup_mutex. + */ +static DEFINE_SPINLOCK(cgroup_idr_lock); + +/* + * Protects cgroup_subsys->release_agent_path. Modifying it also requires + * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock. + */ +static DEFINE_SPINLOCK(release_agent_path_lock); #define cgroup_assert_mutex_or_rcu_locked() \ rcu_lockdep_assert(rcu_read_lock_held() || \ lockdep_is_held(&cgroup_mutex), \ "cgroup_mutex or RCU read lock required"); -#ifdef CONFIG_LOCKDEP -#define cgroup_assert_mutex_or_root_locked() \ - WARN_ON_ONCE(debug_locks && (!lockdep_is_held(&cgroup_mutex) && \ - !lockdep_is_held(&cgroup_root_mutex))) -#else -#define cgroup_assert_mutex_or_root_locked() do { } while (0) -#endif - /* * cgroup destruction makes heavy use of work items and there can be a lot * of concurrent destructions. Use a separate workqueue so that cgroup @@ -120,51 +122,56 @@ static struct workqueue_struct *cgroup_destroy_wq; */ static struct workqueue_struct *cgroup_pidlist_destroy_wq; -/* - * Generate an array of cgroup subsystem pointers. At boot time, this is - * populated with the built in subsystems, and modular subsystems are - * registered after that. The mutable section of this array is protected by - * cgroup_mutex. - */ -#define SUBSYS(_x) [_x ## _subsys_id] = &_x ## _subsys, -#define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option) -static struct cgroup_subsys *cgroup_subsys[CGROUP_SUBSYS_COUNT] = { +/* generate an array of cgroup subsystem pointers */ +#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys, +static struct cgroup_subsys *cgroup_subsys[] = { +#include <linux/cgroup_subsys.h> +}; +#undef SUBSYS + +/* array of cgroup subsystem names */ +#define SUBSYS(_x) [_x ## _cgrp_id] = #_x, +static const char *cgroup_subsys_name[] = { #include <linux/cgroup_subsys.h> }; +#undef SUBSYS /* - * The dummy hierarchy, reserved for the subsystems that are otherwise + * The default hierarchy, reserved for the subsystems that are otherwise * unattached - it never has more than a single cgroup, and all tasks are * part of that cgroup. */ -static struct cgroupfs_root cgroup_dummy_root; +struct cgroup_root cgrp_dfl_root; -/* dummy_top is a shorthand for the dummy hierarchy's top cgroup */ -static struct cgroup * const cgroup_dummy_top = &cgroup_dummy_root.top_cgroup; +/* + * The default hierarchy always exists but is hidden until mounted for the + * first time. This is for backward compatibility. + */ +static bool cgrp_dfl_root_visible; + +/* some controllers are not supported in the default hierarchy */ +static const unsigned int cgrp_dfl_root_inhibit_ss_mask = 0 +#ifdef CONFIG_CGROUP_DEBUG + | (1 << debug_cgrp_id) +#endif + ; /* The list of hierarchy roots */ static LIST_HEAD(cgroup_roots); static int cgroup_root_count; -/* - * Hierarchy ID allocation and mapping. It follows the same exclusion - * rules as other root ops - both cgroup_mutex and cgroup_root_mutex for - * writes, either for reads. - */ +/* hierarchy ID allocation and mapping, protected by cgroup_mutex */ static DEFINE_IDR(cgroup_hierarchy_idr); -static struct cgroup_name root_cgroup_name = { .name = "/" }; - /* - * Assign a monotonically increasing serial number to cgroups. It - * guarantees cgroups with bigger numbers are newer than those with smaller - * numbers. Also, as cgroups are always appended to the parent's - * ->children list, it guarantees that sibling cgroups are always sorted in - * the ascending serial number order on the list. Protected by - * cgroup_mutex. + * Assign a monotonically increasing serial number to csses. It guarantees + * cgroups with bigger numbers are newer than those with smaller numbers. + * Also, as csses are always appended to the parent's ->children list, it + * guarantees that sibling csses are always sorted in the ascending serial + * number order on the list. Protected by cgroup_mutex. */ -static u64 cgroup_serial_nr_next = 1; +static u64 css_serial_nr_next = 1; /* This flag indicates whether tasks in the fork and exit paths should * check for fork/exit handlers to call. This avoids us having to do @@ -175,17 +182,61 @@ static int need_forkexit_callback __read_mostly; static struct cftype cgroup_base_files[]; -static void cgroup_destroy_css_killed(struct cgroup *cgrp); +static void cgroup_put(struct cgroup *cgrp); +static int rebind_subsystems(struct cgroup_root *dst_root, + unsigned int ss_mask); static int cgroup_destroy_locked(struct cgroup *cgrp); +static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss); +static void css_release(struct percpu_ref *ref); +static void kill_css(struct cgroup_subsys_state *css); static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], bool is_add); -static int cgroup_file_release(struct inode *inode, struct file *file); static void cgroup_pidlist_destroy_all(struct cgroup *cgrp); +/* IDR wrappers which synchronize using cgroup_idr_lock */ +static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end, + gfp_t gfp_mask) +{ + int ret; + + idr_preload(gfp_mask); + spin_lock_bh(&cgroup_idr_lock); + ret = idr_alloc(idr, ptr, start, end, gfp_mask); + spin_unlock_bh(&cgroup_idr_lock); + idr_preload_end(); + return ret; +} + +static void *cgroup_idr_replace(struct idr *idr, void *ptr, int id) +{ + void *ret; + + spin_lock_bh(&cgroup_idr_lock); + ret = idr_replace(idr, ptr, id); + spin_unlock_bh(&cgroup_idr_lock); + return ret; +} + +static void cgroup_idr_remove(struct idr *idr, int id) +{ + spin_lock_bh(&cgroup_idr_lock); + idr_remove(idr, id); + spin_unlock_bh(&cgroup_idr_lock); +} + +static struct cgroup *cgroup_parent(struct cgroup *cgrp) +{ + struct cgroup_subsys_state *parent_css = cgrp->self.parent; + + if (parent_css) + return container_of(parent_css, struct cgroup, self); + return NULL; +} + /** * cgroup_css - obtain a cgroup's css for the specified subsystem * @cgrp: the cgroup of interest - * @ss: the subsystem of interest (%NULL returns the dummy_css) + * @ss: the subsystem of interest (%NULL returns @cgrp->self) * * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This * function must be called either under cgroup_mutex or rcu_read_lock() and @@ -197,17 +248,65 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, struct cgroup_subsys *ss) { if (ss) - return rcu_dereference_check(cgrp->subsys[ss->subsys_id], - lockdep_is_held(&cgroup_mutex)); + return rcu_dereference_check(cgrp->subsys[ss->id], + lockdep_is_held(&cgroup_mutex)); else - return &cgrp->dummy_css; + return &cgrp->self; +} + +/** + * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem + * @cgrp: the cgroup of interest + * @ss: the subsystem of interest (%NULL returns @cgrp->self) + * + * Similar to cgroup_css() but returns the effctive css, which is defined + * as the matching css of the nearest ancestor including self which has @ss + * enabled. If @ss is associated with the hierarchy @cgrp is on, this + * function is guaranteed to return non-NULL css. + */ +static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, + struct cgroup_subsys *ss) +{ + lockdep_assert_held(&cgroup_mutex); + + if (!ss) + return &cgrp->self; + + if (!(cgrp->root->subsys_mask & (1 << ss->id))) + return NULL; + + while (cgroup_parent(cgrp) && + !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ss->id))) + cgrp = cgroup_parent(cgrp); + + return cgroup_css(cgrp, ss); } /* convenient tests for these bits */ static inline bool cgroup_is_dead(const struct cgroup *cgrp) { - return test_bit(CGRP_DEAD, &cgrp->flags); + return !(cgrp->self.flags & CSS_ONLINE); +} + +struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) +{ + struct cgroup *cgrp = of->kn->parent->priv; + struct cftype *cft = of_cft(of); + + /* + * This is open and unprotected implementation of cgroup_css(). + * seq_css() is only called from a kernfs file operation which has + * an active reference on the file. Because all the subsystem + * files are drained before a css is disassociated with a cgroup, + * the matching css from the cgroup's subsys table is guaranteed to + * be and stay valid until the enclosing operation is complete. + */ + if (cft->ss) + return rcu_dereference_raw(cgrp->subsys[cft->ss->id]); + else + return &cgrp->self; } +EXPORT_SYMBOL_GPL(of_css); /** * cgroup_is_descendant - test ancestry @@ -223,11 +322,10 @@ bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor) while (cgrp) { if (cgrp == ancestor) return true; - cgrp = cgrp->parent; + cgrp = cgroup_parent(cgrp); } return false; } -EXPORT_SYMBOL_GPL(cgroup_is_descendant); static int cgroup_is_releasable(const struct cgroup *cgrp) { @@ -248,7 +346,7 @@ static int notify_on_release(const struct cgroup *cgrp) * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end * @cgrp: the target cgroup to iterate css's of * - * Should be called under cgroup_mutex. + * Should be called under cgroup_[tree_]mutex. */ #define for_each_css(css, ssid, cgrp) \ for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ @@ -258,66 +356,39 @@ static int notify_on_release(const struct cgroup *cgrp) else /** - * for_each_subsys - iterate all loaded cgroup subsystems - * @ss: the iteration cursor - * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end + * for_each_e_css - iterate all effective css's of a cgroup + * @css: the iteration cursor + * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end + * @cgrp: the target cgroup to iterate css's of * - * Iterates through all loaded subsystems. Should be called under - * cgroup_mutex or cgroup_root_mutex. + * Should be called under cgroup_[tree_]mutex. */ -#define for_each_subsys(ss, ssid) \ - for (({ cgroup_assert_mutex_or_root_locked(); (ssid) = 0; }); \ - (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ - if (!((ss) = cgroup_subsys[(ssid)])) { } \ +#define for_each_e_css(css, ssid, cgrp) \ + for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ + if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \ + ; \ else /** - * for_each_builtin_subsys - iterate all built-in cgroup subsystems + * for_each_subsys - iterate all enabled cgroup subsystems * @ss: the iteration cursor - * @i: the index of @ss, CGROUP_BUILTIN_SUBSYS_COUNT after reaching the end - * - * Bulit-in subsystems are always present and iteration itself doesn't - * require any synchronization. + * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end */ -#define for_each_builtin_subsys(ss, i) \ - for ((i) = 0; (i) < CGROUP_BUILTIN_SUBSYS_COUNT && \ - (((ss) = cgroup_subsys[i]) || true); (i)++) +#define for_each_subsys(ss, ssid) \ + for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \ + (((ss) = cgroup_subsys[ssid]) || true); (ssid)++) -/* iterate across the active hierarchies */ -#define for_each_active_root(root) \ +/* iterate across the hierarchies */ +#define for_each_root(root) \ list_for_each_entry((root), &cgroup_roots, root_list) -static inline struct cgroup *__d_cgrp(struct dentry *dentry) -{ - return dentry->d_fsdata; -} - -static inline struct cfent *__d_cfe(struct dentry *dentry) -{ - return dentry->d_fsdata; -} - -static inline struct cftype *__d_cft(struct dentry *dentry) -{ - return __d_cfe(dentry)->type; -} - -/** - * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. - * @cgrp: the cgroup to be checked for liveness - * - * On success, returns true; the mutex should be later unlocked. On - * failure returns false with no lock held. - */ -static bool cgroup_lock_live_group(struct cgroup *cgrp) -{ - mutex_lock(&cgroup_mutex); - if (cgroup_is_dead(cgrp)) { - mutex_unlock(&cgroup_mutex); - return false; - } - return true; -} +/* iterate over child cgrps, lock should be held throughout iteration */ +#define cgroup_for_each_live_child(child, cgrp) \ + list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \ + if (({ lockdep_assert_held(&cgroup_mutex); \ + cgroup_is_dead(child); })) \ + ; \ + else /* the list of cgroups eligible for automatic release. Protected by * release_list_lock */ @@ -347,23 +418,60 @@ struct cgrp_cset_link { struct list_head cgrp_link; }; -/* The default css_set - used by init and its children prior to any +/* + * The default css_set - used by init and its children prior to any * hierarchies being mounted. It contains a pointer to the root state * for each subsystem. Also used to anchor the list of css_sets. Not * reference-counted, to improve performance when child cgroups * haven't been created. */ +struct css_set init_css_set = { + .refcount = ATOMIC_INIT(1), + .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links), + .tasks = LIST_HEAD_INIT(init_css_set.tasks), + .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks), + .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node), + .mg_node = LIST_HEAD_INIT(init_css_set.mg_node), +}; -static struct css_set init_css_set; -static struct cgrp_cset_link init_cgrp_cset_link; +static int css_set_count = 1; /* 1 for init_css_set */ -/* - * css_set_lock protects the list of css_set objects, and the chain of - * tasks off each css_set. Nests outside task->alloc_lock due to - * css_task_iter_start(). +/** + * cgroup_update_populated - updated populated count of a cgroup + * @cgrp: the target cgroup + * @populated: inc or dec populated count + * + * @cgrp is either getting the first task (css_set) or losing the last. + * Update @cgrp->populated_cnt accordingly. The count is propagated + * towards root so that a given cgroup's populated_cnt is zero iff the + * cgroup and all its descendants are empty. + * + * @cgrp's interface file "cgroup.populated" is zero if + * @cgrp->populated_cnt is zero and 1 otherwise. When @cgrp->populated_cnt + * changes from or to zero, userland is notified that the content of the + * interface file has changed. This can be used to detect when @cgrp and + * its descendants become populated or empty. */ -static DEFINE_RWLOCK(css_set_lock); -static int css_set_count; +static void cgroup_update_populated(struct cgroup *cgrp, bool populated) +{ + lockdep_assert_held(&css_set_rwsem); + + do { + bool trigger; + + if (populated) + trigger = !cgrp->populated_cnt++; + else + trigger = !--cgrp->populated_cnt; + + if (!trigger) + break; + + if (cgrp->populated_kn) + kernfs_notify(cgrp->populated_kn); + cgrp = cgroup_parent(cgrp); + } while (cgrp); +} /* * hash table for cgroup groups. This improves the performance to find @@ -386,32 +494,20 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[]) return key; } -/* - * We don't maintain the lists running through each css_set to its task - * until after the first call to css_task_iter_start(). This reduces the - * fork()/exit() overhead for people who have cgroups compiled into their - * kernel but not actually in use. - */ -static int use_task_css_set_links __read_mostly; - -static void __put_css_set(struct css_set *cset, int taskexit) +static void put_css_set_locked(struct css_set *cset, bool taskexit) { struct cgrp_cset_link *link, *tmp_link; + struct cgroup_subsys *ss; + int ssid; - /* - * Ensure that the refcount doesn't hit zero while any readers - * can see it. Similar to atomic_dec_and_lock(), but for an - * rwlock - */ - if (atomic_add_unless(&cset->refcount, -1, 1)) - return; - write_lock(&css_set_lock); - if (!atomic_dec_and_test(&cset->refcount)) { - write_unlock(&css_set_lock); + lockdep_assert_held(&css_set_rwsem); + + if (!atomic_dec_and_test(&cset->refcount)) return; - } /* This css_set is dead. unlink it and release cgroup refcounts */ + for_each_subsys(ss, ssid) + list_del(&cset->e_cset_node[ssid]); hash_del(&cset->hlist); css_set_count--; @@ -421,20 +517,37 @@ static void __put_css_set(struct css_set *cset, int taskexit) list_del(&link->cset_link); list_del(&link->cgrp_link); - /* @cgrp can't go away while we're holding css_set_lock */ - if (list_empty(&cgrp->cset_links) && notify_on_release(cgrp)) { - if (taskexit) - set_bit(CGRP_RELEASABLE, &cgrp->flags); - check_for_release(cgrp); + /* @cgrp can't go away while we're holding css_set_rwsem */ + if (list_empty(&cgrp->cset_links)) { + cgroup_update_populated(cgrp, false); + if (notify_on_release(cgrp)) { + if (taskexit) + set_bit(CGRP_RELEASABLE, &cgrp->flags); + check_for_release(cgrp); + } } kfree(link); } - write_unlock(&css_set_lock); kfree_rcu(cset, rcu_head); } +static void put_css_set(struct css_set *cset, bool taskexit) +{ + /* + * Ensure that the refcount doesn't hit zero while any readers + * can see it. Similar to atomic_dec_and_lock(), but for an + * rwlock + */ + if (atomic_add_unless(&cset->refcount, -1, 1)) + return; + + down_write(&css_set_rwsem); + put_css_set_locked(cset, taskexit); + up_write(&css_set_rwsem); +} + /* * refcounted get/put for css_set objects */ @@ -443,16 +556,6 @@ static inline void get_css_set(struct css_set *cset) atomic_inc(&cset->refcount); } -static inline void put_css_set(struct css_set *cset) -{ - __put_css_set(cset, 0); -} - -static inline void put_css_set_taskexit(struct css_set *cset) -{ - __put_css_set(cset, 1); -} - /** * compare_css_sets - helper function for find_existing_css_set(). * @cset: candidate css_set being tested @@ -470,20 +573,20 @@ static bool compare_css_sets(struct css_set *cset, { struct list_head *l1, *l2; - if (memcmp(template, cset->subsys, sizeof(cset->subsys))) { - /* Not all subsystems matched */ + /* + * On the default hierarchy, there can be csets which are + * associated with the same set of cgroups but different csses. + * Let's first ensure that csses match. + */ + if (memcmp(template, cset->subsys, sizeof(cset->subsys))) return false; - } /* * Compare cgroup pointers in order to distinguish between - * different cgroups in heirarchies with no subsystems. We - * could get by with just this check alone (and skip the - * memcmp above) but on most setups the memcmp check will - * avoid the need for this more expensive check on almost all - * candidates. + * different cgroups in hierarchies. As different cgroups may + * share the same effective css, this comparison is always + * necessary. */ - l1 = &cset->cgrp_links; l2 = &old_cset->cgrp_links; while (1) { @@ -535,7 +638,7 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset, struct cgroup *cgrp, struct cgroup_subsys_state *template[]) { - struct cgroupfs_root *root = cgrp->root; + struct cgroup_root *root = cgrp->root; struct cgroup_subsys *ss; struct css_set *cset; unsigned long key; @@ -548,13 +651,16 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset, */ for_each_subsys(ss, i) { if (root->subsys_mask & (1UL << i)) { - /* Subsystem is in this hierarchy. So we want - * the subsystem state from the new - * cgroup */ - template[i] = cgroup_css(cgrp, ss); + /* + * @ss is in this hierarchy, so we want the + * effective css from @cgrp. + */ + template[i] = cgroup_e_css(cgrp, ss); } else { - /* Subsystem is not in this hierarchy, so we - * don't want to change the subsystem state */ + /* + * @ss is not in this hierarchy, so we don't want + * to change the css. + */ template[i] = old_cset->subsys[i]; } } @@ -620,10 +726,18 @@ static void link_css_set(struct list_head *tmp_links, struct css_set *cset, struct cgrp_cset_link *link; BUG_ON(list_empty(tmp_links)); + + if (cgroup_on_dfl(cgrp)) + cset->dfl_cgrp = cgrp; + link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link); link->cset = cset; link->cgrp = cgrp; + + if (list_empty(&cgrp->cset_links)) + cgroup_update_populated(cgrp, true); list_move(&link->cset_link, &cgrp->cset_links); + /* * Always add links to the tail of the list so that the list * is sorted by order of hierarchy creation @@ -646,17 +760,19 @@ static struct css_set *find_css_set(struct css_set *old_cset, struct css_set *cset; struct list_head tmp_links; struct cgrp_cset_link *link; + struct cgroup_subsys *ss; unsigned long key; + int ssid; lockdep_assert_held(&cgroup_mutex); /* First see if we already have a cgroup group that matches * the desired set */ - read_lock(&css_set_lock); + down_read(&css_set_rwsem); cset = find_existing_css_set(old_cset, cgrp, template); if (cset) get_css_set(cset); - read_unlock(&css_set_lock); + up_read(&css_set_rwsem); if (cset) return cset; @@ -674,13 +790,16 @@ static struct css_set *find_css_set(struct css_set *old_cset, atomic_set(&cset->refcount, 1); INIT_LIST_HEAD(&cset->cgrp_links); INIT_LIST_HEAD(&cset->tasks); + INIT_LIST_HEAD(&cset->mg_tasks); + INIT_LIST_HEAD(&cset->mg_preload_node); + INIT_LIST_HEAD(&cset->mg_node); INIT_HLIST_NODE(&cset->hlist); /* Copy the set of subsystem state objects generated in * find_existing_css_set() */ memcpy(cset->subsys, template, sizeof(cset->subsys)); - write_lock(&css_set_lock); + down_write(&css_set_rwsem); /* Add reference counts and links from the new css_set. */ list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) { struct cgroup *c = link->cgrp; @@ -694,35 +813,111 @@ static struct css_set *find_css_set(struct css_set *old_cset, css_set_count++; - /* Add this cgroup group to the hash table */ + /* Add @cset to the hash table */ key = css_set_hash(cset->subsys); hash_add(css_set_table, &cset->hlist, key); - write_unlock(&css_set_lock); + for_each_subsys(ss, ssid) + list_add_tail(&cset->e_cset_node[ssid], + &cset->subsys[ssid]->cgroup->e_csets[ssid]); + + up_write(&css_set_rwsem); return cset; } -/* - * Return the cgroup for "task" from the given hierarchy. Must be - * called with cgroup_mutex held. - */ -static struct cgroup *task_cgroup_from_root(struct task_struct *task, - struct cgroupfs_root *root) +static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root) { - struct css_set *cset; - struct cgroup *res = NULL; + struct cgroup *root_cgrp = kf_root->kn->priv; + + return root_cgrp->root; +} + +static int cgroup_init_root_id(struct cgroup_root *root) +{ + int id; + + lockdep_assert_held(&cgroup_mutex); + + id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, 0, 0, GFP_KERNEL); + if (id < 0) + return id; + + root->hierarchy_id = id; + return 0; +} + +static void cgroup_exit_root_id(struct cgroup_root *root) +{ + lockdep_assert_held(&cgroup_mutex); + + if (root->hierarchy_id) { + idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id); + root->hierarchy_id = 0; + } +} + +static void cgroup_free_root(struct cgroup_root *root) +{ + if (root) { + /* hierarhcy ID shoulid already have been released */ + WARN_ON_ONCE(root->hierarchy_id); + + idr_destroy(&root->cgroup_idr); + kfree(root); + } +} + +static void cgroup_destroy_root(struct cgroup_root *root) +{ + struct cgroup *cgrp = &root->cgrp; + struct cgrp_cset_link *link, *tmp_link; + + mutex_lock(&cgroup_mutex); + + BUG_ON(atomic_read(&root->nr_cgrps)); + BUG_ON(!list_empty(&cgrp->self.children)); + + /* Rebind all subsystems back to the default hierarchy */ + rebind_subsystems(&cgrp_dfl_root, root->subsys_mask); - BUG_ON(!mutex_is_locked(&cgroup_mutex)); - read_lock(&css_set_lock); /* - * No need to lock the task - since we hold cgroup_mutex the - * task can't change groups, so the only thing that can happen - * is that it exits and its css is set back to init_css_set. + * Release all the links from cset_links to this hierarchy's + * root cgroup */ - cset = task_css_set(task); + down_write(&css_set_rwsem); + + list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) { + list_del(&link->cset_link); + list_del(&link->cgrp_link); + kfree(link); + } + up_write(&css_set_rwsem); + + if (!list_empty(&root->root_list)) { + list_del(&root->root_list); + cgroup_root_count--; + } + + cgroup_exit_root_id(root); + + mutex_unlock(&cgroup_mutex); + + kernfs_destroy_root(root->kf_root); + cgroup_free_root(root); +} + +/* look up cgroup associated with given css_set on the specified hierarchy */ +static struct cgroup *cset_cgroup_from_root(struct css_set *cset, + struct cgroup_root *root) +{ + struct cgroup *res = NULL; + + lockdep_assert_held(&cgroup_mutex); + lockdep_assert_held(&css_set_rwsem); + if (cset == &init_css_set) { - res = &root->top_cgroup; + res = &root->cgrp; } else { struct cgrp_cset_link *link; @@ -735,16 +930,27 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, } } } - read_unlock(&css_set_lock); + BUG_ON(!res); return res; } /* - * There is one global cgroup mutex. We also require taking - * task_lock() when dereferencing a task's cgroup subsys pointers. - * See "The task_lock() exception", at the end of this comment. - * + * Return the cgroup for "task" from the given hierarchy. Must be + * called with cgroup_mutex and css_set_rwsem held. + */ +static struct cgroup *task_cgroup_from_root(struct task_struct *task, + struct cgroup_root *root) +{ + /* + * No need to lock the task - since we hold cgroup_mutex the + * task can't change groups, so the only thing that can happen + * is that it exits and its css is set back to init_css_set. + */ + return cset_cgroup_from_root(task_css_set(task), root); +} + +/* * A task must hold cgroup_mutex to modify cgroups. * * Any task can increment and decrement the count field without lock. @@ -770,173 +976,139 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, * A cgroup can only be deleted if both its 'count' of using tasks * is zero, and its list of 'children' cgroups is empty. Since all * tasks in the system use _some_ cgroup, and since there is always at - * least one task in the system (init, pid == 1), therefore, top_cgroup + * least one task in the system (init, pid == 1), therefore, root cgroup * always has either children cgroups and/or using tasks. So we don't - * need a special hack to ensure that top_cgroup cannot be deleted. - * - * The task_lock() exception - * - * The need for this exception arises from the action of - * cgroup_attach_task(), which overwrites one task's cgroup pointer with - * another. It does so using cgroup_mutex, however there are - * several performance critical places that need to reference - * task->cgroup without the expense of grabbing a system global - * mutex. Therefore except as noted below, when dereferencing or, as - * in cgroup_attach_task(), modifying a task's cgroup pointer we use - * task_lock(), which acts on a spinlock (task->alloc_lock) already in - * the task_struct routinely used for such matters. + * need a special hack to ensure that root cgroup cannot be deleted. * * P.S. One more locking exception. RCU is used to guard the * update of a tasks cgroup pointer by cgroup_attach_task() */ -/* - * A couple of forward declarations required, due to cyclic reference loop: - * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir -> - * cgroup_add_file -> cgroup_create_file -> cgroup_dir_inode_operations - * -> cgroup_mkdir. - */ - -static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); -static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); -static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask); -static const struct inode_operations cgroup_dir_inode_operations; +static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask); +static struct kernfs_syscall_ops cgroup_kf_syscall_ops; static const struct file_operations proc_cgroupstats_operations; -static struct backing_dev_info cgroup_backing_dev_info = { - .name = "cgroup", - .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, -}; - -static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb) +static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, + char *buf) { - struct inode *inode = new_inode(sb); - - if (inode) { - inode->i_ino = get_next_ino(); - inode->i_mode = mode; - inode->i_uid = current_fsuid(); - inode->i_gid = current_fsgid(); - inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; - inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info; - } - return inode; -} - -static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry) -{ - struct cgroup_name *name; - - name = kmalloc(sizeof(*name) + dentry->d_name.len + 1, GFP_KERNEL); - if (!name) - return NULL; - strcpy(name->name, dentry->d_name.name); - return name; + if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) && + !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) + snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s", + cft->ss->name, cft->name); + else + strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX); + return buf; } -static void cgroup_free_fn(struct work_struct *work) +/** + * cgroup_file_mode - deduce file mode of a control file + * @cft: the control file in question + * + * returns cft->mode if ->mode is not 0 + * returns S_IRUGO|S_IWUSR if it has both a read and a write handler + * returns S_IRUGO if it has only a read handler + * returns S_IWUSR if it has only a write hander + */ +static umode_t cgroup_file_mode(const struct cftype *cft) { - struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); - - mutex_lock(&cgroup_mutex); - cgrp->root->number_of_cgroups--; - mutex_unlock(&cgroup_mutex); - - /* - * We get a ref to the parent's dentry, and put the ref when - * this cgroup is being freed, so it's guaranteed that the - * parent won't be destroyed before its children. - */ - dput(cgrp->parent->dentry); + umode_t mode = 0; - /* - * Drop the active superblock reference that we took when we - * created the cgroup. This will free cgrp->root, if we are - * holding the last reference to @sb. - */ - deactivate_super(cgrp->root->sb); + if (cft->mode) + return cft->mode; - cgroup_pidlist_destroy_all(cgrp); + if (cft->read_u64 || cft->read_s64 || cft->seq_show) + mode |= S_IRUGO; - simple_xattrs_free(&cgrp->xattrs); + if (cft->write_u64 || cft->write_s64 || cft->write) + mode |= S_IWUSR; - kfree(rcu_dereference_raw(cgrp->name)); - kfree(cgrp); + return mode; } -static void cgroup_free_rcu(struct rcu_head *head) +static void cgroup_get(struct cgroup *cgrp) { - struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head); - - INIT_WORK(&cgrp->destroy_work, cgroup_free_fn); - queue_work(cgroup_destroy_wq, &cgrp->destroy_work); + WARN_ON_ONCE(cgroup_is_dead(cgrp)); + css_get(&cgrp->self); } -static void cgroup_diput(struct dentry *dentry, struct inode *inode) +static void cgroup_put(struct cgroup *cgrp) { - /* is dentry a directory ? if so, kfree() associated cgroup */ - if (S_ISDIR(inode->i_mode)) { - struct cgroup *cgrp = dentry->d_fsdata; - - BUG_ON(!(cgroup_is_dead(cgrp))); - - /* - * XXX: cgrp->id is only used to look up css's. As cgroup - * and css's lifetimes will be decoupled, it should be made - * per-subsystem and moved to css->id so that lookups are - * successful until the target css is released. - */ - idr_remove(&cgrp->root->cgroup_idr, cgrp->id); - cgrp->id = -1; - - call_rcu(&cgrp->rcu_head, cgroup_free_rcu); - } else { - struct cfent *cfe = __d_cfe(dentry); - struct cgroup *cgrp = dentry->d_parent->d_fsdata; - - WARN_ONCE(!list_empty(&cfe->node) && - cgrp != &cgrp->root->top_cgroup, - "cfe still linked for %s\n", cfe->type->name); - simple_xattrs_free(&cfe->xattrs); - kfree(cfe); - } - iput(inode); + css_put(&cgrp->self); } -static void remove_dir(struct dentry *d) +/** + * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods + * @kn: the kernfs_node being serviced + * + * This helper undoes cgroup_kn_lock_live() and should be invoked before + * the method finishes if locking succeeded. Note that once this function + * returns the cgroup returned by cgroup_kn_lock_live() may become + * inaccessible any time. If the caller intends to continue to access the + * cgroup, it should pin it before invoking this function. + */ +static void cgroup_kn_unlock(struct kernfs_node *kn) { - struct dentry *parent = dget(d->d_parent); + struct cgroup *cgrp; - d_delete(d); - simple_rmdir(parent->d_inode, d); - dput(parent); + if (kernfs_type(kn) == KERNFS_DIR) + cgrp = kn->priv; + else + cgrp = kn->parent->priv; + + mutex_unlock(&cgroup_mutex); + + kernfs_unbreak_active_protection(kn); + cgroup_put(cgrp); } -static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) +/** + * cgroup_kn_lock_live - locking helper for cgroup kernfs methods + * @kn: the kernfs_node being serviced + * + * This helper is to be used by a cgroup kernfs method currently servicing + * @kn. It breaks the active protection, performs cgroup locking and + * verifies that the associated cgroup is alive. Returns the cgroup if + * alive; otherwise, %NULL. A successful return should be undone by a + * matching cgroup_kn_unlock() invocation. + * + * Any cgroup kernfs method implementation which requires locking the + * associated cgroup should use this helper. It avoids nesting cgroup + * locking under kernfs active protection and allows all kernfs operations + * including self-removal. + */ +static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn) { - struct cfent *cfe; + struct cgroup *cgrp; - lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); - lockdep_assert_held(&cgroup_mutex); + if (kernfs_type(kn) == KERNFS_DIR) + cgrp = kn->priv; + else + cgrp = kn->parent->priv; /* - * If we're doing cleanup due to failure of cgroup_create(), - * the corresponding @cfe may not exist. + * We're gonna grab cgroup_mutex which nests outside kernfs + * active_ref. cgroup liveliness check alone provides enough + * protection against removal. Ensure @cgrp stays accessible and + * break the active_ref protection. */ - list_for_each_entry(cfe, &cgrp->files, node) { - struct dentry *d = cfe->dentry; + cgroup_get(cgrp); + kernfs_break_active_protection(kn); - if (cft && cfe->type != cft) - continue; + mutex_lock(&cgroup_mutex); - dget(d); - d_delete(d); - simple_unlink(cgrp->dentry->d_inode, d); - list_del_init(&cfe->node); - dput(d); + if (!cgroup_is_dead(cgrp)) + return cgrp; - break; - } + cgroup_kn_unlock(kn); + return NULL; +} + +static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) +{ + char name[CGROUP_FILE_NAME_MAX]; + + lockdep_assert_held(&cgroup_mutex); + kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name)); } /** @@ -944,148 +1116,120 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) * @cgrp: target cgroup * @subsys_mask: mask of the subsystem ids whose files should be removed */ -static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask) +static void cgroup_clear_dir(struct cgroup *cgrp, unsigned int subsys_mask) { struct cgroup_subsys *ss; int i; for_each_subsys(ss, i) { - struct cftype_set *set; + struct cftype *cfts; - if (!test_bit(i, &subsys_mask)) + if (!(subsys_mask & (1 << i))) continue; - list_for_each_entry(set, &ss->cftsets, node) - cgroup_addrm_files(cgrp, set->cfts, false); + list_for_each_entry(cfts, &ss->cfts, node) + cgroup_addrm_files(cgrp, cfts, false); } } -/* - * NOTE : the dentry must have been dget()'ed - */ -static void cgroup_d_remove_dir(struct dentry *dentry) +static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask) { - struct dentry *parent; - - parent = dentry->d_parent; - spin_lock(&parent->d_lock); - spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); - list_del_init(&dentry->d_u.d_child); - spin_unlock(&dentry->d_lock); - spin_unlock(&parent->d_lock); - remove_dir(dentry); -} - -/* - * Call with cgroup_mutex held. Drops reference counts on modules, including - * any duplicate ones that parse_cgroupfs_options took. If this function - * returns an error, no reference counts are touched. - */ -static int rebind_subsystems(struct cgroupfs_root *root, - unsigned long added_mask, unsigned removed_mask) -{ - struct cgroup *cgrp = &root->top_cgroup; struct cgroup_subsys *ss; - unsigned long pinned = 0; - int i, ret; + unsigned int tmp_ss_mask; + int ssid, i, ret; - BUG_ON(!mutex_is_locked(&cgroup_mutex)); - BUG_ON(!mutex_is_locked(&cgroup_root_mutex)); + lockdep_assert_held(&cgroup_mutex); - /* Check that any added subsystems are currently free */ - for_each_subsys(ss, i) { - if (!(added_mask & (1 << i))) + for_each_subsys(ss, ssid) { + if (!(ss_mask & (1 << ssid))) continue; - /* is the subsystem mounted elsewhere? */ - if (ss->root != &cgroup_dummy_root) { - ret = -EBUSY; - goto out_put; - } + /* if @ss has non-root csses attached to it, can't move */ + if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss))) + return -EBUSY; - /* pin the module */ - if (!try_module_get(ss->module)) { - ret = -ENOENT; - goto out_put; - } - pinned |= 1 << i; + /* can't move between two non-dummy roots either */ + if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root) + return -EBUSY; } - /* subsys could be missing if unloaded between parsing and here */ - if (added_mask != pinned) { - ret = -ENOENT; - goto out_put; - } + /* skip creating root files on dfl_root for inhibited subsystems */ + tmp_ss_mask = ss_mask; + if (dst_root == &cgrp_dfl_root) + tmp_ss_mask &= ~cgrp_dfl_root_inhibit_ss_mask; - ret = cgroup_populate_dir(cgrp, added_mask); - if (ret) - goto out_put; + ret = cgroup_populate_dir(&dst_root->cgrp, tmp_ss_mask); + if (ret) { + if (dst_root != &cgrp_dfl_root) + return ret; + + /* + * Rebinding back to the default root is not allowed to + * fail. Using both default and non-default roots should + * be rare. Moving subsystems back and forth even more so. + * Just warn about it and continue. + */ + if (cgrp_dfl_root_visible) { + pr_warn("failed to create files (%d) while rebinding 0x%x to default root\n", + ret, ss_mask); + pr_warn("you may retry by moving them to a different hierarchy and unbinding\n"); + } + } /* * Nothing can fail from this point on. Remove files for the * removed subsystems and rebind each subsystem. */ - cgroup_clear_dir(cgrp, removed_mask); + for_each_subsys(ss, ssid) + if (ss_mask & (1 << ssid)) + cgroup_clear_dir(&ss->root->cgrp, 1 << ssid); - for_each_subsys(ss, i) { - unsigned long bit = 1UL << i; + for_each_subsys(ss, ssid) { + struct cgroup_root *src_root; + struct cgroup_subsys_state *css; + struct css_set *cset; - if (bit & added_mask) { - /* We're binding this subsystem to this hierarchy */ - BUG_ON(cgroup_css(cgrp, ss)); - BUG_ON(!cgroup_css(cgroup_dummy_top, ss)); - BUG_ON(cgroup_css(cgroup_dummy_top, ss)->cgroup != cgroup_dummy_top); + if (!(ss_mask & (1 << ssid))) + continue; - rcu_assign_pointer(cgrp->subsys[i], - cgroup_css(cgroup_dummy_top, ss)); - cgroup_css(cgrp, ss)->cgroup = cgrp; + src_root = ss->root; + css = cgroup_css(&src_root->cgrp, ss); - ss->root = root; - if (ss->bind) - ss->bind(cgroup_css(cgrp, ss)); + WARN_ON(!css || cgroup_css(&dst_root->cgrp, ss)); - /* refcount was already taken, and we're keeping it */ - root->subsys_mask |= bit; - } else if (bit & removed_mask) { - /* We're removing this subsystem */ - BUG_ON(cgroup_css(cgrp, ss) != cgroup_css(cgroup_dummy_top, ss)); - BUG_ON(cgroup_css(cgrp, ss)->cgroup != cgrp); + RCU_INIT_POINTER(src_root->cgrp.subsys[ssid], NULL); + rcu_assign_pointer(dst_root->cgrp.subsys[ssid], css); + ss->root = dst_root; + css->cgroup = &dst_root->cgrp; - if (ss->bind) - ss->bind(cgroup_css(cgroup_dummy_top, ss)); + down_write(&css_set_rwsem); + hash_for_each(css_set_table, i, cset, hlist) + list_move_tail(&cset->e_cset_node[ss->id], + &dst_root->cgrp.e_csets[ss->id]); + up_write(&css_set_rwsem); - cgroup_css(cgroup_dummy_top, ss)->cgroup = cgroup_dummy_top; - RCU_INIT_POINTER(cgrp->subsys[i], NULL); + src_root->subsys_mask &= ~(1 << ssid); + src_root->cgrp.child_subsys_mask &= ~(1 << ssid); - cgroup_subsys[i]->root = &cgroup_dummy_root; + /* default hierarchy doesn't enable controllers by default */ + dst_root->subsys_mask |= 1 << ssid; + if (dst_root != &cgrp_dfl_root) + dst_root->cgrp.child_subsys_mask |= 1 << ssid; - /* subsystem is now free - drop reference on module */ - module_put(ss->module); - root->subsys_mask &= ~bit; - } + if (ss->bind) + ss->bind(css); } - /* - * Mark @root has finished binding subsystems. @root->subsys_mask - * now matches the bound subsystems. - */ - root->flags |= CGRP_ROOT_SUBSYS_BOUND; - + kernfs_activate(dst_root->cgrp.kn); return 0; - -out_put: - for_each_subsys(ss, i) - if (pinned & (1 << i)) - module_put(ss->module); - return ret; } -static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) +static int cgroup_show_options(struct seq_file *seq, + struct kernfs_root *kf_root) { - struct cgroupfs_root *root = dentry->d_sb->s_fs_info; + struct cgroup_root *root = cgroup_root_from_kf(kf_root); struct cgroup_subsys *ss; int ssid; - mutex_lock(&cgroup_root_mutex); for_each_subsys(ss, ssid) if (root->subsys_mask & (1 << ssid)) seq_printf(seq, ",%s", ss->name); @@ -1095,47 +1239,39 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) seq_puts(seq, ",noprefix"); if (root->flags & CGRP_ROOT_XATTR) seq_puts(seq, ",xattr"); + + spin_lock(&release_agent_path_lock); if (strlen(root->release_agent_path)) seq_printf(seq, ",release_agent=%s", root->release_agent_path); - if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags)) + spin_unlock(&release_agent_path_lock); + + if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags)) seq_puts(seq, ",clone_children"); if (strlen(root->name)) seq_printf(seq, ",name=%s", root->name); - mutex_unlock(&cgroup_root_mutex); return 0; } struct cgroup_sb_opts { - unsigned long subsys_mask; - unsigned long flags; + unsigned int subsys_mask; + unsigned int flags; char *release_agent; bool cpuset_clone_children; char *name; /* User explicitly requested empty subsystem */ bool none; - - struct cgroupfs_root *new_root; - }; -/* - * Convert a hierarchy specifier into a bitmask of subsystems and - * flags. Call with cgroup_mutex held to protect the cgroup_subsys[] - * array. This function takes refcounts on subsystems to be used, unless it - * returns error, in which case no refcounts are taken. - */ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) { char *token, *o = data; bool all_ss = false, one_ss = false; - unsigned long mask = (unsigned long)-1; + unsigned int mask = -1U; struct cgroup_subsys *ss; int i; - BUG_ON(!mutex_is_locked(&cgroup_mutex)); - #ifdef CONFIG_CPUSETS - mask = ~(1UL << cpuset_subsys_id); + mask = ~(1U << cpuset_cgrp_id); #endif memset(opts, 0, sizeof(*opts)); @@ -1216,7 +1352,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) /* Mutually exclusive option 'all' + subsystem name */ if (all_ss) return -EINVAL; - set_bit(i, &opts->subsys_mask); + opts->subsys_mask |= (1 << i); one_ss = true; break; @@ -1225,30 +1361,34 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) return -ENOENT; } - /* - * If the 'all' option was specified select all the subsystems, - * otherwise if 'none', 'name=' and a subsystem name options - * were not specified, let's default to 'all' - */ - if (all_ss || (!one_ss && !opts->none && !opts->name)) - for_each_subsys(ss, i) - if (!ss->disabled) - set_bit(i, &opts->subsys_mask); - /* Consistency checks */ if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) { - pr_warning("cgroup: sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n"); + pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n"); - if (opts->flags & CGRP_ROOT_NOPREFIX) { - pr_err("cgroup: sane_behavior: noprefix is not allowed\n"); + if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) || + opts->cpuset_clone_children || opts->release_agent || + opts->name) { + pr_err("sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n"); return -EINVAL; } + } else { + /* + * If the 'all' option was specified select all the + * subsystems, otherwise if 'none', 'name=' and a subsystem + * name options were not specified, let's default to 'all' + */ + if (all_ss || (!one_ss && !opts->none && !opts->name)) + for_each_subsys(ss, i) + if (!ss->disabled) + opts->subsys_mask |= (1 << i); - if (opts->cpuset_clone_children) { - pr_err("cgroup: sane_behavior: clone_children is not allowed\n"); + /* + * We either have to specify by name or by subsystems. (So + * all empty hierarchies must have a name). + */ + if (!opts->subsys_mask && !opts->name) return -EINVAL; - } } /* @@ -1264,32 +1404,22 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) if (opts->subsys_mask && opts->none) return -EINVAL; - /* - * We either have to specify by name or by subsystems. (So all - * empty hierarchies must have a name). - */ - if (!opts->subsys_mask && !opts->name) - return -EINVAL; - return 0; } -static int cgroup_remount(struct super_block *sb, int *flags, char *data) +static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) { int ret = 0; - struct cgroupfs_root *root = sb->s_fs_info; - struct cgroup *cgrp = &root->top_cgroup; + struct cgroup_root *root = cgroup_root_from_kf(kf_root); struct cgroup_sb_opts opts; - unsigned long added_mask, removed_mask; + unsigned int added_mask, removed_mask; if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) { - pr_err("cgroup: sane_behavior: remount is not allowed\n"); + pr_err("sane_behavior: remount is not allowed\n"); return -EINVAL; } - mutex_lock(&cgrp->dentry->d_inode->i_mutex); mutex_lock(&cgroup_mutex); - mutex_lock(&cgroup_root_mutex); /* See what subsystems are wanted */ ret = parse_cgroupfs_options(data, &opts); @@ -1297,8 +1427,8 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) goto out_unlock; if (opts.subsys_mask != root->subsys_mask || opts.release_agent) - pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", - task_tgid_nr(current), current->comm); + pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n", + task_tgid_nr(current), current->comm); added_mask = opts.subsys_mask & ~root->subsys_mask; removed_mask = root->subsys_mask & ~opts.subsys_mask; @@ -1306,7 +1436,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) /* Don't allow flags or name to change at remount */ if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) || (opts.name && strcmp(opts.name, root->name))) { - pr_err("cgroup: option or name mismatch, new: 0x%lx \"%s\", old: 0x%lx \"%s\"\n", + pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n", opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "", root->flags & CGRP_ROOT_OPTION_MASK, root->name); ret = -EINVAL; @@ -1314,422 +1444,396 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) } /* remounting is not allowed for populated hierarchies */ - if (root->number_of_cgroups > 1) { + if (!list_empty(&root->cgrp.self.children)) { ret = -EBUSY; goto out_unlock; } - ret = rebind_subsystems(root, added_mask, removed_mask); + ret = rebind_subsystems(root, added_mask); if (ret) goto out_unlock; - if (opts.release_agent) + rebind_subsystems(&cgrp_dfl_root, removed_mask); + + if (opts.release_agent) { + spin_lock(&release_agent_path_lock); strcpy(root->release_agent_path, opts.release_agent); + spin_unlock(&release_agent_path_lock); + } out_unlock: kfree(opts.release_agent); kfree(opts.name); - mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgrp->dentry->d_inode->i_mutex); return ret; } -static const struct super_operations cgroup_ops = { - .statfs = simple_statfs, - .drop_inode = generic_delete_inode, - .show_options = cgroup_show_options, - .remount_fs = cgroup_remount, -}; - -static void init_cgroup_housekeeping(struct cgroup *cgrp) -{ - INIT_LIST_HEAD(&cgrp->sibling); - INIT_LIST_HEAD(&cgrp->children); - INIT_LIST_HEAD(&cgrp->files); - INIT_LIST_HEAD(&cgrp->cset_links); - INIT_LIST_HEAD(&cgrp->release_list); - INIT_LIST_HEAD(&cgrp->pidlists); - mutex_init(&cgrp->pidlist_mutex); - cgrp->dummy_css.cgroup = cgrp; - simple_xattrs_init(&cgrp->xattrs); -} +/* + * To reduce the fork() overhead for systems that are not actually using + * their cgroups capability, we don't maintain the lists running through + * each css_set to its tasks until we see the list actually used - in other + * words after the first mount. + */ +static bool use_task_css_set_links __read_mostly; -static void init_cgroup_root(struct cgroupfs_root *root) +static void cgroup_enable_task_cg_lists(void) { - struct cgroup *cgrp = &root->top_cgroup; - - INIT_LIST_HEAD(&root->root_list); - root->number_of_cgroups = 1; - cgrp->root = root; - RCU_INIT_POINTER(cgrp->name, &root_cgroup_name); - init_cgroup_housekeeping(cgrp); - idr_init(&root->cgroup_idr); -} + struct task_struct *p, *g; -static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end) -{ - int id; + down_write(&css_set_rwsem); - lockdep_assert_held(&cgroup_mutex); - lockdep_assert_held(&cgroup_root_mutex); + if (use_task_css_set_links) + goto out_unlock; - id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, start, end, - GFP_KERNEL); - if (id < 0) - return id; + use_task_css_set_links = true; - root->hierarchy_id = id; - return 0; -} + /* + * We need tasklist_lock because RCU is not safe against + * while_each_thread(). Besides, a forking task that has passed + * cgroup_post_fork() without seeing use_task_css_set_links = 1 + * is not guaranteed to have its child immediately visible in the + * tasklist if we walk through it with RCU. + */ + read_lock(&tasklist_lock); + do_each_thread(g, p) { + WARN_ON_ONCE(!list_empty(&p->cg_list) || + task_css_set(p) != &init_css_set); -static void cgroup_exit_root_id(struct cgroupfs_root *root) -{ - lockdep_assert_held(&cgroup_mutex); - lockdep_assert_held(&cgroup_root_mutex); + /* + * We should check if the process is exiting, otherwise + * it will race with cgroup_exit() in that the list + * entry won't be deleted though the process has exited. + * Do it while holding siglock so that we don't end up + * racing against cgroup_exit(). + */ + spin_lock_irq(&p->sighand->siglock); + if (!(p->flags & PF_EXITING)) { + struct css_set *cset = task_css_set(p); - if (root->hierarchy_id) { - idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id); - root->hierarchy_id = 0; - } + list_add(&p->cg_list, &cset->tasks); + get_css_set(cset); + } + spin_unlock_irq(&p->sighand->siglock); + } while_each_thread(g, p); + read_unlock(&tasklist_lock); +out_unlock: + up_write(&css_set_rwsem); } -static int cgroup_test_super(struct super_block *sb, void *data) +static void init_cgroup_housekeeping(struct cgroup *cgrp) { - struct cgroup_sb_opts *opts = data; - struct cgroupfs_root *root = sb->s_fs_info; + struct cgroup_subsys *ss; + int ssid; - /* If we asked for a name then it must match */ - if (opts->name && strcmp(opts->name, root->name)) - return 0; + INIT_LIST_HEAD(&cgrp->self.sibling); + INIT_LIST_HEAD(&cgrp->self.children); + INIT_LIST_HEAD(&cgrp->cset_links); + INIT_LIST_HEAD(&cgrp->release_list); + INIT_LIST_HEAD(&cgrp->pidlists); + mutex_init(&cgrp->pidlist_mutex); + cgrp->self.cgroup = cgrp; + cgrp->self.flags |= CSS_ONLINE; - /* - * If we asked for subsystems (or explicitly for no - * subsystems) then they must match - */ - if ((opts->subsys_mask || opts->none) - && (opts->subsys_mask != root->subsys_mask)) - return 0; + for_each_subsys(ss, ssid) + INIT_LIST_HEAD(&cgrp->e_csets[ssid]); - return 1; + init_waitqueue_head(&cgrp->offline_waitq); } -static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) +static void init_cgroup_root(struct cgroup_root *root, + struct cgroup_sb_opts *opts) { - struct cgroupfs_root *root; + struct cgroup *cgrp = &root->cgrp; - if (!opts->subsys_mask && !opts->none) - return NULL; - - root = kzalloc(sizeof(*root), GFP_KERNEL); - if (!root) - return ERR_PTR(-ENOMEM); - - init_cgroup_root(root); + INIT_LIST_HEAD(&root->root_list); + atomic_set(&root->nr_cgrps, 1); + cgrp->root = root; + init_cgroup_housekeeping(cgrp); + idr_init(&root->cgroup_idr); - /* - * We need to set @root->subsys_mask now so that @root can be - * matched by cgroup_test_super() before it finishes - * initialization; otherwise, competing mounts with the same - * options may try to bind the same subsystems instead of waiting - * for the first one leading to unexpected mount errors. - * SUBSYS_BOUND will be set once actual binding is complete. - */ - root->subsys_mask = opts->subsys_mask; root->flags = opts->flags; if (opts->release_agent) strcpy(root->release_agent_path, opts->release_agent); if (opts->name) strcpy(root->name, opts->name); if (opts->cpuset_clone_children) - set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags); - return root; + set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); } -static void cgroup_free_root(struct cgroupfs_root *root) +static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask) { - if (root) { - /* hierarhcy ID shoulid already have been released */ - WARN_ON_ONCE(root->hierarchy_id); + LIST_HEAD(tmp_links); + struct cgroup *root_cgrp = &root->cgrp; + struct css_set *cset; + int i, ret; - idr_destroy(&root->cgroup_idr); - kfree(root); - } -} + lockdep_assert_held(&cgroup_mutex); -static int cgroup_set_super(struct super_block *sb, void *data) -{ - int ret; - struct cgroup_sb_opts *opts = data; + ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_NOWAIT); + if (ret < 0) + goto out; + root_cgrp->id = ret; - /* If we don't have a new root, we can't set up a new sb */ - if (!opts->new_root) - return -EINVAL; + ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release); + if (ret) + goto out; - BUG_ON(!opts->subsys_mask && !opts->none); + /* + * We're accessing css_set_count without locking css_set_rwsem here, + * but that's OK - it can only be increased by someone holding + * cgroup_lock, and that's us. The worst that can happen is that we + * have some link structures left over + */ + ret = allocate_cgrp_cset_links(css_set_count, &tmp_links); + if (ret) + goto cancel_ref; - ret = set_anon_super(sb, NULL); + ret = cgroup_init_root_id(root); if (ret) - return ret; + goto cancel_ref; + + root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops, + KERNFS_ROOT_CREATE_DEACTIVATED, + root_cgrp); + if (IS_ERR(root->kf_root)) { + ret = PTR_ERR(root->kf_root); + goto exit_root_id; + } + root_cgrp->kn = root->kf_root->kn; - sb->s_fs_info = opts->new_root; - opts->new_root->sb = sb; + ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true); + if (ret) + goto destroy_root; - sb->s_blocksize = PAGE_CACHE_SIZE; - sb->s_blocksize_bits = PAGE_CACHE_SHIFT; - sb->s_magic = CGROUP_SUPER_MAGIC; - sb->s_op = &cgroup_ops; + ret = rebind_subsystems(root, ss_mask); + if (ret) + goto destroy_root; - return 0; -} + /* + * There must be no failure case after here, since rebinding takes + * care of subsystems' refcounts, which are explicitly dropped in + * the failure exit path. + */ + list_add(&root->root_list, &cgroup_roots); + cgroup_root_count++; -static int cgroup_get_rootdir(struct super_block *sb) -{ - static const struct dentry_operations cgroup_dops = { - .d_iput = cgroup_diput, - .d_delete = always_delete_dentry, - }; + /* + * Link the root cgroup in this hierarchy into all the css_set + * objects. + */ + down_write(&css_set_rwsem); + hash_for_each(css_set_table, i, cset, hlist) + link_css_set(&tmp_links, cset, root_cgrp); + up_write(&css_set_rwsem); - struct inode *inode = - cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb); + BUG_ON(!list_empty(&root_cgrp->self.children)); + BUG_ON(atomic_read(&root->nr_cgrps) != 1); - if (!inode) - return -ENOMEM; + kernfs_activate(root_cgrp->kn); + ret = 0; + goto out; - inode->i_fop = &simple_dir_operations; - inode->i_op = &cgroup_dir_inode_operations; - /* directories start off with i_nlink == 2 (for "." entry) */ - inc_nlink(inode); - sb->s_root = d_make_root(inode); - if (!sb->s_root) - return -ENOMEM; - /* for everything else we want ->d_op set */ - sb->s_d_op = &cgroup_dops; - return 0; +destroy_root: + kernfs_destroy_root(root->kf_root); + root->kf_root = NULL; +exit_root_id: + cgroup_exit_root_id(root); +cancel_ref: + percpu_ref_cancel_init(&root_cgrp->self.refcnt); +out: + free_cgrp_cset_links(&tmp_links); + return ret; } static struct dentry *cgroup_mount(struct file_system_type *fs_type, int flags, const char *unused_dev_name, void *data) { + struct super_block *pinned_sb = NULL; + struct cgroup_subsys *ss; + struct cgroup_root *root; struct cgroup_sb_opts opts; - struct cgroupfs_root *root; - int ret = 0; - struct super_block *sb; - struct cgroupfs_root *new_root; - struct list_head tmp_links; - struct inode *inode; - const struct cred *cred; + struct dentry *dentry; + int ret; + int i; + bool new_sb; + + /* + * The first time anyone tries to mount a cgroup, enable the list + * linking each css_set to its tasks and fix up all existing tasks. + */ + if (!use_task_css_set_links) + cgroup_enable_task_cg_lists(); - /* First find the desired set of subsystems */ mutex_lock(&cgroup_mutex); + + /* First find the desired set of subsystems */ ret = parse_cgroupfs_options(data, &opts); - mutex_unlock(&cgroup_mutex); if (ret) - goto out_err; + goto out_unlock; + + /* look for a matching existing root */ + if (!opts.subsys_mask && !opts.none && !opts.name) { + cgrp_dfl_root_visible = true; + root = &cgrp_dfl_root; + cgroup_get(&root->cgrp); + ret = 0; + goto out_unlock; + } /* - * Allocate a new cgroup root. We may not need it if we're - * reusing an existing hierarchy. + * Destruction of cgroup root is asynchronous, so subsystems may + * still be dying after the previous unmount. Let's drain the + * dying subsystems. We just need to ensure that the ones + * unmounted previously finish dying and don't care about new ones + * starting. Testing ref liveliness is good enough. */ - new_root = cgroup_root_from_opts(&opts); - if (IS_ERR(new_root)) { - ret = PTR_ERR(new_root); - goto out_err; - } - opts.new_root = new_root; + for_each_subsys(ss, i) { + if (!(opts.subsys_mask & (1 << i)) || + ss->root == &cgrp_dfl_root) + continue; - /* Locate an existing or new sb for this hierarchy */ - sb = sget(fs_type, cgroup_test_super, cgroup_set_super, 0, &opts); - if (IS_ERR(sb)) { - ret = PTR_ERR(sb); - cgroup_free_root(opts.new_root); - goto out_err; + if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) { + mutex_unlock(&cgroup_mutex); + msleep(10); + ret = restart_syscall(); + goto out_free; + } + cgroup_put(&ss->root->cgrp); } - root = sb->s_fs_info; - BUG_ON(!root); - if (root == opts.new_root) { - /* We used the new root structure, so this is a new hierarchy */ - struct cgroup *root_cgrp = &root->top_cgroup; - struct cgroupfs_root *existing_root; - int i; - struct css_set *cset; - - BUG_ON(sb->s_root != NULL); - - ret = cgroup_get_rootdir(sb); - if (ret) - goto drop_new_super; - inode = sb->s_root->d_inode; - - mutex_lock(&inode->i_mutex); - mutex_lock(&cgroup_mutex); - mutex_lock(&cgroup_root_mutex); + for_each_root(root) { + bool name_match = false; - root_cgrp->id = idr_alloc(&root->cgroup_idr, root_cgrp, - 0, 1, GFP_KERNEL); - if (root_cgrp->id < 0) - goto unlock_drop; - - /* Check for name clashes with existing mounts */ - ret = -EBUSY; - if (strlen(root->name)) - for_each_active_root(existing_root) - if (!strcmp(existing_root->name, root->name)) - goto unlock_drop; - - /* - * We're accessing css_set_count without locking - * css_set_lock here, but that's OK - it can only be - * increased by someone holding cgroup_lock, and - * that's us. The worst that can happen is that we - * have some link structures left over - */ - ret = allocate_cgrp_cset_links(css_set_count, &tmp_links); - if (ret) - goto unlock_drop; - - /* ID 0 is reserved for dummy root, 1 for unified hierarchy */ - ret = cgroup_init_root_id(root, 2, 0); - if (ret) - goto unlock_drop; - - sb->s_root->d_fsdata = root_cgrp; - root_cgrp->dentry = sb->s_root; - - /* - * We're inside get_sb() and will call lookup_one_len() to - * create the root files, which doesn't work if SELinux is - * in use. The following cred dancing somehow works around - * it. See 2ce9738ba ("cgroupfs: use init_cred when - * populating new cgroupfs mount") for more details. - */ - cred = override_creds(&init_cred); - - ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true); - if (ret) - goto rm_base_files; - - ret = rebind_subsystems(root, root->subsys_mask, 0); - if (ret) - goto rm_base_files; - - revert_creds(cred); + if (root == &cgrp_dfl_root) + continue; /* - * There must be no failure case after here, since rebinding - * takes care of subsystems' refcounts, which are explicitly - * dropped in the failure exit path. + * If we asked for a name then it must match. Also, if + * name matches but sybsys_mask doesn't, we should fail. + * Remember whether name matched. */ + if (opts.name) { + if (strcmp(opts.name, root->name)) + continue; + name_match = true; + } - list_add(&root->root_list, &cgroup_roots); - cgroup_root_count++; - - /* Link the top cgroup in this hierarchy into all - * the css_set objects */ - write_lock(&css_set_lock); - hash_for_each(css_set_table, i, cset, hlist) - link_css_set(&tmp_links, cset, root_cgrp); - write_unlock(&css_set_lock); - - free_cgrp_cset_links(&tmp_links); - - BUG_ON(!list_empty(&root_cgrp->children)); - BUG_ON(root->number_of_cgroups != 1); - - mutex_unlock(&cgroup_root_mutex); - mutex_unlock(&cgroup_mutex); - mutex_unlock(&inode->i_mutex); - } else { /* - * We re-used an existing hierarchy - the new root (if - * any) is not needed + * If we asked for subsystems (or explicitly for no + * subsystems) then they must match. */ - cgroup_free_root(opts.new_root); + if ((opts.subsys_mask || opts.none) && + (opts.subsys_mask != root->subsys_mask)) { + if (!name_match) + continue; + ret = -EBUSY; + goto out_unlock; + } if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) { if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) { - pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n"); + pr_err("sane_behavior: new mount options should match the existing superblock\n"); ret = -EINVAL; - goto drop_new_super; + goto out_unlock; } else { - pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n"); + pr_warn("new mount options do not match the existing superblock, will be ignored\n"); } } + + /* + * We want to reuse @root whose lifetime is governed by its + * ->cgrp. Let's check whether @root is alive and keep it + * that way. As cgroup_kill_sb() can happen anytime, we + * want to block it by pinning the sb so that @root doesn't + * get killed before mount is complete. + * + * With the sb pinned, tryget_live can reliably indicate + * whether @root can be reused. If it's being killed, + * drain it. We can use wait_queue for the wait but this + * path is super cold. Let's just sleep a bit and retry. + */ + pinned_sb = kernfs_pin_sb(root->kf_root, NULL); + if (IS_ERR(pinned_sb) || + !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) { + mutex_unlock(&cgroup_mutex); + if (!IS_ERR_OR_NULL(pinned_sb)) + deactivate_super(pinned_sb); + msleep(10); + ret = restart_syscall(); + goto out_free; + } + + ret = 0; + goto out_unlock; } - kfree(opts.release_agent); - kfree(opts.name); - return dget(sb->s_root); + /* + * No such thing, create a new one. name= matching without subsys + * specification is allowed for already existing hierarchies but we + * can't create new one without subsys specification. + */ + if (!opts.subsys_mask && !opts.none) { + ret = -EINVAL; + goto out_unlock; + } - rm_base_files: - free_cgrp_cset_links(&tmp_links); - cgroup_addrm_files(&root->top_cgroup, cgroup_base_files, false); - revert_creds(cred); - unlock_drop: - cgroup_exit_root_id(root); - mutex_unlock(&cgroup_root_mutex); - mutex_unlock(&cgroup_mutex); - mutex_unlock(&inode->i_mutex); - drop_new_super: - deactivate_locked_super(sb); - out_err: - kfree(opts.release_agent); - kfree(opts.name); - return ERR_PTR(ret); -} + root = kzalloc(sizeof(*root), GFP_KERNEL); + if (!root) { + ret = -ENOMEM; + goto out_unlock; + } -static void cgroup_kill_sb(struct super_block *sb) -{ - struct cgroupfs_root *root = sb->s_fs_info; - struct cgroup *cgrp = &root->top_cgroup; - struct cgrp_cset_link *link, *tmp_link; - int ret; + init_cgroup_root(root, &opts); - BUG_ON(!root); + ret = cgroup_setup_root(root, opts.subsys_mask); + if (ret) + cgroup_free_root(root); - BUG_ON(root->number_of_cgroups != 1); - BUG_ON(!list_empty(&cgrp->children)); +out_unlock: + mutex_unlock(&cgroup_mutex); +out_free: + kfree(opts.release_agent); + kfree(opts.name); - mutex_lock(&cgrp->dentry->d_inode->i_mutex); - mutex_lock(&cgroup_mutex); - mutex_lock(&cgroup_root_mutex); + if (ret) + return ERR_PTR(ret); - /* Rebind all subsystems back to the default hierarchy */ - if (root->flags & CGRP_ROOT_SUBSYS_BOUND) { - ret = rebind_subsystems(root, 0, root->subsys_mask); - /* Shouldn't be able to fail ... */ - BUG_ON(ret); - } + dentry = kernfs_mount(fs_type, flags, root->kf_root, + CGROUP_SUPER_MAGIC, &new_sb); + if (IS_ERR(dentry) || !new_sb) + cgroup_put(&root->cgrp); /* - * Release all the links from cset_links to this hierarchy's - * root cgroup + * If @pinned_sb, we're reusing an existing root and holding an + * extra ref on its sb. Mount is complete. Put the extra ref. */ - write_lock(&css_set_lock); - - list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) { - list_del(&link->cset_link); - list_del(&link->cgrp_link); - kfree(link); - } - write_unlock(&css_set_lock); - - if (!list_empty(&root->root_list)) { - list_del(&root->root_list); - cgroup_root_count--; + if (pinned_sb) { + WARN_ON(new_sb); + deactivate_super(pinned_sb); } - cgroup_exit_root_id(root); + return dentry; +} - mutex_unlock(&cgroup_root_mutex); - mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgrp->dentry->d_inode->i_mutex); +static void cgroup_kill_sb(struct super_block *sb) +{ + struct kernfs_root *kf_root = kernfs_root_from_sb(sb); + struct cgroup_root *root = cgroup_root_from_kf(kf_root); - simple_xattrs_free(&cgrp->xattrs); + /* + * If @root doesn't have any mounts or children, start killing it. + * This prevents new mounts by disabling percpu_ref_tryget_live(). + * cgroup_mount() may wait for @root's release. + * + * And don't kill the default root. + */ + if (css_has_online_children(&root->cgrp.self) || + root == &cgrp_dfl_root) + cgroup_put(&root->cgrp); + else + percpu_ref_kill(&root->cgrp.self.refcnt); - kill_litter_super(sb); - cgroup_free_root(root); + kernfs_kill_sb(sb); } static struct file_system_type cgroup_fs_type = { @@ -1741,57 +1845,6 @@ static struct file_system_type cgroup_fs_type = { static struct kobject *cgroup_kobj; /** - * cgroup_path - generate the path of a cgroup - * @cgrp: the cgroup in question - * @buf: the buffer to write the path into - * @buflen: the length of the buffer - * - * Writes path of cgroup into buf. Returns 0 on success, -errno on error. - * - * We can't generate cgroup path using dentry->d_name, as accessing - * dentry->name must be protected by irq-unsafe dentry->d_lock or parent - * inode's i_mutex, while on the other hand cgroup_path() can be called - * with some irq-safe spinlocks held. - */ -int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) -{ - int ret = -ENAMETOOLONG; - char *start; - - if (!cgrp->parent) { - if (strlcpy(buf, "/", buflen) >= buflen) - return -ENAMETOOLONG; - return 0; - } - - start = buf + buflen - 1; - *start = '\0'; - - rcu_read_lock(); - do { - const char *name = cgroup_name(cgrp); - int len; - - len = strlen(name); - if ((start -= len) < buf) - goto out; - memcpy(start, name, len); - - if (--start < buf) - goto out; - *start = '/'; - - cgrp = cgrp->parent; - } while (cgrp->parent); - ret = 0; - memmove(buf, start, buf + buflen - start); -out: - rcu_read_unlock(); - return ret; -} -EXPORT_SYMBOL_GPL(cgroup_path); - -/** * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy * @task: target task * @buf: the buffer to write the path into @@ -1802,49 +1855,55 @@ EXPORT_SYMBOL_GPL(cgroup_path); * function grabs cgroup_mutex and shouldn't be used inside locks used by * cgroup controller callbacks. * - * Returns 0 on success, fails with -%ENAMETOOLONG if @buflen is too short. + * Return value is the same as kernfs_path(). */ -int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) +char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) { - struct cgroupfs_root *root; + struct cgroup_root *root; struct cgroup *cgrp; - int hierarchy_id = 1, ret = 0; - - if (buflen < 2) - return -ENAMETOOLONG; + int hierarchy_id = 1; + char *path = NULL; mutex_lock(&cgroup_mutex); + down_read(&css_set_rwsem); root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id); if (root) { cgrp = task_cgroup_from_root(task, root); - ret = cgroup_path(cgrp, buf, buflen); + path = cgroup_path(cgrp, buf, buflen); } else { /* if no hierarchy exists, everyone is in "/" */ - memcpy(buf, "/", 2); + if (strlcpy(buf, "/", buflen) < buflen) + path = buf; } + up_read(&css_set_rwsem); mutex_unlock(&cgroup_mutex); - return ret; + return path; } EXPORT_SYMBOL_GPL(task_cgroup_path); -/* - * Control Group taskset - */ -struct task_and_cgroup { - struct task_struct *task; - struct cgroup *cgrp; - struct css_set *cset; -}; - +/* used to track tasks and other necessary states during migration */ struct cgroup_taskset { - struct task_and_cgroup single; - struct flex_array *tc_array; - int tc_array_len; - int idx; - struct cgroup *cur_cgrp; + /* the src and dst cset list running through cset->mg_node */ + struct list_head src_csets; + struct list_head dst_csets; + + /* + * Fields for cgroup_taskset_*() iteration. + * + * Before migration is committed, the target migration tasks are on + * ->mg_tasks of the csets on ->src_csets. After, on ->mg_tasks of + * the csets on ->dst_csets. ->csets point to either ->src_csets + * or ->dst_csets depending on whether migration is committed. + * + * ->cur_csets and ->cur_task point to the current task position + * during iteration. + */ + struct list_head *csets; + struct css_set *cur_cset; + struct task_struct *cur_task; }; /** @@ -1855,15 +1914,11 @@ struct cgroup_taskset { */ struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset) { - if (tset->tc_array) { - tset->idx = 0; - return cgroup_taskset_next(tset); - } else { - tset->cur_cgrp = tset->single.cgrp; - return tset->single.task; - } + tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node); + tset->cur_task = NULL; + + return cgroup_taskset_next(tset); } -EXPORT_SYMBOL_GPL(cgroup_taskset_first); /** * cgroup_taskset_next - iterate to the next task in taskset @@ -1874,48 +1929,36 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_first); */ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset) { - struct task_and_cgroup *tc; + struct css_set *cset = tset->cur_cset; + struct task_struct *task = tset->cur_task; - if (!tset->tc_array || tset->idx >= tset->tc_array_len) - return NULL; + while (&cset->mg_node != tset->csets) { + if (!task) + task = list_first_entry(&cset->mg_tasks, + struct task_struct, cg_list); + else + task = list_next_entry(task, cg_list); - tc = flex_array_get(tset->tc_array, tset->idx++); - tset->cur_cgrp = tc->cgrp; - return tc->task; -} -EXPORT_SYMBOL_GPL(cgroup_taskset_next); + if (&task->cg_list != &cset->mg_tasks) { + tset->cur_cset = cset; + tset->cur_task = task; + return task; + } -/** - * cgroup_taskset_cur_css - return the matching css for the current task - * @tset: taskset of interest - * @subsys_id: the ID of the target subsystem - * - * Return the css for the current (last returned) task of @tset for - * subsystem specified by @subsys_id. This function must be preceded by - * either cgroup_taskset_first() or cgroup_taskset_next(). - */ -struct cgroup_subsys_state *cgroup_taskset_cur_css(struct cgroup_taskset *tset, - int subsys_id) -{ - return cgroup_css(tset->cur_cgrp, cgroup_subsys[subsys_id]); -} -EXPORT_SYMBOL_GPL(cgroup_taskset_cur_css); + cset = list_next_entry(cset, mg_node); + task = NULL; + } -/** - * cgroup_taskset_size - return the number of tasks in taskset - * @tset: taskset of interest - */ -int cgroup_taskset_size(struct cgroup_taskset *tset) -{ - return tset->tc_array ? tset->tc_array_len : 1; + return NULL; } -EXPORT_SYMBOL_GPL(cgroup_taskset_size); - -/* +/** * cgroup_task_migrate - move a task from one cgroup to another. + * @old_cgrp: the cgroup @tsk is being migrated from + * @tsk: the task being migrated + * @new_cset: the new css_set @tsk is being attached to * - * Must be called with cgroup_mutex and threadgroup locked. + * Must be called with cgroup_mutex, threadgroup and css_set_rwsem locked. */ static void cgroup_task_migrate(struct cgroup *old_cgrp, struct task_struct *tsk, @@ -1923,6 +1966,9 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp, { struct css_set *old_cset; + lockdep_assert_held(&cgroup_mutex); + lockdep_assert_held(&css_set_rwsem); + /* * We are synchronized through threadgroup_lock() against PF_EXITING * setting such that we can't race against cgroup_exit() changing the @@ -1931,15 +1977,16 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp, WARN_ON_ONCE(tsk->flags & PF_EXITING); old_cset = task_css_set(tsk); - task_lock(tsk); + get_css_set(new_cset); rcu_assign_pointer(tsk->cgroups, new_cset); - task_unlock(tsk); - /* Update the css_set linked lists if we're using them */ - write_lock(&css_set_lock); - if (!list_empty(&tsk->cg_list)) - list_move(&tsk->cg_list, &new_cset->tasks); - write_unlock(&css_set_lock); + /* + * Use move_tail so that cgroup_taskset_first() still returns the + * leader after migration. This works because cgroup_migrate() + * ensures that the dst_cset of the leader is the first on the + * tset's dst_csets list. + */ + list_move_tail(&tsk->cg_list, &new_cset->mg_tasks); /* * We just gained a reference on old_cset by taking it from the @@ -1947,100 +1994,219 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp, * we're safe to drop it here; it will be freed under RCU. */ set_bit(CGRP_RELEASABLE, &old_cgrp->flags); - put_css_set(old_cset); + put_css_set_locked(old_cset, false); } /** - * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup - * @cgrp: the cgroup to attach to - * @tsk: the task or the leader of the threadgroup to be attached - * @threadgroup: attach the whole threadgroup? + * cgroup_migrate_finish - cleanup after attach + * @preloaded_csets: list of preloaded css_sets * - * Call holding cgroup_mutex and the group_rwsem of the leader. Will take - * task_lock of @tsk or each thread in the threadgroup individually in turn. + * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst(). See + * those functions for details. */ -static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, - bool threadgroup) +static void cgroup_migrate_finish(struct list_head *preloaded_csets) { - int retval, i, group_size; - struct cgroupfs_root *root = cgrp->root; - struct cgroup_subsys_state *css, *failed_css = NULL; - /* threadgroup list cursor and array */ - struct task_struct *leader = tsk; - struct task_and_cgroup *tc; - struct flex_array *group; - struct cgroup_taskset tset = { }; + struct css_set *cset, *tmp_cset; + + lockdep_assert_held(&cgroup_mutex); + + down_write(&css_set_rwsem); + list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) { + cset->mg_src_cgrp = NULL; + cset->mg_dst_cset = NULL; + list_del_init(&cset->mg_preload_node); + put_css_set_locked(cset, false); + } + up_write(&css_set_rwsem); +} + +/** + * cgroup_migrate_add_src - add a migration source css_set + * @src_cset: the source css_set to add + * @dst_cgrp: the destination cgroup + * @preloaded_csets: list of preloaded css_sets + * + * Tasks belonging to @src_cset are about to be migrated to @dst_cgrp. Pin + * @src_cset and add it to @preloaded_csets, which should later be cleaned + * up by cgroup_migrate_finish(). + * + * This function may be called without holding threadgroup_lock even if the + * target is a process. Threads may be created and destroyed but as long + * as cgroup_mutex is not dropped, no new css_set can be put into play and + * the preloaded css_sets are guaranteed to cover all migrations. + */ +static void cgroup_migrate_add_src(struct css_set *src_cset, + struct cgroup *dst_cgrp, + struct list_head *preloaded_csets) +{ + struct cgroup *src_cgrp; + + lockdep_assert_held(&cgroup_mutex); + lockdep_assert_held(&css_set_rwsem); + + src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root); + + if (!list_empty(&src_cset->mg_preload_node)) + return; + + WARN_ON(src_cset->mg_src_cgrp); + WARN_ON(!list_empty(&src_cset->mg_tasks)); + WARN_ON(!list_empty(&src_cset->mg_node)); + + src_cset->mg_src_cgrp = src_cgrp; + get_css_set(src_cset); + list_add(&src_cset->mg_preload_node, preloaded_csets); +} + +/** + * cgroup_migrate_prepare_dst - prepare destination css_sets for migration + * @dst_cgrp: the destination cgroup (may be %NULL) + * @preloaded_csets: list of preloaded source css_sets + * + * Tasks are about to be moved to @dst_cgrp and all the source css_sets + * have been preloaded to @preloaded_csets. This function looks up and + * pins all destination css_sets, links each to its source, and append them + * to @preloaded_csets. If @dst_cgrp is %NULL, the destination of each + * source css_set is assumed to be its cgroup on the default hierarchy. + * + * This function must be called after cgroup_migrate_add_src() has been + * called on each migration source css_set. After migration is performed + * using cgroup_migrate(), cgroup_migrate_finish() must be called on + * @preloaded_csets. + */ +static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp, + struct list_head *preloaded_csets) +{ + LIST_HEAD(csets); + struct css_set *src_cset, *tmp_cset; + + lockdep_assert_held(&cgroup_mutex); /* - * step 0: in order to do expensive, possibly blocking operations for - * every thread, we cannot iterate the thread group list, since it needs - * rcu or tasklist locked. instead, build an array of all threads in the - * group - group_rwsem prevents new threads from appearing, and if - * threads exit, this will just be an over-estimate. + * Except for the root, child_subsys_mask must be zero for a cgroup + * with tasks so that child cgroups don't compete against tasks. */ - if (threadgroup) - group_size = get_nr_threads(tsk); - else - group_size = 1; - /* flex_array supports very large thread-groups better than kmalloc. */ - group = flex_array_alloc(sizeof(*tc), group_size, GFP_KERNEL); - if (!group) - return -ENOMEM; - /* pre-allocate to guarantee space while iterating in rcu read-side. */ - retval = flex_array_prealloc(group, 0, group_size, GFP_KERNEL); - if (retval) - goto out_free_group_list; + if (dst_cgrp && cgroup_on_dfl(dst_cgrp) && cgroup_parent(dst_cgrp) && + dst_cgrp->child_subsys_mask) + return -EBUSY; + + /* look up the dst cset for each src cset and link it to src */ + list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) { + struct css_set *dst_cset; + + dst_cset = find_css_set(src_cset, + dst_cgrp ?: src_cset->dfl_cgrp); + if (!dst_cset) + goto err; + + WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset); + + /* + * If src cset equals dst, it's noop. Drop the src. + * cgroup_migrate() will skip the cset too. Note that we + * can't handle src == dst as some nodes are used by both. + */ + if (src_cset == dst_cset) { + src_cset->mg_src_cgrp = NULL; + list_del_init(&src_cset->mg_preload_node); + put_css_set(src_cset, false); + put_css_set(dst_cset, false); + continue; + } + + src_cset->mg_dst_cset = dst_cset; + + if (list_empty(&dst_cset->mg_preload_node)) + list_add(&dst_cset->mg_preload_node, &csets); + else + put_css_set(dst_cset, false); + } + + list_splice_tail(&csets, preloaded_csets); + return 0; +err: + cgroup_migrate_finish(&csets); + return -ENOMEM; +} + +/** + * cgroup_migrate - migrate a process or task to a cgroup + * @cgrp: the destination cgroup + * @leader: the leader of the process or the task to migrate + * @threadgroup: whether @leader points to the whole process or a single task + * + * Migrate a process or task denoted by @leader to @cgrp. If migrating a + * process, the caller must be holding threadgroup_lock of @leader. The + * caller is also responsible for invoking cgroup_migrate_add_src() and + * cgroup_migrate_prepare_dst() on the targets before invoking this + * function and following up with cgroup_migrate_finish(). + * + * As long as a controller's ->can_attach() doesn't fail, this function is + * guaranteed to succeed. This means that, excluding ->can_attach() + * failure, when migrating multiple targets, the success or failure can be + * decided for all targets by invoking group_migrate_prepare_dst() before + * actually starting migrating. + */ +static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader, + bool threadgroup) +{ + struct cgroup_taskset tset = { + .src_csets = LIST_HEAD_INIT(tset.src_csets), + .dst_csets = LIST_HEAD_INIT(tset.dst_csets), + .csets = &tset.src_csets, + }; + struct cgroup_subsys_state *css, *failed_css = NULL; + struct css_set *cset, *tmp_cset; + struct task_struct *task, *tmp_task; + int i, ret; - i = 0; /* * Prevent freeing of tasks while we take a snapshot. Tasks that are * already PF_EXITING could be freed from underneath us unless we * take an rcu_read_lock. */ + down_write(&css_set_rwsem); rcu_read_lock(); + task = leader; do { - struct task_and_cgroup ent; + /* @task either already exited or can't exit until the end */ + if (task->flags & PF_EXITING) + goto next; - /* @tsk either already exited or can't exit until the end */ - if (tsk->flags & PF_EXITING) + /* leave @task alone if post_fork() hasn't linked it yet */ + if (list_empty(&task->cg_list)) goto next; - /* as per above, nr_threads may decrease, but not increase. */ - BUG_ON(i >= group_size); - ent.task = tsk; - ent.cgrp = task_cgroup_from_root(tsk, root); - /* nothing to do if this task is already in the cgroup */ - if (ent.cgrp == cgrp) + cset = task_css_set(task); + if (!cset->mg_src_cgrp) goto next; + /* - * saying GFP_ATOMIC has no effect here because we did prealloc - * earlier, but it's good form to communicate our expectations. + * cgroup_taskset_first() must always return the leader. + * Take care to avoid disturbing the ordering. */ - retval = flex_array_put(group, i, &ent, GFP_ATOMIC); - BUG_ON(retval != 0); - i++; + list_move_tail(&task->cg_list, &cset->mg_tasks); + if (list_empty(&cset->mg_node)) + list_add_tail(&cset->mg_node, &tset.src_csets); + if (list_empty(&cset->mg_dst_cset->mg_node)) + list_move_tail(&cset->mg_dst_cset->mg_node, + &tset.dst_csets); next: if (!threadgroup) break; - } while_each_thread(leader, tsk); + } while_each_thread(leader, task); rcu_read_unlock(); - /* remember the number of threads in the array for later. */ - group_size = i; - tset.tc_array = group; - tset.tc_array_len = group_size; + up_write(&css_set_rwsem); /* methods shouldn't be called if no task is actually migrating */ - retval = 0; - if (!group_size) - goto out_free_group_list; + if (list_empty(&tset.src_csets)) + return 0; - /* - * step 1: check that we can legitimately attach to the cgroup. - */ - for_each_css(css, i, cgrp) { + /* check that we can legitimately attach to the cgroup */ + for_each_e_css(css, i, cgrp) { if (css->ss->can_attach) { - retval = css->ss->can_attach(css, &tset); - if (retval) { + ret = css->ss->can_attach(css, &tset); + if (ret) { failed_css = css; goto out_cancel_attach; } @@ -2048,78 +2214,106 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, } /* - * step 2: make sure css_sets exist for all threads to be migrated. - * we use find_css_set, which allocates a new one if necessary. + * Now that we're guaranteed success, proceed to move all tasks to + * the new cgroup. There are no failure cases after here, so this + * is the commit point. */ - for (i = 0; i < group_size; i++) { - struct css_set *old_cset; - - tc = flex_array_get(group, i); - old_cset = task_css_set(tc->task); - tc->cset = find_css_set(old_cset, cgrp); - if (!tc->cset) { - retval = -ENOMEM; - goto out_put_css_set_refs; - } + down_write(&css_set_rwsem); + list_for_each_entry(cset, &tset.src_csets, mg_node) { + list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) + cgroup_task_migrate(cset->mg_src_cgrp, task, + cset->mg_dst_cset); } + up_write(&css_set_rwsem); /* - * step 3: now that we're guaranteed success wrt the css_sets, - * proceed to move all tasks to the new cgroup. There are no - * failure cases after here, so this is the commit point. + * Migration is committed, all target tasks are now on dst_csets. + * Nothing is sensitive to fork() after this point. Notify + * controllers that migration is complete. */ - for (i = 0; i < group_size; i++) { - tc = flex_array_get(group, i); - cgroup_task_migrate(tc->cgrp, tc->task, tc->cset); - } - /* nothing is sensitive to fork() after this point. */ + tset.csets = &tset.dst_csets; - /* - * step 4: do subsystem attach callbacks. - */ - for_each_css(css, i, cgrp) + for_each_e_css(css, i, cgrp) if (css->ss->attach) css->ss->attach(css, &tset); - /* - * step 5: success! and cleanup - */ - retval = 0; -out_put_css_set_refs: - if (retval) { - for (i = 0; i < group_size; i++) { - tc = flex_array_get(group, i); - if (!tc->cset) - break; - put_css_set(tc->cset); - } - } + ret = 0; + goto out_release_tset; + out_cancel_attach: - if (retval) { - for_each_css(css, i, cgrp) { - if (css == failed_css) - break; - if (css->ss->cancel_attach) - css->ss->cancel_attach(css, &tset); - } + for_each_e_css(css, i, cgrp) { + if (css == failed_css) + break; + if (css->ss->cancel_attach) + css->ss->cancel_attach(css, &tset); } -out_free_group_list: - flex_array_free(group); - return retval; +out_release_tset: + down_write(&css_set_rwsem); + list_splice_init(&tset.dst_csets, &tset.src_csets); + list_for_each_entry_safe(cset, tmp_cset, &tset.src_csets, mg_node) { + list_splice_tail_init(&cset->mg_tasks, &cset->tasks); + list_del_init(&cset->mg_node); + } + up_write(&css_set_rwsem); + return ret; +} + +/** + * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup + * @dst_cgrp: the cgroup to attach to + * @leader: the task or the leader of the threadgroup to be attached + * @threadgroup: attach the whole threadgroup? + * + * Call holding cgroup_mutex and threadgroup_lock of @leader. + */ +static int cgroup_attach_task(struct cgroup *dst_cgrp, + struct task_struct *leader, bool threadgroup) +{ + LIST_HEAD(preloaded_csets); + struct task_struct *task; + int ret; + + /* look up all src csets */ + down_read(&css_set_rwsem); + rcu_read_lock(); + task = leader; + do { + cgroup_migrate_add_src(task_css_set(task), dst_cgrp, + &preloaded_csets); + if (!threadgroup) + break; + } while_each_thread(leader, task); + rcu_read_unlock(); + up_read(&css_set_rwsem); + + /* prepare dst csets and commit */ + ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets); + if (!ret) + ret = cgroup_migrate(dst_cgrp, leader, threadgroup); + + cgroup_migrate_finish(&preloaded_csets); + return ret; } /* * Find the task_struct of the task to attach by vpid and pass it along to the * function to attach either it or all tasks in its threadgroup. Will lock - * cgroup_mutex and threadgroup; may take task_lock of task. + * cgroup_mutex and threadgroup. */ -static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) +static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off, bool threadgroup) { struct task_struct *tsk; const struct cred *cred = current_cred(), *tcred; + struct cgroup *cgrp; + pid_t pid; int ret; - if (!cgroup_lock_live_group(cgrp)) + if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) + return -EINVAL; + + cgrp = cgroup_kn_lock_live(of->kn); + if (!cgrp) return -ENODEV; retry_find_task: @@ -2185,8 +2379,8 @@ retry_find_task: put_task_struct(tsk); out_unlock_cgroup: - mutex_unlock(&cgroup_mutex); - return ret; + cgroup_kn_unlock(of->kn); + return ret ?: nbytes; } /** @@ -2196,12 +2390,19 @@ out_unlock_cgroup: */ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) { - struct cgroupfs_root *root; + struct cgroup_root *root; int retval = 0; mutex_lock(&cgroup_mutex); - for_each_active_root(root) { - struct cgroup *from_cgrp = task_cgroup_from_root(from, root); + for_each_root(root) { + struct cgroup *from_cgrp; + + if (root == &cgrp_dfl_root) + continue; + + down_read(&css_set_rwsem); + from_cgrp = task_cgroup_from_root(from, root); + up_read(&css_set_rwsem); retval = cgroup_attach_task(from_cgrp, tsk, false); if (retval) @@ -2213,42 +2414,44 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) } EXPORT_SYMBOL_GPL(cgroup_attach_task_all); -static int cgroup_tasks_write(struct cgroup_subsys_state *css, - struct cftype *cft, u64 pid) +static ssize_t cgroup_tasks_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { - return attach_task_by_pid(css->cgroup, pid, false); + return __cgroup_procs_write(of, buf, nbytes, off, false); } -static int cgroup_procs_write(struct cgroup_subsys_state *css, - struct cftype *cft, u64 tgid) +static ssize_t cgroup_procs_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { - return attach_task_by_pid(css->cgroup, tgid, true); + return __cgroup_procs_write(of, buf, nbytes, off, true); } -static int cgroup_release_agent_write(struct cgroup_subsys_state *css, - struct cftype *cft, const char *buffer) +static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { - BUILD_BUG_ON(sizeof(css->cgroup->root->release_agent_path) < PATH_MAX); - if (strlen(buffer) >= PATH_MAX) - return -EINVAL; - if (!cgroup_lock_live_group(css->cgroup)) + struct cgroup *cgrp; + + BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); + + cgrp = cgroup_kn_lock_live(of->kn); + if (!cgrp) return -ENODEV; - mutex_lock(&cgroup_root_mutex); - strcpy(css->cgroup->root->release_agent_path, buffer); - mutex_unlock(&cgroup_root_mutex); - mutex_unlock(&cgroup_mutex); - return 0; + spin_lock(&release_agent_path_lock); + strlcpy(cgrp->root->release_agent_path, strstrip(buf), + sizeof(cgrp->root->release_agent_path)); + spin_unlock(&release_agent_path_lock); + cgroup_kn_unlock(of->kn); + return nbytes; } static int cgroup_release_agent_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; - if (!cgroup_lock_live_group(cgrp)) - return -ENODEV; + spin_lock(&release_agent_path_lock); seq_puts(seq, cgrp->root->release_agent_path); + spin_unlock(&release_agent_path_lock); seq_putc(seq, '\n'); - mutex_unlock(&cgroup_mutex); return 0; } @@ -2260,425 +2463,484 @@ static int cgroup_sane_behavior_show(struct seq_file *seq, void *v) return 0; } -/* A buffer size big enough for numbers or short strings */ -#define CGROUP_LOCAL_BUFFER_SIZE 64 +static void cgroup_print_ss_mask(struct seq_file *seq, unsigned int ss_mask) +{ + struct cgroup_subsys *ss; + bool printed = false; + int ssid; -static ssize_t cgroup_file_write(struct file *file, const char __user *userbuf, - size_t nbytes, loff_t *ppos) + for_each_subsys(ss, ssid) { + if (ss_mask & (1 << ssid)) { + if (printed) + seq_putc(seq, ' '); + seq_printf(seq, "%s", ss->name); + printed = true; + } + } + if (printed) + seq_putc(seq, '\n'); +} + +/* show controllers which are currently attached to the default hierarchy */ +static int cgroup_root_controllers_show(struct seq_file *seq, void *v) { - struct cfent *cfe = __d_cfe(file->f_dentry); - struct cftype *cft = __d_cft(file->f_dentry); - struct cgroup_subsys_state *css = cfe->css; - size_t max_bytes = cft->max_write_len ?: CGROUP_LOCAL_BUFFER_SIZE - 1; - char *buf; - int ret; + struct cgroup *cgrp = seq_css(seq)->cgroup; - if (nbytes >= max_bytes) - return -E2BIG; + cgroup_print_ss_mask(seq, cgrp->root->subsys_mask & + ~cgrp_dfl_root_inhibit_ss_mask); + return 0; +} - buf = kmalloc(nbytes + 1, GFP_KERNEL); - if (!buf) - return -ENOMEM; +/* show controllers which are enabled from the parent */ +static int cgroup_controllers_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; - if (copy_from_user(buf, userbuf, nbytes)) { - ret = -EFAULT; - goto out_free; - } + cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->child_subsys_mask); + return 0; +} - buf[nbytes] = '\0'; +/* show controllers which are enabled for a given cgroup's children */ +static int cgroup_subtree_control_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; - if (cft->write_string) { - ret = cft->write_string(css, cft, strstrip(buf)); - } else if (cft->write_u64) { - unsigned long long v; - ret = kstrtoull(buf, 0, &v); - if (!ret) - ret = cft->write_u64(css, cft, v); - } else if (cft->write_s64) { - long long v; - ret = kstrtoll(buf, 0, &v); - if (!ret) - ret = cft->write_s64(css, cft, v); - } else if (cft->trigger) { - ret = cft->trigger(css, (unsigned int)cft->private); - } else { - ret = -EINVAL; - } -out_free: - kfree(buf); - return ret ?: nbytes; + cgroup_print_ss_mask(seq, cgrp->child_subsys_mask); + return 0; } -/* - * seqfile ops/methods for returning structured data. Currently just - * supports string->u64 maps, but can be extended in future. +/** + * cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy + * @cgrp: root of the subtree to update csses for + * + * @cgrp's child_subsys_mask has changed and its subtree's (self excluded) + * css associations need to be updated accordingly. This function looks up + * all css_sets which are attached to the subtree, creates the matching + * updated css_sets and migrates the tasks to the new ones. */ - -static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos) +static int cgroup_update_dfl_csses(struct cgroup *cgrp) { - struct cftype *cft = seq_cft(seq); + LIST_HEAD(preloaded_csets); + struct cgroup_subsys_state *css; + struct css_set *src_cset; + int ret; - if (cft->seq_start) { - return cft->seq_start(seq, ppos); - } else { - /* - * The same behavior and code as single_open(). Returns - * !NULL if pos is at the beginning; otherwise, NULL. - */ - return NULL + !*ppos; + lockdep_assert_held(&cgroup_mutex); + + /* look up all csses currently attached to @cgrp's subtree */ + down_read(&css_set_rwsem); + css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) { + struct cgrp_cset_link *link; + + /* self is not affected by child_subsys_mask change */ + if (css->cgroup == cgrp) + continue; + + list_for_each_entry(link, &css->cgroup->cset_links, cset_link) + cgroup_migrate_add_src(link->cset, cgrp, + &preloaded_csets); } -} + up_read(&css_set_rwsem); -static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos) -{ - struct cftype *cft = seq_cft(seq); + /* NULL dst indicates self on default hierarchy */ + ret = cgroup_migrate_prepare_dst(NULL, &preloaded_csets); + if (ret) + goto out_finish; + + list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) { + struct task_struct *last_task = NULL, *task; + + /* src_csets precede dst_csets, break on the first dst_cset */ + if (!src_cset->mg_src_cgrp) + break; - if (cft->seq_next) { - return cft->seq_next(seq, v, ppos); - } else { /* - * The same behavior and code as single_open(), always - * terminate after the initial read. + * All tasks in src_cset need to be migrated to the + * matching dst_cset. Empty it process by process. We + * walk tasks but migrate processes. The leader might even + * belong to a different cset but such src_cset would also + * be among the target src_csets because the default + * hierarchy enforces per-process membership. */ - ++*ppos; - return NULL; - } -} + while (true) { + down_read(&css_set_rwsem); + task = list_first_entry_or_null(&src_cset->tasks, + struct task_struct, cg_list); + if (task) { + task = task->group_leader; + WARN_ON_ONCE(!task_css_set(task)->mg_src_cgrp); + get_task_struct(task); + } + up_read(&css_set_rwsem); -static void cgroup_seqfile_stop(struct seq_file *seq, void *v) -{ - struct cftype *cft = seq_cft(seq); + if (!task) + break; - if (cft->seq_stop) - cft->seq_stop(seq, v); -} + /* guard against possible infinite loop */ + if (WARN(last_task == task, + "cgroup: update_dfl_csses failed to make progress, aborting in inconsistent state\n")) + goto out_finish; + last_task = task; + + threadgroup_lock(task); + /* raced against de_thread() from another thread? */ + if (!thread_group_leader(task)) { + threadgroup_unlock(task); + put_task_struct(task); + continue; + } -static int cgroup_seqfile_show(struct seq_file *m, void *arg) -{ - struct cftype *cft = seq_cft(m); - struct cgroup_subsys_state *css = seq_css(m); + ret = cgroup_migrate(src_cset->dfl_cgrp, task, true); - if (cft->seq_show) - return cft->seq_show(m, arg); + threadgroup_unlock(task); + put_task_struct(task); - if (cft->read_u64) - seq_printf(m, "%llu\n", cft->read_u64(css, cft)); - else if (cft->read_s64) - seq_printf(m, "%lld\n", cft->read_s64(css, cft)); - else - return -EINVAL; - return 0; -} + if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret)) + goto out_finish; + } + } -static struct seq_operations cgroup_seq_operations = { - .start = cgroup_seqfile_start, - .next = cgroup_seqfile_next, - .stop = cgroup_seqfile_stop, - .show = cgroup_seqfile_show, -}; +out_finish: + cgroup_migrate_finish(&preloaded_csets); + return ret; +} -static int cgroup_file_open(struct inode *inode, struct file *file) +/* change the enabled child controllers for a cgroup in the default hierarchy */ +static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) { - struct cfent *cfe = __d_cfe(file->f_dentry); - struct cftype *cft = __d_cft(file->f_dentry); - struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent); - struct cgroup_subsys_state *css; - struct cgroup_open_file *of; - int err; - - err = generic_file_open(inode, file); - if (err) - return err; + unsigned int enable = 0, disable = 0; + struct cgroup *cgrp, *child; + struct cgroup_subsys *ss; + char *tok; + int ssid, ret; /* - * If the file belongs to a subsystem, pin the css. Will be - * unpinned either on open failure or release. This ensures that - * @css stays alive for all file operations. + * Parse input - space separated list of subsystem names prefixed + * with either + or -. */ - rcu_read_lock(); - css = cgroup_css(cgrp, cft->ss); - if (cft->ss && !css_tryget(css)) - css = NULL; - rcu_read_unlock(); + buf = strstrip(buf); + while ((tok = strsep(&buf, " "))) { + if (tok[0] == '\0') + continue; + for_each_subsys(ss, ssid) { + if (ss->disabled || strcmp(tok + 1, ss->name) || + ((1 << ss->id) & cgrp_dfl_root_inhibit_ss_mask)) + continue; - if (!css) + if (*tok == '+') { + enable |= 1 << ssid; + disable &= ~(1 << ssid); + } else if (*tok == '-') { + disable |= 1 << ssid; + enable &= ~(1 << ssid); + } else { + return -EINVAL; + } + break; + } + if (ssid == CGROUP_SUBSYS_COUNT) + return -EINVAL; + } + + cgrp = cgroup_kn_lock_live(of->kn); + if (!cgrp) return -ENODEV; - /* - * @cfe->css is used by read/write/close to determine the - * associated css. @file->private_data would be a better place but - * that's already used by seqfile. Multiple accessors may use it - * simultaneously which is okay as the association never changes. - */ - WARN_ON_ONCE(cfe->css && cfe->css != css); - cfe->css = css; + for_each_subsys(ss, ssid) { + if (enable & (1 << ssid)) { + if (cgrp->child_subsys_mask & (1 << ssid)) { + enable &= ~(1 << ssid); + continue; + } - of = __seq_open_private(file, &cgroup_seq_operations, - sizeof(struct cgroup_open_file)); - if (of) { - of->cfe = cfe; - return 0; - } + /* + * Because css offlining is asynchronous, userland + * might try to re-enable the same controller while + * the previous instance is still around. In such + * cases, wait till it's gone using offline_waitq. + */ + cgroup_for_each_live_child(child, cgrp) { + DEFINE_WAIT(wait); - if (css->ss) - css_put(css); - return -ENOMEM; -} + if (!cgroup_css(child, ss)) + continue; -static int cgroup_file_release(struct inode *inode, struct file *file) -{ - struct cfent *cfe = __d_cfe(file->f_dentry); - struct cgroup_subsys_state *css = cfe->css; + cgroup_get(child); + prepare_to_wait(&child->offline_waitq, &wait, + TASK_UNINTERRUPTIBLE); + cgroup_kn_unlock(of->kn); + schedule(); + finish_wait(&child->offline_waitq, &wait); + cgroup_put(child); - if (css->ss) - css_put(css); - return seq_release_private(inode, file); -} + return restart_syscall(); + } -/* - * cgroup_rename - Only allow simple rename of directories in place. - */ -static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) -{ - int ret; - struct cgroup_name *name, *old_name; - struct cgroup *cgrp; + /* unavailable or not enabled on the parent? */ + if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) || + (cgroup_parent(cgrp) && + !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ssid)))) { + ret = -ENOENT; + goto out_unlock; + } + } else if (disable & (1 << ssid)) { + if (!(cgrp->child_subsys_mask & (1 << ssid))) { + disable &= ~(1 << ssid); + continue; + } - /* - * It's convinient to use parent dir's i_mutex to protected - * cgrp->name. - */ - lockdep_assert_held(&old_dir->i_mutex); + /* a child has it enabled? */ + cgroup_for_each_live_child(child, cgrp) { + if (child->child_subsys_mask & (1 << ssid)) { + ret = -EBUSY; + goto out_unlock; + } + } + } + } - if (!S_ISDIR(old_dentry->d_inode->i_mode)) - return -ENOTDIR; - if (new_dentry->d_inode) - return -EEXIST; - if (old_dir != new_dir) - return -EIO; + if (!enable && !disable) { + ret = 0; + goto out_unlock; + } - cgrp = __d_cgrp(old_dentry); + /* + * Except for the root, child_subsys_mask must be zero for a cgroup + * with tasks so that child cgroups don't compete against tasks. + */ + if (enable && cgroup_parent(cgrp) && !list_empty(&cgrp->cset_links)) { + ret = -EBUSY; + goto out_unlock; + } /* - * This isn't a proper migration and its usefulness is very - * limited. Disallow if sane_behavior. + * Create csses for enables and update child_subsys_mask. This + * changes cgroup_e_css() results which in turn makes the + * subsequent cgroup_update_dfl_csses() associate all tasks in the + * subtree to the updated csses. */ - if (cgroup_sane_behavior(cgrp)) - return -EPERM; + for_each_subsys(ss, ssid) { + if (!(enable & (1 << ssid))) + continue; - name = cgroup_alloc_name(new_dentry); - if (!name) - return -ENOMEM; + cgroup_for_each_live_child(child, cgrp) { + ret = create_css(child, ss); + if (ret) + goto err_undo_css; + } + } - ret = simple_rename(old_dir, old_dentry, new_dir, new_dentry); - if (ret) { - kfree(name); - return ret; + cgrp->child_subsys_mask |= enable; + cgrp->child_subsys_mask &= ~disable; + + ret = cgroup_update_dfl_csses(cgrp); + if (ret) + goto err_undo_css; + + /* all tasks are now migrated away from the old csses, kill them */ + for_each_subsys(ss, ssid) { + if (!(disable & (1 << ssid))) + continue; + + cgroup_for_each_live_child(child, cgrp) + kill_css(cgroup_css(child, ss)); } - old_name = rcu_dereference_protected(cgrp->name, true); - rcu_assign_pointer(cgrp->name, name); + kernfs_activate(cgrp->kn); + ret = 0; +out_unlock: + cgroup_kn_unlock(of->kn); + return ret ?: nbytes; - kfree_rcu(old_name, rcu_head); - return 0; -} +err_undo_css: + cgrp->child_subsys_mask &= ~enable; + cgrp->child_subsys_mask |= disable; -static struct simple_xattrs *__d_xattrs(struct dentry *dentry) -{ - if (S_ISDIR(dentry->d_inode->i_mode)) - return &__d_cgrp(dentry)->xattrs; - else - return &__d_cfe(dentry)->xattrs; + for_each_subsys(ss, ssid) { + if (!(enable & (1 << ssid))) + continue; + + cgroup_for_each_live_child(child, cgrp) { + struct cgroup_subsys_state *css = cgroup_css(child, ss); + if (css) + kill_css(css); + } + } + goto out_unlock; } -static inline int xattr_enabled(struct dentry *dentry) +static int cgroup_populated_show(struct seq_file *seq, void *v) { - struct cgroupfs_root *root = dentry->d_sb->s_fs_info; - return root->flags & CGRP_ROOT_XATTR; + seq_printf(seq, "%d\n", (bool)seq_css(seq)->cgroup->populated_cnt); + return 0; } -static bool is_valid_xattr(const char *name) +static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) { - if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || - !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) - return true; - return false; + struct cgroup *cgrp = of->kn->parent->priv; + struct cftype *cft = of->kn->priv; + struct cgroup_subsys_state *css; + int ret; + + if (cft->write) + return cft->write(of, buf, nbytes, off); + + /* + * kernfs guarantees that a file isn't deleted with operations in + * flight, which means that the matching css is and stays alive and + * doesn't need to be pinned. The RCU locking is not necessary + * either. It's just for the convenience of using cgroup_css(). + */ + rcu_read_lock(); + css = cgroup_css(cgrp, cft->ss); + rcu_read_unlock(); + + if (cft->write_u64) { + unsigned long long v; + ret = kstrtoull(buf, 0, &v); + if (!ret) + ret = cft->write_u64(css, cft, v); + } else if (cft->write_s64) { + long long v; + ret = kstrtoll(buf, 0, &v); + if (!ret) + ret = cft->write_s64(css, cft, v); + } else { + ret = -EINVAL; + } + + return ret ?: nbytes; } -static int cgroup_setxattr(struct dentry *dentry, const char *name, - const void *val, size_t size, int flags) +static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos) { - if (!xattr_enabled(dentry)) - return -EOPNOTSUPP; - if (!is_valid_xattr(name)) - return -EINVAL; - return simple_xattr_set(__d_xattrs(dentry), name, val, size, flags); + return seq_cft(seq)->seq_start(seq, ppos); } -static int cgroup_removexattr(struct dentry *dentry, const char *name) +static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos) { - if (!xattr_enabled(dentry)) - return -EOPNOTSUPP; - if (!is_valid_xattr(name)) - return -EINVAL; - return simple_xattr_remove(__d_xattrs(dentry), name); + return seq_cft(seq)->seq_next(seq, v, ppos); } -static ssize_t cgroup_getxattr(struct dentry *dentry, const char *name, - void *buf, size_t size) +static void cgroup_seqfile_stop(struct seq_file *seq, void *v) { - if (!xattr_enabled(dentry)) - return -EOPNOTSUPP; - if (!is_valid_xattr(name)) - return -EINVAL; - return simple_xattr_get(__d_xattrs(dentry), name, buf, size); + seq_cft(seq)->seq_stop(seq, v); } -static ssize_t cgroup_listxattr(struct dentry *dentry, char *buf, size_t size) +static int cgroup_seqfile_show(struct seq_file *m, void *arg) { - if (!xattr_enabled(dentry)) - return -EOPNOTSUPP; - return simple_xattr_list(__d_xattrs(dentry), buf, size); -} + struct cftype *cft = seq_cft(m); + struct cgroup_subsys_state *css = seq_css(m); -static const struct file_operations cgroup_file_operations = { - .read = seq_read, - .write = cgroup_file_write, - .llseek = generic_file_llseek, - .open = cgroup_file_open, - .release = cgroup_file_release, -}; + if (cft->seq_show) + return cft->seq_show(m, arg); -static const struct inode_operations cgroup_file_inode_operations = { - .setxattr = cgroup_setxattr, - .getxattr = cgroup_getxattr, - .listxattr = cgroup_listxattr, - .removexattr = cgroup_removexattr, + if (cft->read_u64) + seq_printf(m, "%llu\n", cft->read_u64(css, cft)); + else if (cft->read_s64) + seq_printf(m, "%lld\n", cft->read_s64(css, cft)); + else + return -EINVAL; + return 0; +} + +static struct kernfs_ops cgroup_kf_single_ops = { + .atomic_write_len = PAGE_SIZE, + .write = cgroup_file_write, + .seq_show = cgroup_seqfile_show, }; -static const struct inode_operations cgroup_dir_inode_operations = { - .lookup = simple_lookup, - .mkdir = cgroup_mkdir, - .rmdir = cgroup_rmdir, - .rename = cgroup_rename, - .setxattr = cgroup_setxattr, - .getxattr = cgroup_getxattr, - .listxattr = cgroup_listxattr, - .removexattr = cgroup_removexattr, +static struct kernfs_ops cgroup_kf_ops = { + .atomic_write_len = PAGE_SIZE, + .write = cgroup_file_write, + .seq_start = cgroup_seqfile_start, + .seq_next = cgroup_seqfile_next, + .seq_stop = cgroup_seqfile_stop, + .seq_show = cgroup_seqfile_show, }; -static int cgroup_create_file(struct dentry *dentry, umode_t mode, - struct super_block *sb) +/* + * cgroup_rename - Only allow simple rename of directories in place. + */ +static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, + const char *new_name_str) { - struct inode *inode; + struct cgroup *cgrp = kn->priv; + int ret; - if (!dentry) - return -ENOENT; - if (dentry->d_inode) - return -EEXIST; + if (kernfs_type(kn) != KERNFS_DIR) + return -ENOTDIR; + if (kn->parent != new_parent) + return -EIO; - inode = cgroup_new_inode(mode, sb); - if (!inode) - return -ENOMEM; + /* + * This isn't a proper migration and its usefulness is very + * limited. Disallow if sane_behavior. + */ + if (cgroup_sane_behavior(cgrp)) + return -EPERM; - if (S_ISDIR(mode)) { - inode->i_op = &cgroup_dir_inode_operations; - inode->i_fop = &simple_dir_operations; + /* + * We're gonna grab cgroup_mutex which nests outside kernfs + * active_ref. kernfs_rename() doesn't require active_ref + * protection. Break them before grabbing cgroup_mutex. + */ + kernfs_break_active_protection(new_parent); + kernfs_break_active_protection(kn); - /* start off with i_nlink == 2 (for "." entry) */ - inc_nlink(inode); - inc_nlink(dentry->d_parent->d_inode); + mutex_lock(&cgroup_mutex); - /* - * Control reaches here with cgroup_mutex held. - * @inode->i_mutex should nest outside cgroup_mutex but we - * want to populate it immediately without releasing - * cgroup_mutex. As @inode isn't visible to anyone else - * yet, trylock will always succeed without affecting - * lockdep checks. - */ - WARN_ON_ONCE(!mutex_trylock(&inode->i_mutex)); - } else if (S_ISREG(mode)) { - inode->i_size = 0; - inode->i_fop = &cgroup_file_operations; - inode->i_op = &cgroup_file_inode_operations; - } - d_instantiate(dentry, inode); - dget(dentry); /* Extra count - pin the dentry in core */ - return 0; -} + ret = kernfs_rename(kn, new_parent, new_name_str); -/** - * cgroup_file_mode - deduce file mode of a control file - * @cft: the control file in question - * - * returns cft->mode if ->mode is not 0 - * returns S_IRUGO|S_IWUSR if it has both a read and a write handler - * returns S_IRUGO if it has only a read handler - * returns S_IWUSR if it has only a write hander - */ -static umode_t cgroup_file_mode(const struct cftype *cft) -{ - umode_t mode = 0; + mutex_unlock(&cgroup_mutex); - if (cft->mode) - return cft->mode; + kernfs_unbreak_active_protection(kn); + kernfs_unbreak_active_protection(new_parent); + return ret; +} - if (cft->read_u64 || cft->read_s64 || cft->seq_show) - mode |= S_IRUGO; +/* set uid and gid of cgroup dirs and files to that of the creator */ +static int cgroup_kn_set_ugid(struct kernfs_node *kn) +{ + struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID, + .ia_uid = current_fsuid(), + .ia_gid = current_fsgid(), }; - if (cft->write_u64 || cft->write_s64 || cft->write_string || - cft->trigger) - mode |= S_IWUSR; + if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) && + gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID)) + return 0; - return mode; + return kernfs_setattr(kn, &iattr); } static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) { - struct dentry *dir = cgrp->dentry; - struct cgroup *parent = __d_cgrp(dir); - struct dentry *dentry; - struct cfent *cfe; - int error; - umode_t mode; - char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; - - if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) && - !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) { - strcpy(name, cft->ss->name); - strcat(name, "."); - } - strcat(name, cft->name); - - BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex)); + char name[CGROUP_FILE_NAME_MAX]; + struct kernfs_node *kn; + struct lock_class_key *key = NULL; + int ret; - cfe = kzalloc(sizeof(*cfe), GFP_KERNEL); - if (!cfe) - return -ENOMEM; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + key = &cft->lockdep_key; +#endif + kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name), + cgroup_file_mode(cft), 0, cft->kf_ops, cft, + NULL, false, key); + if (IS_ERR(kn)) + return PTR_ERR(kn); - dentry = lookup_one_len(name, dir, strlen(name)); - if (IS_ERR(dentry)) { - error = PTR_ERR(dentry); - goto out; + ret = cgroup_kn_set_ugid(kn); + if (ret) { + kernfs_remove(kn); + return ret; } - cfe->type = (void *)cft; - cfe->dentry = dentry; - dentry->d_fsdata = cfe; - simple_xattrs_init(&cfe->xattrs); - - mode = cgroup_file_mode(cft); - error = cgroup_create_file(dentry, mode | S_IFREG, cgrp->root->sb); - if (!error) { - list_add_tail(&cfe->node, &parent->files); - cfe = NULL; - } - dput(dentry); -out: - kfree(cfe); - return error; + if (cft->seq_show == cgroup_populated_show) + cgrp->populated_kn = kn; + return 0; } /** @@ -2698,23 +2960,24 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], struct cftype *cft; int ret; - lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); lockdep_assert_held(&cgroup_mutex); for (cft = cfts; cft->name[0] != '\0'; cft++) { /* does cft->flags tell us to skip this file on @cgrp? */ + if ((cft->flags & CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp)) + continue; if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp)) continue; - if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent) + if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp)) continue; - if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent) + if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp)) continue; if (is_add) { ret = cgroup_add_file(cgrp, cft); if (ret) { - pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n", - cft->name, ret); + pr_warn("%s: failed to add %s, err=%d\n", + __func__, cft->name, ret); return ret; } } else { @@ -2724,115 +2987,92 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], return 0; } -static void cgroup_cfts_prepare(void) - __acquires(&cgroup_mutex) -{ - /* - * Thanks to the entanglement with vfs inode locking, we can't walk - * the existing cgroups under cgroup_mutex and create files. - * Instead, we use css_for_each_descendant_pre() and drop RCU read - * lock before calling cgroup_addrm_files(). - */ - mutex_lock(&cgroup_mutex); -} - -static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) - __releases(&cgroup_mutex) +static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add) { LIST_HEAD(pending); struct cgroup_subsys *ss = cfts[0].ss; - struct cgroup *root = &ss->root->top_cgroup; - struct super_block *sb = ss->root->sb; - struct dentry *prev = NULL; - struct inode *inode; + struct cgroup *root = &ss->root->cgrp; struct cgroup_subsys_state *css; - u64 update_before; int ret = 0; - /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */ - if (!cfts || ss->root == &cgroup_dummy_root || - !atomic_inc_not_zero(&sb->s_active)) { - mutex_unlock(&cgroup_mutex); - return 0; - } - - /* - * All cgroups which are created after we drop cgroup_mutex will - * have the updated set of files, so we only need to update the - * cgroups created before the current @cgroup_serial_nr_next. - */ - update_before = cgroup_serial_nr_next; - - mutex_unlock(&cgroup_mutex); + lockdep_assert_held(&cgroup_mutex); /* add/rm files for all cgroups created before */ - rcu_read_lock(); css_for_each_descendant_pre(css, cgroup_css(root, ss)) { struct cgroup *cgrp = css->cgroup; if (cgroup_is_dead(cgrp)) continue; - inode = cgrp->dentry->d_inode; - dget(cgrp->dentry); - rcu_read_unlock(); - - dput(prev); - prev = cgrp->dentry; - - mutex_lock(&inode->i_mutex); - mutex_lock(&cgroup_mutex); - if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp)) - ret = cgroup_addrm_files(cgrp, cfts, is_add); - mutex_unlock(&cgroup_mutex); - mutex_unlock(&inode->i_mutex); - - rcu_read_lock(); + ret = cgroup_addrm_files(cgrp, cfts, is_add); if (ret) break; } - rcu_read_unlock(); - dput(prev); - deactivate_super(sb); + + if (is_add && !ret) + kernfs_activate(root->kn); return ret; } -/** - * cgroup_add_cftypes - add an array of cftypes to a subsystem - * @ss: target cgroup subsystem - * @cfts: zero-length name terminated array of cftypes - * - * Register @cfts to @ss. Files described by @cfts are created for all - * existing cgroups to which @ss is attached and all future cgroups will - * have them too. This function can be called anytime whether @ss is - * attached or not. - * - * Returns 0 on successful registration, -errno on failure. Note that this - * function currently returns 0 as long as @cfts registration is successful - * even if some file creation attempts on existing cgroups fail. - */ -int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) +static void cgroup_exit_cftypes(struct cftype *cfts) { - struct cftype_set *set; struct cftype *cft; - int ret; - set = kzalloc(sizeof(*set), GFP_KERNEL); - if (!set) - return -ENOMEM; + for (cft = cfts; cft->name[0] != '\0'; cft++) { + /* free copy for custom atomic_write_len, see init_cftypes() */ + if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) + kfree(cft->kf_ops); + cft->kf_ops = NULL; + cft->ss = NULL; + } +} + +static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) +{ + struct cftype *cft; + + for (cft = cfts; cft->name[0] != '\0'; cft++) { + struct kernfs_ops *kf_ops; + + WARN_ON(cft->ss || cft->kf_ops); + + if (cft->seq_start) + kf_ops = &cgroup_kf_ops; + else + kf_ops = &cgroup_kf_single_ops; + + /* + * Ugh... if @cft wants a custom max_write_len, we need to + * make a copy of kf_ops to set its atomic_write_len. + */ + if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) { + kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL); + if (!kf_ops) { + cgroup_exit_cftypes(cfts); + return -ENOMEM; + } + kf_ops->atomic_write_len = cft->max_write_len; + } - for (cft = cfts; cft->name[0] != '\0'; cft++) + cft->kf_ops = kf_ops; cft->ss = ss; + } - cgroup_cfts_prepare(); - set->cfts = cfts; - list_add_tail(&set->node, &ss->cftsets); - ret = cgroup_cfts_commit(cfts, true); - if (ret) - cgroup_rm_cftypes(cfts); - return ret; + return 0; +} + +static int cgroup_rm_cftypes_locked(struct cftype *cfts) +{ + lockdep_assert_held(&cgroup_mutex); + + if (!cfts || !cfts[0].ss) + return -ENOENT; + + list_del(&cfts->node); + cgroup_apply_cftypes(cfts, false); + cgroup_exit_cftypes(cfts); + return 0; } -EXPORT_SYMBOL_GPL(cgroup_add_cftypes); /** * cgroup_rm_cftypes - remove an array of cftypes from a subsystem @@ -2847,24 +3087,51 @@ EXPORT_SYMBOL_GPL(cgroup_add_cftypes); */ int cgroup_rm_cftypes(struct cftype *cfts) { - struct cftype_set *set; + int ret; - if (!cfts || !cfts[0].ss) - return -ENOENT; + mutex_lock(&cgroup_mutex); + ret = cgroup_rm_cftypes_locked(cfts); + mutex_unlock(&cgroup_mutex); + return ret; +} - cgroup_cfts_prepare(); +/** + * cgroup_add_cftypes - add an array of cftypes to a subsystem + * @ss: target cgroup subsystem + * @cfts: zero-length name terminated array of cftypes + * + * Register @cfts to @ss. Files described by @cfts are created for all + * existing cgroups to which @ss is attached and all future cgroups will + * have them too. This function can be called anytime whether @ss is + * attached or not. + * + * Returns 0 on successful registration, -errno on failure. Note that this + * function currently returns 0 as long as @cfts registration is successful + * even if some file creation attempts on existing cgroups fail. + */ +int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) +{ + int ret; - list_for_each_entry(set, &cfts[0].ss->cftsets, node) { - if (set->cfts == cfts) { - list_del(&set->node); - kfree(set); - cgroup_cfts_commit(cfts, false); - return 0; - } - } + if (ss->disabled) + return 0; + + if (!cfts || cfts[0].name[0] == '\0') + return 0; + + ret = cgroup_init_cftypes(ss, cfts); + if (ret) + return ret; + + mutex_lock(&cgroup_mutex); + + list_add_tail(&cfts->node, &ss->cfts); + ret = cgroup_apply_cftypes(cfts, true); + if (ret) + cgroup_rm_cftypes_locked(cfts); - cgroup_cfts_commit(NULL, false); - return -ENOENT; + mutex_unlock(&cgroup_mutex); + return ret; } /** @@ -2873,107 +3140,80 @@ int cgroup_rm_cftypes(struct cftype *cfts) * * Return the number of tasks in the cgroup. */ -int cgroup_task_count(const struct cgroup *cgrp) +static int cgroup_task_count(const struct cgroup *cgrp) { int count = 0; struct cgrp_cset_link *link; - read_lock(&css_set_lock); + down_read(&css_set_rwsem); list_for_each_entry(link, &cgrp->cset_links, cset_link) count += atomic_read(&link->cset->refcount); - read_unlock(&css_set_lock); + up_read(&css_set_rwsem); return count; } -/* - * To reduce the fork() overhead for systems that are not actually using - * their cgroups capability, we don't maintain the lists running through - * each css_set to its tasks until we see the list actually used - in other - * words after the first call to css_task_iter_start(). - */ -static void cgroup_enable_task_cg_lists(void) -{ - struct task_struct *p, *g; - write_lock(&css_set_lock); - use_task_css_set_links = 1; - /* - * We need tasklist_lock because RCU is not safe against - * while_each_thread(). Besides, a forking task that has passed - * cgroup_post_fork() without seeing use_task_css_set_links = 1 - * is not guaranteed to have its child immediately visible in the - * tasklist if we walk through it with RCU. - */ - read_lock(&tasklist_lock); - do_each_thread(g, p) { - task_lock(p); - /* - * We should check if the process is exiting, otherwise - * it will race with cgroup_exit() in that the list - * entry won't be deleted though the process has exited. - */ - if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list)) - list_add(&p->cg_list, &task_css_set(p)->tasks); - task_unlock(p); - } while_each_thread(g, p); - read_unlock(&tasklist_lock); - write_unlock(&css_set_lock); -} - /** * css_next_child - find the next child of a given css - * @pos_css: the current position (%NULL to initiate traversal) - * @parent_css: css whose children to walk + * @pos: the current position (%NULL to initiate traversal) + * @parent: css whose children to walk * - * This function returns the next child of @parent_css and should be called + * This function returns the next child of @parent and should be called * under either cgroup_mutex or RCU read lock. The only requirement is - * that @parent_css and @pos_css are accessible. The next sibling is - * guaranteed to be returned regardless of their states. + * that @parent and @pos are accessible. The next sibling is guaranteed to + * be returned regardless of their states. + * + * If a subsystem synchronizes ->css_online() and the start of iteration, a + * css which finished ->css_online() is guaranteed to be visible in the + * future iterations and will stay visible until the last reference is put. + * A css which hasn't finished ->css_online() or already finished + * ->css_offline() may show up during traversal. It's each subsystem's + * responsibility to synchronize against on/offlining. */ -struct cgroup_subsys_state * -css_next_child(struct cgroup_subsys_state *pos_css, - struct cgroup_subsys_state *parent_css) +struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos, + struct cgroup_subsys_state *parent) { - struct cgroup *pos = pos_css ? pos_css->cgroup : NULL; - struct cgroup *cgrp = parent_css->cgroup; - struct cgroup *next; + struct cgroup_subsys_state *next; cgroup_assert_mutex_or_rcu_locked(); /* - * @pos could already have been removed. Once a cgroup is removed, - * its ->sibling.next is no longer updated when its next sibling - * changes. As CGRP_DEAD assertion is serialized and happens - * before the cgroup is taken off the ->sibling list, if we see it - * unasserted, it's guaranteed that the next sibling hasn't - * finished its grace period even if it's already removed, and thus - * safe to dereference from this RCU critical section. If - * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed - * to be visible as %true here. + * @pos could already have been unlinked from the sibling list. + * Once a cgroup is removed, its ->sibling.next is no longer + * updated when its next sibling changes. CSS_RELEASED is set when + * @pos is taken off list, at which time its next pointer is valid, + * and, as releases are serialized, the one pointed to by the next + * pointer is guaranteed to not have started release yet. This + * implies that if we observe !CSS_RELEASED on @pos in this RCU + * critical section, the one pointed to by its next pointer is + * guaranteed to not have finished its RCU grace period even if we + * have dropped rcu_read_lock() inbetween iterations. * - * If @pos is dead, its next pointer can't be dereferenced; - * however, as each cgroup is given a monotonically increasing - * unique serial number and always appended to the sibling list, - * the next one can be found by walking the parent's children until - * we see a cgroup with higher serial number than @pos's. While - * this path can be slower, it's taken only when either the current - * cgroup is removed or iteration and removal race. + * If @pos has CSS_RELEASED set, its next pointer can't be + * dereferenced; however, as each css is given a monotonically + * increasing unique serial number and always appended to the + * sibling list, the next one can be found by walking the parent's + * children until the first css with higher serial number than + * @pos's. While this path can be slower, it happens iff iteration + * races against release and the race window is very small. */ if (!pos) { - next = list_entry_rcu(cgrp->children.next, struct cgroup, sibling); - } else if (likely(!cgroup_is_dead(pos))) { - next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); + next = list_entry_rcu(parent->children.next, struct cgroup_subsys_state, sibling); + } else if (likely(!(pos->flags & CSS_RELEASED))) { + next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling); } else { - list_for_each_entry_rcu(next, &cgrp->children, sibling) + list_for_each_entry_rcu(next, &parent->children, sibling) if (next->serial_nr > pos->serial_nr) break; } - if (&next->sibling == &cgrp->children) - return NULL; - - return cgroup_css(next, parent_css->ss); + /* + * @next, if not pointing to the head, can be dereferenced and is + * the next sibling. + */ + if (&next->sibling != &parent->children) + return next; + return NULL; } -EXPORT_SYMBOL_GPL(css_next_child); /** * css_next_descendant_pre - find the next descendant for pre-order walk @@ -2988,6 +3228,13 @@ EXPORT_SYMBOL_GPL(css_next_child); * doesn't require the whole traversal to be contained in a single critical * section. This function will return the correct next descendant as long * as both @pos and @root are accessible and @pos is a descendant of @root. + * + * If a subsystem synchronizes ->css_online() and the start of iteration, a + * css which finished ->css_online() is guaranteed to be visible in the + * future iterations and will stay visible until the last reference is put. + * A css which hasn't finished ->css_online() or already finished + * ->css_offline() may show up during traversal. It's each subsystem's + * responsibility to synchronize against on/offlining. */ struct cgroup_subsys_state * css_next_descendant_pre(struct cgroup_subsys_state *pos, @@ -3008,15 +3255,14 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos, /* no child, visit my or the closest ancestor's next sibling */ while (pos != root) { - next = css_next_child(pos, css_parent(pos)); + next = css_next_child(pos, pos->parent); if (next) return next; - pos = css_parent(pos); + pos = pos->parent; } return NULL; } -EXPORT_SYMBOL_GPL(css_next_descendant_pre); /** * css_rightmost_descendant - return the rightmost descendant of a css @@ -3048,7 +3294,6 @@ css_rightmost_descendant(struct cgroup_subsys_state *pos) return last; } -EXPORT_SYMBOL_GPL(css_rightmost_descendant); static struct cgroup_subsys_state * css_leftmost_descendant(struct cgroup_subsys_state *pos) @@ -3077,6 +3322,13 @@ css_leftmost_descendant(struct cgroup_subsys_state *pos) * section. This function will return the correct next descendant as long * as both @pos and @cgroup are accessible and @pos is a descendant of * @cgroup. + * + * If a subsystem synchronizes ->css_online() and the start of iteration, a + * css which finished ->css_online() is guaranteed to be visible in the + * future iterations and will stay visible until the last reference is put. + * A css which hasn't finished ->css_online() or already finished + * ->css_offline() may show up during traversal. It's each subsystem's + * responsibility to synchronize against on/offlining. */ struct cgroup_subsys_state * css_next_descendant_post(struct cgroup_subsys_state *pos, @@ -3095,14 +3347,37 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, return NULL; /* if there's an unvisited sibling, visit its leftmost descendant */ - next = css_next_child(pos, css_parent(pos)); + next = css_next_child(pos, pos->parent); if (next) return css_leftmost_descendant(next); /* no sibling left, visit parent */ - return css_parent(pos); + return pos->parent; +} + +/** + * css_has_online_children - does a css have online children + * @css: the target css + * + * Returns %true if @css has any online children; otherwise, %false. This + * function can be called from any context but the caller is responsible + * for synchronizing against on/offlining as necessary. + */ +bool css_has_online_children(struct cgroup_subsys_state *css) +{ + struct cgroup_subsys_state *child; + bool ret = false; + + rcu_read_lock(); + css_for_each_child(child, css) { + if (child->flags & CSS_ONLINE) { + ret = true; + break; + } + } + rcu_read_unlock(); + return ret; } -EXPORT_SYMBOL_GPL(css_next_descendant_post); /** * css_advance_task_iter - advance a task itererator to the next css_set @@ -3112,22 +3387,36 @@ EXPORT_SYMBOL_GPL(css_next_descendant_post); */ static void css_advance_task_iter(struct css_task_iter *it) { - struct list_head *l = it->cset_link; + struct list_head *l = it->cset_pos; struct cgrp_cset_link *link; struct css_set *cset; /* Advance to the next non-empty css_set */ do { l = l->next; - if (l == &it->origin_css->cgroup->cset_links) { - it->cset_link = NULL; + if (l == it->cset_head) { + it->cset_pos = NULL; return; } - link = list_entry(l, struct cgrp_cset_link, cset_link); - cset = link->cset; - } while (list_empty(&cset->tasks)); - it->cset_link = l; - it->task = cset->tasks.next; + + if (it->ss) { + cset = container_of(l, struct css_set, + e_cset_node[it->ss->id]); + } else { + link = list_entry(l, struct cgrp_cset_link, cset_link); + cset = link->cset; + } + } while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks)); + + it->cset_pos = l; + + if (!list_empty(&cset->tasks)) + it->task_pos = cset->tasks.next; + else + it->task_pos = cset->mg_tasks.next; + + it->tasks_head = &cset->tasks; + it->mg_tasks_head = &cset->mg_tasks; } /** @@ -3146,20 +3435,21 @@ static void css_advance_task_iter(struct css_task_iter *it) */ void css_task_iter_start(struct cgroup_subsys_state *css, struct css_task_iter *it) - __acquires(css_set_lock) + __acquires(css_set_rwsem) { - /* - * The first time anyone tries to iterate across a css, we need to - * enable the list linking each css_set to its tasks, and fix up - * all existing tasks. - */ - if (!use_task_css_set_links) - cgroup_enable_task_cg_lists(); + /* no one should try to iterate before mounting cgroups */ + WARN_ON_ONCE(!use_task_css_set_links); + + down_read(&css_set_rwsem); - read_lock(&css_set_lock); + it->ss = css->ss; + + if (it->ss) + it->cset_pos = &css->cgroup->e_csets[css->ss->id]; + else + it->cset_pos = &css->cgroup->cset_links; - it->origin_css = css; - it->cset_link = &css->cgroup->cset_links; + it->cset_head = it->cset_pos; css_advance_task_iter(it); } @@ -3175,25 +3465,28 @@ void css_task_iter_start(struct cgroup_subsys_state *css, struct task_struct *css_task_iter_next(struct css_task_iter *it) { struct task_struct *res; - struct list_head *l = it->task; - struct cgrp_cset_link *link; + struct list_head *l = it->task_pos; /* If the iterator cg is NULL, we have no tasks */ - if (!it->cset_link) + if (!it->cset_pos) return NULL; res = list_entry(l, struct task_struct, cg_list); - /* Advance iterator to find next entry */ + + /* + * Advance iterator to find next entry. cset->tasks is consumed + * first and then ->mg_tasks. After ->mg_tasks, we move onto the + * next cset. + */ l = l->next; - link = list_entry(it->cset_link, struct cgrp_cset_link, cset_link); - if (l == &link->cset->tasks) { - /* - * We reached the end of this task list - move on to the - * next cgrp_cset_link. - */ + + if (l == it->tasks_head) + l = it->mg_tasks_head->next; + + if (l == it->mg_tasks_head) css_advance_task_iter(it); - } else { - it->task = l; - } + else + it->task_pos = l; + return res; } @@ -3204,191 +3497,62 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it) * Finish task iteration started by css_task_iter_start(). */ void css_task_iter_end(struct css_task_iter *it) - __releases(css_set_lock) + __releases(css_set_rwsem) { - read_unlock(&css_set_lock); -} - -static inline int started_after_time(struct task_struct *t1, - struct timespec *time, - struct task_struct *t2) -{ - int start_diff = timespec_compare(&t1->start_time, time); - if (start_diff > 0) { - return 1; - } else if (start_diff < 0) { - return 0; - } else { - /* - * Arbitrarily, if two processes started at the same - * time, we'll say that the lower pointer value - * started first. Note that t2 may have exited by now - * so this may not be a valid pointer any longer, but - * that's fine - it still serves to distinguish - * between two tasks started (effectively) simultaneously. - */ - return t1 > t2; - } -} - -/* - * This function is a callback from heap_insert() and is used to order - * the heap. - * In this case we order the heap in descending task start time. - */ -static inline int started_after(void *p1, void *p2) -{ - struct task_struct *t1 = p1; - struct task_struct *t2 = p2; - return started_after_time(t1, &t2->start_time, t2); + up_read(&css_set_rwsem); } /** - * css_scan_tasks - iterate though all the tasks in a css - * @css: the css to iterate tasks of - * @test: optional test callback - * @process: process callback - * @data: data passed to @test and @process - * @heap: optional pre-allocated heap used for task iteration - * - * Iterate through all the tasks in @css, calling @test for each, and if it - * returns %true, call @process for it also. - * - * @test may be NULL, meaning always true (select all tasks), which - * effectively duplicates css_task_iter_{start,next,end}() but does not - * lock css_set_lock for the call to @process. - * - * It is guaranteed that @process will act on every task that is a member - * of @css for the duration of this call. This function may or may not - * call @process for tasks that exit or move to a different css during the - * call, or are forked or move into the css during the call. - * - * Note that @test may be called with locks held, and may in some - * situations be called multiple times for the same task, so it should be - * cheap. + * cgroup_trasnsfer_tasks - move tasks from one cgroup to another + * @to: cgroup to which the tasks will be moved + * @from: cgroup in which the tasks currently reside * - * If @heap is non-NULL, a heap has been pre-allocated and will be used for - * heap operations (and its "gt" member will be overwritten), else a - * temporary heap will be used (allocation of which may cause this function - * to fail). + * Locking rules between cgroup_post_fork() and the migration path + * guarantee that, if a task is forking while being migrated, the new child + * is guaranteed to be either visible in the source cgroup after the + * parent's migration is complete or put into the target cgroup. No task + * can slip out of migration through forking. */ -int css_scan_tasks(struct cgroup_subsys_state *css, - bool (*test)(struct task_struct *, void *), - void (*process)(struct task_struct *, void *), - void *data, struct ptr_heap *heap) +int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) { - int retval, i; + LIST_HEAD(preloaded_csets); + struct cgrp_cset_link *link; struct css_task_iter it; - struct task_struct *p, *dropped; - /* Never dereference latest_task, since it's not refcounted */ - struct task_struct *latest_task = NULL; - struct ptr_heap tmp_heap; - struct timespec latest_time = { 0, 0 }; - - if (heap) { - /* The caller supplied our heap and pre-allocated its memory */ - heap->gt = &started_after; - } else { - /* We need to allocate our own heap memory */ - heap = &tmp_heap; - retval = heap_init(heap, PAGE_SIZE, GFP_KERNEL, &started_after); - if (retval) - /* cannot allocate the heap */ - return retval; - } + struct task_struct *task; + int ret; - again: - /* - * Scan tasks in the css, using the @test callback to determine - * which are of interest, and invoking @process callback on the - * ones which need an update. Since we don't want to hold any - * locks during the task updates, gather tasks to be processed in a - * heap structure. The heap is sorted by descending task start - * time. If the statically-sized heap fills up, we overflow tasks - * that started later, and in future iterations only consider tasks - * that started after the latest task in the previous pass. This - * guarantees forward progress and that we don't miss any tasks. - */ - heap->size = 0; - css_task_iter_start(css, &it); - while ((p = css_task_iter_next(&it))) { - /* - * Only affect tasks that qualify per the caller's callback, - * if he provided one - */ - if (test && !test(p, data)) - continue; - /* - * Only process tasks that started after the last task - * we processed - */ - if (!started_after_time(p, &latest_time, latest_task)) - continue; - dropped = heap_insert(heap, p); - if (dropped == NULL) { - /* - * The new task was inserted; the heap wasn't - * previously full - */ - get_task_struct(p); - } else if (dropped != p) { - /* - * The new task was inserted, and pushed out a - * different task - */ - get_task_struct(p); - put_task_struct(dropped); - } - /* - * Else the new task was newer than anything already in - * the heap and wasn't inserted - */ - } - css_task_iter_end(&it); + mutex_lock(&cgroup_mutex); - if (heap->size) { - for (i = 0; i < heap->size; i++) { - struct task_struct *q = heap->ptrs[i]; - if (i == 0) { - latest_time = q->start_time; - latest_task = q; - } - /* Process the task per the caller's callback */ - process(q, data); - put_task_struct(q); - } - /* - * If we had to process any tasks at all, scan again - * in case some of them were in the middle of forking - * children that didn't get processed. - * Not the most efficient way to do it, but it avoids - * having to take callback_mutex in the fork path - */ - goto again; - } - if (heap == &tmp_heap) - heap_free(&tmp_heap); - return 0; -} + /* all tasks in @from are being moved, all csets are source */ + down_read(&css_set_rwsem); + list_for_each_entry(link, &from->cset_links, cset_link) + cgroup_migrate_add_src(link->cset, to, &preloaded_csets); + up_read(&css_set_rwsem); -static void cgroup_transfer_one_task(struct task_struct *task, void *data) -{ - struct cgroup *new_cgroup = data; + ret = cgroup_migrate_prepare_dst(to, &preloaded_csets); + if (ret) + goto out_err; - mutex_lock(&cgroup_mutex); - cgroup_attach_task(new_cgroup, task, false); + /* + * Migrate tasks one-by-one until @form is empty. This fails iff + * ->can_attach() fails. + */ + do { + css_task_iter_start(&from->self, &it); + task = css_task_iter_next(&it); + if (task) + get_task_struct(task); + css_task_iter_end(&it); + + if (task) { + ret = cgroup_migrate(to, task, false); + put_task_struct(task); + } + } while (task && !ret); +out_err: + cgroup_migrate_finish(&preloaded_csets); mutex_unlock(&cgroup_mutex); -} - -/** - * cgroup_trasnsfer_tasks - move tasks from one cgroup to another - * @to: cgroup to which the tasks will be moved - * @from: cgroup in which the tasks currently reside - */ -int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) -{ - return css_scan_tasks(&from->dummy_css, NULL, cgroup_transfer_one_task, - to, NULL); + return ret; } /* @@ -3639,7 +3803,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, if (!array) return -ENOMEM; /* now, populate the array */ - css_task_iter_start(&cgrp->dummy_css, &it); + css_task_iter_start(&cgrp->self, &it); while ((tsk = css_task_iter_next(&it))) { if (unlikely(n == length)) break; @@ -3687,23 +3851,33 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, */ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) { - int ret = -EINVAL; + struct kernfs_node *kn = kernfs_node_from_dentry(dentry); struct cgroup *cgrp; struct css_task_iter it; struct task_struct *tsk; + /* it should be kernfs_node belonging to cgroupfs and is a directory */ + if (dentry->d_sb->s_type != &cgroup_fs_type || !kn || + kernfs_type(kn) != KERNFS_DIR) + return -EINVAL; + + mutex_lock(&cgroup_mutex); + /* - * Validate dentry by checking the superblock operations, - * and make sure it's a directory. + * We aren't being called from kernfs and there's no guarantee on + * @kn->priv's validity. For this and css_tryget_online_from_dir(), + * @kn->priv is RCU safe. Let's do the RCU dancing. */ - if (dentry->d_sb->s_op != &cgroup_ops || - !S_ISDIR(dentry->d_inode->i_mode)) - goto err; - - ret = 0; - cgrp = dentry->d_fsdata; + rcu_read_lock(); + cgrp = rcu_dereference(kn->priv); + if (!cgrp || cgroup_is_dead(cgrp)) { + rcu_read_unlock(); + mutex_unlock(&cgroup_mutex); + return -ENOENT; + } + rcu_read_unlock(); - css_task_iter_start(&cgrp->dummy_css, &it); + css_task_iter_start(&cgrp->self, &it); while ((tsk = css_task_iter_next(&it))) { switch (tsk->state) { case TASK_RUNNING: @@ -3726,8 +3900,8 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) } css_task_iter_end(&it); -err: - return ret; + mutex_unlock(&cgroup_mutex); + return 0; } @@ -3745,7 +3919,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) * after a seek to the start). Use a binary-search to find the * next pid to display, if any */ - struct cgroup_open_file *of = s->private; + struct kernfs_open_file *of = s->private; struct cgroup *cgrp = seq_css(s)->cgroup; struct cgroup_pidlist *l; enum cgroup_filetype type = seq_cft(s)->private; @@ -3800,7 +3974,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) static void cgroup_pidlist_stop(struct seq_file *s, void *v) { - struct cgroup_open_file *of = s->private; + struct kernfs_open_file *of = s->private; struct cgroup_pidlist *l = of->priv; if (l) @@ -3811,7 +3985,7 @@ static void cgroup_pidlist_stop(struct seq_file *s, void *v) static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) { - struct cgroup_open_file *of = s->private; + struct kernfs_open_file *of = s->private; struct cgroup_pidlist *l = of->priv; pid_t *p = v; pid_t *end = l->list + l->length; @@ -3833,17 +4007,6 @@ static int cgroup_pidlist_show(struct seq_file *s, void *v) return seq_printf(s, "%d\n", *(int *)v); } -/* - * seq_operations functions for iterating on pidlists through seq_file - - * independent of whether it's tasks or procs - */ -static const struct seq_operations cgroup_pidlist_seq_operations = { - .start = cgroup_pidlist_start, - .stop = cgroup_pidlist_stop, - .next = cgroup_pidlist_next, - .show = cgroup_pidlist_show, -}; - static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, struct cftype *cft) { @@ -3861,23 +4024,6 @@ static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css, return 0; } -/* - * When dput() is called asynchronously, if umount has been done and - * then deactivate_super() in cgroup_free_fn() kills the superblock, - * there's a small window that vfs will see the root dentry with non-zero - * refcnt and trigger BUG(). - * - * That's why we hold a reference before dput() and drop it right after. - */ -static void cgroup_dput(struct cgroup *cgrp) -{ - struct super_block *sb = cgrp->root->sb; - - atomic_inc(&sb->s_active); - dput(cgrp->dentry); - deactivate_super(sb); -} - static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css, struct cftype *cft) { @@ -3902,7 +4048,7 @@ static struct cftype cgroup_base_files[] = { .seq_stop = cgroup_pidlist_stop, .seq_show = cgroup_pidlist_show, .private = CGROUP_FILE_PROCS, - .write_u64 = cgroup_procs_write, + .write = cgroup_procs_write, .mode = S_IRUGO | S_IWUSR, }, { @@ -3916,6 +4062,27 @@ static struct cftype cgroup_base_files[] = { .flags = CFTYPE_ONLY_ON_ROOT, .seq_show = cgroup_sane_behavior_show, }, + { + .name = "cgroup.controllers", + .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_ONLY_ON_ROOT, + .seq_show = cgroup_root_controllers_show, + }, + { + .name = "cgroup.controllers", + .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT, + .seq_show = cgroup_controllers_show, + }, + { + .name = "cgroup.subtree_control", + .flags = CFTYPE_ONLY_ON_DFL, + .seq_show = cgroup_subtree_control_show, + .write = cgroup_subtree_control_write, + }, + { + .name = "cgroup.populated", + .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT, + .seq_show = cgroup_populated_show, + }, /* * Historical crazy stuff. These don't have "cgroup." prefix and @@ -3930,7 +4097,7 @@ static struct cftype cgroup_base_files[] = { .seq_stop = cgroup_pidlist_stop, .seq_show = cgroup_pidlist_show, .private = CGROUP_FILE_TASKS, - .write_u64 = cgroup_tasks_write, + .write = cgroup_tasks_write, .mode = S_IRUGO | S_IWUSR, }, { @@ -3943,8 +4110,8 @@ static struct cftype cgroup_base_files[] = { .name = "release_agent", .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT, .seq_show = cgroup_release_agent_show, - .write_string = cgroup_release_agent_write, - .max_write_len = PATH_MAX, + .write = cgroup_release_agent_write, + .max_write_len = PATH_MAX - 1, }, { } /* terminate */ }; @@ -3956,20 +4123,20 @@ static struct cftype cgroup_base_files[] = { * * On failure, no file is added. */ -static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask) +static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask) { struct cgroup_subsys *ss; int i, ret = 0; /* process cftsets of each subsystem */ for_each_subsys(ss, i) { - struct cftype_set *set; + struct cftype *cfts; - if (!test_bit(i, &subsys_mask)) + if (!(subsys_mask & (1 << i))) continue; - list_for_each_entry(set, &ss->cftsets, node) { - ret = cgroup_addrm_files(cgrp, set->cfts, true); + list_for_each_entry(cfts, &ss->cfts, node) { + ret = cgroup_addrm_files(cgrp, cfts, true); if (ret < 0) goto err; } @@ -3987,9 +4154,9 @@ err: * Implemented in kill_css(). * * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs - * and thus css_tryget() is guaranteed to fail, the css can be offlined - * by invoking offline_css(). After offlining, the base ref is put. - * Implemented in css_killed_work_fn(). + * and thus css_tryget_online() is guaranteed to fail, the css can be + * offlined by invoking offline_css(). After offlining, the base ref is + * put. Implemented in css_killed_work_fn(). * * 3. When the percpu_ref reaches zero, the only possible remaining * accessors are inside RCU read sections. css_release() schedules the @@ -4008,11 +4175,37 @@ static void css_free_work_fn(struct work_struct *work) container_of(work, struct cgroup_subsys_state, destroy_work); struct cgroup *cgrp = css->cgroup; - if (css->parent) - css_put(css->parent); + if (css->ss) { + /* css free path */ + if (css->parent) + css_put(css->parent); - css->ss->css_free(css); - cgroup_dput(cgrp); + css->ss->css_free(css); + cgroup_put(cgrp); + } else { + /* cgroup free path */ + atomic_dec(&cgrp->root->nr_cgrps); + cgroup_pidlist_destroy_all(cgrp); + + if (cgroup_parent(cgrp)) { + /* + * We get a ref to the parent, and put the ref when + * this cgroup is being freed, so it's guaranteed + * that the parent won't be destroyed before its + * children. + */ + cgroup_put(cgroup_parent(cgrp)); + kernfs_put(cgrp->kn); + kfree(cgrp); + } else { + /* + * This is root cgroup's refcnt reaching zero, + * which indicates that the root should be + * released. + */ + cgroup_destroy_root(cgrp->root); + } + } } static void css_free_rcu_fn(struct rcu_head *rcu_head) @@ -4020,34 +4213,63 @@ static void css_free_rcu_fn(struct rcu_head *rcu_head) struct cgroup_subsys_state *css = container_of(rcu_head, struct cgroup_subsys_state, rcu_head); - /* - * css holds an extra ref to @cgrp->dentry which is put on the last - * css_put(). dput() requires process context which we don't have. - */ INIT_WORK(&css->destroy_work, css_free_work_fn); queue_work(cgroup_destroy_wq, &css->destroy_work); } +static void css_release_work_fn(struct work_struct *work) +{ + struct cgroup_subsys_state *css = + container_of(work, struct cgroup_subsys_state, destroy_work); + struct cgroup_subsys *ss = css->ss; + struct cgroup *cgrp = css->cgroup; + + mutex_lock(&cgroup_mutex); + + css->flags |= CSS_RELEASED; + list_del_rcu(&css->sibling); + + if (ss) { + /* css release path */ + cgroup_idr_remove(&ss->css_idr, css->id); + } else { + /* cgroup release path */ + cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); + cgrp->id = -1; + } + + mutex_unlock(&cgroup_mutex); + + call_rcu(&css->rcu_head, css_free_rcu_fn); +} + static void css_release(struct percpu_ref *ref) { struct cgroup_subsys_state *css = container_of(ref, struct cgroup_subsys_state, refcnt); - rcu_assign_pointer(css->cgroup->subsys[css->ss->subsys_id], NULL); - call_rcu(&css->rcu_head, css_free_rcu_fn); + INIT_WORK(&css->destroy_work, css_release_work_fn); + queue_work(cgroup_destroy_wq, &css->destroy_work); } -static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss, - struct cgroup *cgrp) +static void init_and_link_css(struct cgroup_subsys_state *css, + struct cgroup_subsys *ss, struct cgroup *cgrp) { + lockdep_assert_held(&cgroup_mutex); + + cgroup_get(cgrp); + + memset(css, 0, sizeof(*css)); css->cgroup = cgrp; css->ss = ss; - css->flags = 0; + INIT_LIST_HEAD(&css->sibling); + INIT_LIST_HEAD(&css->children); + css->serial_nr = css_serial_nr_next++; - if (cgrp->parent) - css->parent = cgroup_css(cgrp->parent, ss); - else - css->flags |= CSS_ROOT; + if (cgroup_parent(cgrp)) { + css->parent = cgroup_css(cgroup_parent(cgrp), ss); + css_get(css->parent); + } BUG_ON(cgroup_css(cgrp, ss)); } @@ -4064,8 +4286,7 @@ static int online_css(struct cgroup_subsys_state *css) ret = ss->css_online(css); if (!ret) { css->flags |= CSS_ONLINE; - css->cgroup->nr_css++; - rcu_assign_pointer(css->cgroup->subsys[ss->subsys_id], css); + rcu_assign_pointer(css->cgroup->subsys[ss->id], css); } return ret; } @@ -4084,8 +4305,9 @@ static void offline_css(struct cgroup_subsys_state *css) ss->css_offline(css); css->flags &= ~CSS_ONLINE; - css->cgroup->nr_css--; - RCU_INIT_POINTER(css->cgroup->subsys[ss->subsys_id], css); + RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL); + + wake_up_all(&css->cgroup->offline_waitq); } /** @@ -4099,114 +4321,102 @@ static void offline_css(struct cgroup_subsys_state *css) */ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) { - struct cgroup *parent = cgrp->parent; + struct cgroup *parent = cgroup_parent(cgrp); + struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss); struct cgroup_subsys_state *css; int err; - lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); lockdep_assert_held(&cgroup_mutex); - css = ss->css_alloc(cgroup_css(parent, ss)); + css = ss->css_alloc(parent_css); if (IS_ERR(css)) return PTR_ERR(css); + init_and_link_css(css, ss, cgrp); + err = percpu_ref_init(&css->refcnt, css_release); if (err) - goto err_free; + goto err_free_css; - init_css(css, ss, cgrp); + err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_NOWAIT); + if (err < 0) + goto err_free_percpu_ref; + css->id = err; - err = cgroup_populate_dir(cgrp, 1 << ss->subsys_id); + err = cgroup_populate_dir(cgrp, 1 << ss->id); if (err) - goto err_free; + goto err_free_id; + + /* @css is ready to be brought online now, make it visible */ + list_add_tail_rcu(&css->sibling, &parent_css->children); + cgroup_idr_replace(&ss->css_idr, css, css->id); err = online_css(css); if (err) - goto err_free; - - dget(cgrp->dentry); - css_get(css->parent); + goto err_list_del; if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && - parent->parent) { - pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", - current->comm, current->pid, ss->name); + cgroup_parent(parent)) { + pr_warn("%s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", + current->comm, current->pid, ss->name); if (!strcmp(ss->name, "memory")) - pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n"); + pr_warn("\"memory\" requires setting use_hierarchy to 1 on the root\n"); ss->warned_broken_hierarchy = true; } return 0; -err_free: +err_list_del: + list_del_rcu(&css->sibling); + cgroup_clear_dir(css->cgroup, 1 << css->ss->id); +err_free_id: + cgroup_idr_remove(&ss->css_idr, css->id); +err_free_percpu_ref: percpu_ref_cancel_init(&css->refcnt); - ss->css_free(css); +err_free_css: + call_rcu(&css->rcu_head, css_free_rcu_fn); return err; } -/* - * cgroup_create - create a cgroup - * @parent: cgroup that will be parent of the new cgroup - * @dentry: dentry of the new cgroup - * @mode: mode to set on new inode - * - * Must be called with the mutex on the parent inode held - */ -static long cgroup_create(struct cgroup *parent, struct dentry *dentry, - umode_t mode) +static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, + umode_t mode) { - struct cgroup *cgrp; - struct cgroup_name *name; - struct cgroupfs_root *root = parent->root; - int ssid, err = 0; + struct cgroup *parent, *cgrp; + struct cgroup_root *root; struct cgroup_subsys *ss; - struct super_block *sb = root->sb; + struct kernfs_node *kn; + int ssid, ret; + + parent = cgroup_kn_lock_live(parent_kn); + if (!parent) + return -ENODEV; + root = parent->root; /* allocate the cgroup and its ID, 0 is reserved for the root */ cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); - if (!cgrp) - return -ENOMEM; + if (!cgrp) { + ret = -ENOMEM; + goto out_unlock; + } - name = cgroup_alloc_name(dentry); - if (!name) - goto err_free_cgrp; - rcu_assign_pointer(cgrp->name, name); + ret = percpu_ref_init(&cgrp->self.refcnt, css_release); + if (ret) + goto out_free_cgrp; /* * Temporarily set the pointer to NULL, so idr_find() won't return * a half-baked cgroup. */ - cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL); - if (cgrp->id < 0) - goto err_free_name; - - /* - * Only live parents can have children. Note that the liveliness - * check isn't strictly necessary because cgroup_mkdir() and - * cgroup_rmdir() are fully synchronized by i_mutex; however, do it - * anyway so that locking is contained inside cgroup proper and we - * don't get nasty surprises if we ever grow another caller. - */ - if (!cgroup_lock_live_group(parent)) { - err = -ENODEV; - goto err_free_id; + cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_NOWAIT); + if (cgrp->id < 0) { + ret = -ENOMEM; + goto out_cancel_ref; } - /* Grab a reference on the superblock so the hierarchy doesn't - * get deleted on unmount if there are child cgroups. This - * can be done outside cgroup_mutex, since the sb can't - * disappear while someone has an open control file on the - * fs */ - atomic_inc(&sb->s_active); - init_cgroup_housekeeping(cgrp); - dentry->d_fsdata = cgrp; - cgrp->dentry = dentry; - - cgrp->parent = parent; - cgrp->dummy_css.parent = &parent->dummy_css; - cgrp->root = parent->root; + cgrp->self.parent = &parent->self; + cgrp->root = root; if (notify_on_release(parent)) set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); @@ -4214,111 +4424,91 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags)) set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); + /* create the directory */ + kn = kernfs_create_dir(parent->kn, name, mode, cgrp); + if (IS_ERR(kn)) { + ret = PTR_ERR(kn); + goto out_free_id; + } + cgrp->kn = kn; + /* - * Create directory. cgroup_create_file() returns with the new - * directory locked on success so that it can be populated without - * dropping cgroup_mutex. + * This extra ref will be put in cgroup_free_fn() and guarantees + * that @cgrp->kn is always accessible. */ - err = cgroup_create_file(dentry, S_IFDIR | mode, sb); - if (err < 0) - goto err_unlock; - lockdep_assert_held(&dentry->d_inode->i_mutex); + kernfs_get(kn); - cgrp->serial_nr = cgroup_serial_nr_next++; + cgrp->self.serial_nr = css_serial_nr_next++; /* allocation complete, commit to creation */ - list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); - root->number_of_cgroups++; - - /* hold a ref to the parent's dentry */ - dget(parent->dentry); + list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children); + atomic_inc(&root->nr_cgrps); + cgroup_get(parent); /* * @cgrp is now fully operational. If something fails after this * point, it'll be released via the normal destruction path. */ - idr_replace(&root->cgroup_idr, cgrp, cgrp->id); + cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id); - err = cgroup_addrm_files(cgrp, cgroup_base_files, true); - if (err) - goto err_destroy; + ret = cgroup_kn_set_ugid(kn); + if (ret) + goto out_destroy; + + ret = cgroup_addrm_files(cgrp, cgroup_base_files, true); + if (ret) + goto out_destroy; /* let's create and online css's */ for_each_subsys(ss, ssid) { - if (root->subsys_mask & (1 << ssid)) { - err = create_css(cgrp, ss); - if (err) - goto err_destroy; + if (parent->child_subsys_mask & (1 << ssid)) { + ret = create_css(cgrp, ss); + if (ret) + goto out_destroy; } } - mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgrp->dentry->d_inode->i_mutex); + /* + * On the default hierarchy, a child doesn't automatically inherit + * child_subsys_mask from the parent. Each is configured manually. + */ + if (!cgroup_on_dfl(cgrp)) + cgrp->child_subsys_mask = parent->child_subsys_mask; - return 0; + kernfs_activate(kn); -err_unlock: - mutex_unlock(&cgroup_mutex); - /* Release the reference count that we took on the superblock */ - deactivate_super(sb); -err_free_id: - idr_remove(&root->cgroup_idr, cgrp->id); -err_free_name: - kfree(rcu_dereference_raw(cgrp->name)); -err_free_cgrp: + ret = 0; + goto out_unlock; + +out_free_id: + cgroup_idr_remove(&root->cgroup_idr, cgrp->id); +out_cancel_ref: + percpu_ref_cancel_init(&cgrp->self.refcnt); +out_free_cgrp: kfree(cgrp); - return err; +out_unlock: + cgroup_kn_unlock(parent_kn); + return ret; -err_destroy: +out_destroy: cgroup_destroy_locked(cgrp); - mutex_unlock(&cgroup_mutex); - mutex_unlock(&dentry->d_inode->i_mutex); - return err; -} - -static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) -{ - struct cgroup *c_parent = dentry->d_parent->d_fsdata; - - /* the vfs holds inode->i_mutex already */ - return cgroup_create(c_parent, dentry, mode | S_IFDIR); + goto out_unlock; } /* * This is called when the refcnt of a css is confirmed to be killed. - * css_tryget() is now guaranteed to fail. + * css_tryget_online() is now guaranteed to fail. Tell the subsystem to + * initate destruction and put the css ref from kill_css(). */ static void css_killed_work_fn(struct work_struct *work) { struct cgroup_subsys_state *css = container_of(work, struct cgroup_subsys_state, destroy_work); - struct cgroup *cgrp = css->cgroup; mutex_lock(&cgroup_mutex); - - /* - * css_tryget() is guaranteed to fail now. Tell subsystems to - * initate destruction. - */ offline_css(css); - - /* - * If @cgrp is marked dead, it's waiting for refs of all css's to - * be disabled before proceeding to the second phase of cgroup - * destruction. If we are the last one, kick it off. - */ - if (!cgrp->nr_css && cgroup_is_dead(cgrp)) - cgroup_destroy_css_killed(cgrp); - mutex_unlock(&cgroup_mutex); - /* - * Put the css refs from kill_css(). Each css holds an extra - * reference to the cgroup's dentry and cgroup removal proceeds - * regardless of css refs. On the last put of each css, whenever - * that may be, the extra dentry ref is put so that dentry - * destruction happens only after all css's are released. - */ css_put(css); } @@ -4338,12 +4528,18 @@ static void css_killed_ref_fn(struct percpu_ref *ref) * * This function initiates destruction of @css by removing cgroup interface * files and putting its base reference. ->css_offline() will be invoked - * asynchronously once css_tryget() is guaranteed to fail and when the - * reference count reaches zero, @css will be released. + * asynchronously once css_tryget_online() is guaranteed to fail and when + * the reference count reaches zero, @css will be released. */ static void kill_css(struct cgroup_subsys_state *css) { - cgroup_clear_dir(css->cgroup, 1 << css->ss->subsys_id); + lockdep_assert_held(&cgroup_mutex); + + /* + * This must happen before css is disassociated with its cgroup. + * See seq_css() for details. + */ + cgroup_clear_dir(css->cgroup, 1 << css->ss->id); /* * Killing would put the base ref, but we need to keep it alive @@ -4354,7 +4550,7 @@ static void kill_css(struct cgroup_subsys_state *css) /* * cgroup core guarantees that, by the time ->css_offline() is * invoked, no new css reference will be given out via - * css_tryget(). We can't simply call percpu_ref_kill() and + * css_tryget_online(). We can't simply call percpu_ref_kill() and * proceed to offlining css's because percpu_ref_kill() doesn't * guarantee that the ref is seen as killed on all CPUs on return. * @@ -4370,9 +4566,9 @@ static void kill_css(struct cgroup_subsys_state *css) * * css's make use of percpu refcnts whose killing latency shouldn't be * exposed to userland and are RCU protected. Also, cgroup core needs to - * guarantee that css_tryget() won't succeed by the time ->css_offline() is - * invoked. To satisfy all the requirements, destruction is implemented in - * the following two steps. + * guarantee that css_tryget_online() won't succeed by the time + * ->css_offline() is invoked. To satisfy all the requirements, + * destruction is implemented in the following two steps. * * s1. Verify @cgrp can be destroyed and mark it dying. Remove all * userland visible parts and start killing the percpu refcnts of @@ -4391,141 +4587,98 @@ static void kill_css(struct cgroup_subsys_state *css) static int cgroup_destroy_locked(struct cgroup *cgrp) __releases(&cgroup_mutex) __acquires(&cgroup_mutex) { - struct dentry *d = cgrp->dentry; struct cgroup_subsys_state *css; - struct cgroup *child; bool empty; int ssid; - lockdep_assert_held(&d->d_inode->i_mutex); lockdep_assert_held(&cgroup_mutex); /* - * css_set_lock synchronizes access to ->cset_links and prevents - * @cgrp from being removed while __put_css_set() is in progress. + * css_set_rwsem synchronizes access to ->cset_links and prevents + * @cgrp from being removed while put_css_set() is in progress. */ - read_lock(&css_set_lock); + down_read(&css_set_rwsem); empty = list_empty(&cgrp->cset_links); - read_unlock(&css_set_lock); + up_read(&css_set_rwsem); if (!empty) return -EBUSY; /* - * Make sure there's no live children. We can't test ->children - * emptiness as dead children linger on it while being destroyed; - * otherwise, "rmdir parent/child parent" may fail with -EBUSY. + * Make sure there's no live children. We can't test emptiness of + * ->self.children as dead children linger on it while being + * drained; otherwise, "rmdir parent/child parent" may fail. */ - empty = true; - rcu_read_lock(); - list_for_each_entry_rcu(child, &cgrp->children, sibling) { - empty = cgroup_is_dead(child); - if (!empty) - break; - } - rcu_read_unlock(); - if (!empty) + if (css_has_online_children(&cgrp->self)) return -EBUSY; /* - * Initiate massacre of all css's. cgroup_destroy_css_killed() - * will be invoked to perform the rest of destruction once the - * percpu refs of all css's are confirmed to be killed. + * Mark @cgrp dead. This prevents further task migration and child + * creation by disabling cgroup_lock_live_group(). */ + cgrp->self.flags &= ~CSS_ONLINE; + + /* initiate massacre of all css's */ for_each_css(css, ssid, cgrp) kill_css(css); - /* - * Mark @cgrp dead. This prevents further task migration and child - * creation by disabling cgroup_lock_live_group(). Note that - * CGRP_DEAD assertion is depended upon by css_next_child() to - * resume iteration after dropping RCU read lock. See - * css_next_child() for details. - */ - set_bit(CGRP_DEAD, &cgrp->flags); - - /* CGRP_DEAD is set, remove from ->release_list for the last time */ + /* CSS_ONLINE is clear, remove from ->release_list for the last time */ raw_spin_lock(&release_list_lock); if (!list_empty(&cgrp->release_list)) list_del_init(&cgrp->release_list); raw_spin_unlock(&release_list_lock); /* - * If @cgrp has css's attached, the second stage of cgroup - * destruction is kicked off from css_killed_work_fn() after the - * refs of all attached css's are killed. If @cgrp doesn't have - * any css, we kick it off here. + * Remove @cgrp directory along with the base files. @cgrp has an + * extra ref on its kn. */ - if (!cgrp->nr_css) - cgroup_destroy_css_killed(cgrp); + kernfs_remove(cgrp->kn); - /* - * Clear the base files and remove @cgrp directory. The removal - * puts the base ref but we aren't quite done with @cgrp yet, so - * hold onto it. - */ - cgroup_addrm_files(cgrp, cgroup_base_files, false); - dget(d); - cgroup_d_remove_dir(d); + set_bit(CGRP_RELEASABLE, &cgroup_parent(cgrp)->flags); + check_for_release(cgroup_parent(cgrp)); + + /* put the base reference */ + percpu_ref_kill(&cgrp->self.refcnt); return 0; }; -/** - * cgroup_destroy_css_killed - the second step of cgroup destruction - * @work: cgroup->destroy_free_work - * - * This function is invoked from a work item for a cgroup which is being - * destroyed after all css's are offlined and performs the rest of - * destruction. This is the second step of destruction described in the - * comment above cgroup_destroy_locked(). - */ -static void cgroup_destroy_css_killed(struct cgroup *cgrp) +static int cgroup_rmdir(struct kernfs_node *kn) { - struct cgroup *parent = cgrp->parent; - struct dentry *d = cgrp->dentry; - - lockdep_assert_held(&cgroup_mutex); - - /* delete this cgroup from parent->children */ - list_del_rcu(&cgrp->sibling); - - dput(d); - - set_bit(CGRP_RELEASABLE, &parent->flags); - check_for_release(parent); -} - -static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) -{ - int ret; + struct cgroup *cgrp; + int ret = 0; - mutex_lock(&cgroup_mutex); - ret = cgroup_destroy_locked(dentry->d_fsdata); - mutex_unlock(&cgroup_mutex); + cgrp = cgroup_kn_lock_live(kn); + if (!cgrp) + return 0; + cgroup_get(cgrp); /* for @kn->priv clearing */ - return ret; -} + ret = cgroup_destroy_locked(cgrp); -static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss) -{ - INIT_LIST_HEAD(&ss->cftsets); + cgroup_kn_unlock(kn); /* - * base_cftset is embedded in subsys itself, no need to worry about - * deregistration. + * There are two control paths which try to determine cgroup from + * dentry without going through kernfs - cgroupstats_build() and + * css_tryget_online_from_dir(). Those are supported by RCU + * protecting clearing of cgrp->kn->priv backpointer, which should + * happen after all files under it have been removed. */ - if (ss->base_cftypes) { - struct cftype *cft; + if (!ret) + RCU_INIT_POINTER(*(void __rcu __force **)&kn->priv, NULL); - for (cft = ss->base_cftypes; cft->name[0] != '\0'; cft++) - cft->ss = ss; - - ss->base_cftset.cfts = ss->base_cftypes; - list_add_tail(&ss->base_cftset.node, &ss->cftsets); - } + cgroup_put(cgrp); + return ret; } -static void __init cgroup_init_subsys(struct cgroup_subsys *ss) +static struct kernfs_syscall_ops cgroup_kf_syscall_ops = { + .remount_fs = cgroup_remount, + .show_options = cgroup_show_options, + .mkdir = cgroup_mkdir, + .rmdir = cgroup_rmdir, + .rename = cgroup_rename, +}; + +static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) { struct cgroup_subsys_state *css; @@ -4533,21 +4686,35 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) mutex_lock(&cgroup_mutex); - /* init base cftset */ - cgroup_init_cftsets(ss); + idr_init(&ss->css_idr); + INIT_LIST_HEAD(&ss->cfts); - /* Create the top cgroup state for this subsystem */ - ss->root = &cgroup_dummy_root; - css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss)); + /* Create the root cgroup state for this subsystem */ + ss->root = &cgrp_dfl_root; + css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss)); /* We don't handle early failures gracefully */ BUG_ON(IS_ERR(css)); - init_css(css, ss, cgroup_dummy_top); + init_and_link_css(css, ss, &cgrp_dfl_root.cgrp); + + /* + * Root csses are never destroyed and we can't initialize + * percpu_ref during early init. Disable refcnting. + */ + css->flags |= CSS_NO_REF; + + if (early) { + /* allocation can't be done safely during early init */ + css->id = 1; + } else { + css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL); + BUG_ON(css->id < 0); + } /* Update the init_css_set to contain a subsys * pointer to this state - since the subsystem is * newly registered, all tasks and hence the - * init_css_set is in the subsystem's top cgroup. */ - init_css_set.subsys[ss->subsys_id] = css; + * init_css_set is in the subsystem's root cgroup. */ + init_css_set.subsys[ss->id] = css; need_forkexit_callback |= ss->fork || ss->exit; @@ -4559,186 +4726,9 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) BUG_ON(online_css(css)); mutex_unlock(&cgroup_mutex); - - /* this function shouldn't be used with modular subsystems, since they - * need to register a subsys_id, among other things */ - BUG_ON(ss->module); } /** - * cgroup_load_subsys: load and register a modular subsystem at runtime - * @ss: the subsystem to load - * - * This function should be called in a modular subsystem's initcall. If the - * subsystem is built as a module, it will be assigned a new subsys_id and set - * up for use. If the subsystem is built-in anyway, work is delegated to the - * simpler cgroup_init_subsys. - */ -int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) -{ - struct cgroup_subsys_state *css; - int i, ret; - struct hlist_node *tmp; - struct css_set *cset; - unsigned long key; - - /* check name and function validity */ - if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN || - ss->css_alloc == NULL || ss->css_free == NULL) - return -EINVAL; - - /* - * we don't support callbacks in modular subsystems. this check is - * before the ss->module check for consistency; a subsystem that could - * be a module should still have no callbacks even if the user isn't - * compiling it as one. - */ - if (ss->fork || ss->exit) - return -EINVAL; - - /* - * an optionally modular subsystem is built-in: we want to do nothing, - * since cgroup_init_subsys will have already taken care of it. - */ - if (ss->module == NULL) { - /* a sanity check */ - BUG_ON(cgroup_subsys[ss->subsys_id] != ss); - return 0; - } - - /* init base cftset */ - cgroup_init_cftsets(ss); - - mutex_lock(&cgroup_mutex); - mutex_lock(&cgroup_root_mutex); - cgroup_subsys[ss->subsys_id] = ss; - - /* - * no ss->css_alloc seems to need anything important in the ss - * struct, so this can happen first (i.e. before the dummy root - * attachment). - */ - css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss)); - if (IS_ERR(css)) { - /* failure case - need to deassign the cgroup_subsys[] slot. */ - cgroup_subsys[ss->subsys_id] = NULL; - mutex_unlock(&cgroup_root_mutex); - mutex_unlock(&cgroup_mutex); - return PTR_ERR(css); - } - - ss->root = &cgroup_dummy_root; - - /* our new subsystem will be attached to the dummy hierarchy. */ - init_css(css, ss, cgroup_dummy_top); - - /* - * Now we need to entangle the css into the existing css_sets. unlike - * in cgroup_init_subsys, there are now multiple css_sets, so each one - * will need a new pointer to it; done by iterating the css_set_table. - * furthermore, modifying the existing css_sets will corrupt the hash - * table state, so each changed css_set will need its hash recomputed. - * this is all done under the css_set_lock. - */ - write_lock(&css_set_lock); - hash_for_each_safe(css_set_table, i, tmp, cset, hlist) { - /* skip entries that we already rehashed */ - if (cset->subsys[ss->subsys_id]) - continue; - /* remove existing entry */ - hash_del(&cset->hlist); - /* set new value */ - cset->subsys[ss->subsys_id] = css; - /* recompute hash and restore entry */ - key = css_set_hash(cset->subsys); - hash_add(css_set_table, &cset->hlist, key); - } - write_unlock(&css_set_lock); - - ret = online_css(css); - if (ret) { - ss->css_free(css); - goto err_unload; - } - - /* success! */ - mutex_unlock(&cgroup_root_mutex); - mutex_unlock(&cgroup_mutex); - return 0; - -err_unload: - mutex_unlock(&cgroup_root_mutex); - mutex_unlock(&cgroup_mutex); - /* @ss can't be mounted here as try_module_get() would fail */ - cgroup_unload_subsys(ss); - return ret; -} -EXPORT_SYMBOL_GPL(cgroup_load_subsys); - -/** - * cgroup_unload_subsys: unload a modular subsystem - * @ss: the subsystem to unload - * - * This function should be called in a modular subsystem's exitcall. When this - * function is invoked, the refcount on the subsystem's module will be 0, so - * the subsystem will not be attached to any hierarchy. - */ -void cgroup_unload_subsys(struct cgroup_subsys *ss) -{ - struct cgrp_cset_link *link; - struct cgroup_subsys_state *css; - - BUG_ON(ss->module == NULL); - - /* - * we shouldn't be called if the subsystem is in use, and the use of - * try_module_get() in rebind_subsystems() should ensure that it - * doesn't start being used while we're killing it off. - */ - BUG_ON(ss->root != &cgroup_dummy_root); - - mutex_lock(&cgroup_mutex); - mutex_lock(&cgroup_root_mutex); - - css = cgroup_css(cgroup_dummy_top, ss); - if (css) - offline_css(css); - - /* deassign the subsys_id */ - cgroup_subsys[ss->subsys_id] = NULL; - - /* - * disentangle the css from all css_sets attached to the dummy - * top. as in loading, we need to pay our respects to the hashtable - * gods. - */ - write_lock(&css_set_lock); - list_for_each_entry(link, &cgroup_dummy_top->cset_links, cset_link) { - struct css_set *cset = link->cset; - unsigned long key; - - hash_del(&cset->hlist); - cset->subsys[ss->subsys_id] = NULL; - key = css_set_hash(cset->subsys); - hash_add(css_set_table, &cset->hlist, key); - } - write_unlock(&css_set_lock); - - /* - * remove subsystem's css from the cgroup_dummy_top and free it - - * need to free before marking as null because ss->css_free needs - * the cgrp->subsys pointer to find their state. - */ - if (css) - ss->css_free(css); - RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL); - - mutex_unlock(&cgroup_root_mutex); - mutex_unlock(&cgroup_mutex); -} -EXPORT_SYMBOL_GPL(cgroup_unload_subsys); - -/** * cgroup_init_early - cgroup initialization at system boot * * Initialize cgroups at system boot, and initialize any @@ -4746,37 +4736,29 @@ EXPORT_SYMBOL_GPL(cgroup_unload_subsys); */ int __init cgroup_init_early(void) { + static struct cgroup_sb_opts __initdata opts = + { .flags = CGRP_ROOT_SANE_BEHAVIOR }; struct cgroup_subsys *ss; int i; - atomic_set(&init_css_set.refcount, 1); - INIT_LIST_HEAD(&init_css_set.cgrp_links); - INIT_LIST_HEAD(&init_css_set.tasks); - INIT_HLIST_NODE(&init_css_set.hlist); - css_set_count = 1; - init_cgroup_root(&cgroup_dummy_root); - cgroup_root_count = 1; + init_cgroup_root(&cgrp_dfl_root, &opts); + cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF; + RCU_INIT_POINTER(init_task.cgroups, &init_css_set); - init_cgrp_cset_link.cset = &init_css_set; - init_cgrp_cset_link.cgrp = cgroup_dummy_top; - list_add(&init_cgrp_cset_link.cset_link, &cgroup_dummy_top->cset_links); - list_add(&init_cgrp_cset_link.cgrp_link, &init_css_set.cgrp_links); - - /* at bootup time, we don't worry about modular subsystems */ - for_each_builtin_subsys(ss, i) { - BUG_ON(!ss->name); - BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN); - BUG_ON(!ss->css_alloc); - BUG_ON(!ss->css_free); - if (ss->subsys_id != i) { - printk(KERN_ERR "cgroup: Subsys %s id == %d\n", - ss->name, ss->subsys_id); - BUG(); - } + for_each_subsys(ss, i) { + WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id, + "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p name:id=%d:%s\n", + i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free, + ss->id, ss->name); + WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN, + "cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]); + + ss->id = i; + ss->name = cgroup_subsys_name[i]; if (ss->early_init) - cgroup_init_subsys(ss); + cgroup_init_subsys(ss, true); } return 0; } @@ -4791,53 +4773,58 @@ int __init cgroup_init(void) { struct cgroup_subsys *ss; unsigned long key; - int i, err; + int ssid, err; - err = bdi_init(&cgroup_backing_dev_info); - if (err) - return err; - - for_each_builtin_subsys(ss, i) { - if (!ss->early_init) - cgroup_init_subsys(ss); - } + BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); - /* allocate id for the dummy hierarchy */ mutex_lock(&cgroup_mutex); - mutex_lock(&cgroup_root_mutex); /* Add init_css_set to the hash table */ key = css_set_hash(init_css_set.subsys); hash_add(css_set_table, &init_css_set.hlist, key); - BUG_ON(cgroup_init_root_id(&cgroup_dummy_root, 0, 1)); + BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0)); - err = idr_alloc(&cgroup_dummy_root.cgroup_idr, cgroup_dummy_top, - 0, 1, GFP_KERNEL); - BUG_ON(err < 0); - - mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); - cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); - if (!cgroup_kobj) { - err = -ENOMEM; - goto out; + for_each_subsys(ss, ssid) { + if (ss->early_init) { + struct cgroup_subsys_state *css = + init_css_set.subsys[ss->id]; + + css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, + GFP_KERNEL); + BUG_ON(css->id < 0); + } else { + cgroup_init_subsys(ss, false); + } + + list_add_tail(&init_css_set.e_cset_node[ssid], + &cgrp_dfl_root.cgrp.e_csets[ssid]); + + /* + * Setting dfl_root subsys_mask needs to consider the + * disabled flag and cftype registration needs kmalloc, + * both of which aren't available during early_init. + */ + if (!ss->disabled) { + cgrp_dfl_root.subsys_mask |= 1 << ss->id; + WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes)); + } } + cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); + if (!cgroup_kobj) + return -ENOMEM; + err = register_filesystem(&cgroup_fs_type); if (err < 0) { kobject_put(cgroup_kobj); - goto out; + return err; } proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations); - -out: - if (err) - bdi_destroy(&cgroup_backing_dev_info); - - return err; + return 0; } static int __init cgroup_wq_init(void) @@ -4869,12 +4856,6 @@ core_initcall(cgroup_wq_init); * proc_cgroup_show() * - Print task's cgroup paths into seq_file, one line for each hierarchy * - Used for /proc/<pid>/cgroup. - * - No need to task_lock(tsk) on this tsk->cgroup reference, as it - * doesn't really matter if tsk->cgroup changes after we read it, - * and we take cgroup_mutex, keeping cgroup_attach_task() from changing it - * anyway. No need to check that tsk->cgroup != NULL, thanks to - * the_top_cgroup_hack in cgroup_exit(), which sets an exiting tasks - * cgroup to top_cgroup. */ /* TODO: Use a proper seq_file iterator */ @@ -4882,12 +4863,12 @@ int proc_cgroup_show(struct seq_file *m, void *v) { struct pid *pid; struct task_struct *tsk; - char *buf; + char *buf, *path; int retval; - struct cgroupfs_root *root; + struct cgroup_root *root; retval = -ENOMEM; - buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + buf = kmalloc(PATH_MAX, GFP_KERNEL); if (!buf) goto out; @@ -4900,12 +4881,16 @@ int proc_cgroup_show(struct seq_file *m, void *v) retval = 0; mutex_lock(&cgroup_mutex); + down_read(&css_set_rwsem); - for_each_active_root(root) { + for_each_root(root) { struct cgroup_subsys *ss; struct cgroup *cgrp; int ssid, count = 0; + if (root == &cgrp_dfl_root && !cgrp_dfl_root_visible) + continue; + seq_printf(m, "%d:", root->hierarchy_id); for_each_subsys(ss, ssid) if (root->subsys_mask & (1 << ssid)) @@ -4915,14 +4900,17 @@ int proc_cgroup_show(struct seq_file *m, void *v) root->name); seq_putc(m, ':'); cgrp = task_cgroup_from_root(tsk, root); - retval = cgroup_path(cgrp, buf, PAGE_SIZE); - if (retval < 0) + path = cgroup_path(cgrp, buf, PATH_MAX); + if (!path) { + retval = -ENAMETOOLONG; goto out_unlock; - seq_puts(m, buf); + } + seq_puts(m, path); seq_putc(m, '\n'); } out_unlock: + up_read(&css_set_rwsem); mutex_unlock(&cgroup_mutex); put_task_struct(tsk); out_free: @@ -4948,7 +4936,7 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v) for_each_subsys(ss, i) seq_printf(m, "%s\t%d\t%d\t%d\n", ss->name, ss->root->hierarchy_id, - ss->root->number_of_cgroups, !ss->disabled); + atomic_read(&ss->root->nr_cgrps), !ss->disabled); mutex_unlock(&cgroup_mutex); return 0; @@ -4967,27 +4955,16 @@ static const struct file_operations proc_cgroupstats_operations = { }; /** - * cgroup_fork - attach newly forked task to its parents cgroup. + * cgroup_fork - initialize cgroup related fields during copy_process() * @child: pointer to task_struct of forking parent process. * - * Description: A task inherits its parent's cgroup at fork(). - * - * A pointer to the shared css_set was automatically copied in - * fork.c by dup_task_struct(). However, we ignore that copy, since - * it was not made under the protection of RCU or cgroup_mutex, so - * might no longer be a valid cgroup pointer. cgroup_attach_task() might - * have already changed current->cgroups, allowing the previously - * referenced cgroup group to be removed and freed. - * - * At the point that cgroup_fork() is called, 'current' is the parent - * task, and the passed argument 'child' points to the child task. + * A task is associated with the init_css_set until cgroup_post_fork() + * attaches it to the parent's css_set. Empty cg_list indicates that + * @child isn't holding reference to its css_set. */ void cgroup_fork(struct task_struct *child) { - task_lock(current); - get_css_set(task_css_set(current)); - child->cgroups = current->cgroups; - task_unlock(current); + RCU_INIT_POINTER(child->cgroups, &init_css_set); INIT_LIST_HEAD(&child->cg_list); } @@ -5007,23 +4984,37 @@ void cgroup_post_fork(struct task_struct *child) int i; /* - * use_task_css_set_links is set to 1 before we walk the tasklist - * under the tasklist_lock and we read it here after we added the child - * to the tasklist under the tasklist_lock as well. If the child wasn't - * yet in the tasklist when we walked through it from - * cgroup_enable_task_cg_lists(), then use_task_css_set_links value - * should be visible now due to the paired locking and barriers implied - * by LOCK/UNLOCK: it is written before the tasklist_lock unlock - * in cgroup_enable_task_cg_lists() and read here after the tasklist_lock - * lock on fork. + * This may race against cgroup_enable_task_cg_links(). As that + * function sets use_task_css_set_links before grabbing + * tasklist_lock and we just went through tasklist_lock to add + * @child, it's guaranteed that either we see the set + * use_task_css_set_links or cgroup_enable_task_cg_lists() sees + * @child during its iteration. + * + * If we won the race, @child is associated with %current's + * css_set. Grabbing css_set_rwsem guarantees both that the + * association is stable, and, on completion of the parent's + * migration, @child is visible in the source of migration or + * already in the destination cgroup. This guarantee is necessary + * when implementing operations which need to migrate all tasks of + * a cgroup to another. + * + * Note that if we lose to cgroup_enable_task_cg_links(), @child + * will remain in init_css_set. This is safe because all tasks are + * in the init_css_set before cg_links is enabled and there's no + * operation which transfers all tasks out of init_css_set. */ if (use_task_css_set_links) { - write_lock(&css_set_lock); - task_lock(child); - if (list_empty(&child->cg_list)) - list_add(&child->cg_list, &task_css_set(child)->tasks); - task_unlock(child); - write_unlock(&css_set_lock); + struct css_set *cset; + + down_write(&css_set_rwsem); + cset = task_css_set(current); + if (list_empty(&child->cg_list)) { + rcu_assign_pointer(child->cgroups, cset); + list_add(&child->cg_list, &cset->tasks); + get_css_set(cset); + } + up_write(&css_set_rwsem); } /* @@ -5032,15 +5023,7 @@ void cgroup_post_fork(struct task_struct *child) * and addition to css_set. */ if (need_forkexit_callback) { - /* - * fork/exit callbacks are supported only for builtin - * subsystems, and the builtin section of the subsys - * array is immutable, so we don't need to lock the - * subsys array here. On the other hand, modular section - * of the array can be freed at module unload, so we - * can't touch that. - */ - for_each_builtin_subsys(ss, i) + for_each_subsys(ss, i) if (ss->fork) ss->fork(child); } @@ -5049,7 +5032,6 @@ void cgroup_post_fork(struct task_struct *child) /** * cgroup_exit - detach cgroup from exiting task * @tsk: pointer to task_struct of exiting process - * @run_callback: run exit callbacks? * * Description: Detach cgroup from @tsk and release it. * @@ -5059,57 +5041,38 @@ void cgroup_post_fork(struct task_struct *child) * use notify_on_release cgroups where very high task exit scaling * is required on large systems. * - * the_top_cgroup_hack: - * - * Set the exiting tasks cgroup to the root cgroup (top_cgroup). - * - * We call cgroup_exit() while the task is still competent to - * handle notify_on_release(), then leave the task attached to the - * root cgroup in each hierarchy for the remainder of its exit. - * - * To do this properly, we would increment the reference count on - * top_cgroup, and near the very end of the kernel/exit.c do_exit() - * code we would add a second cgroup function call, to drop that - * reference. This would just create an unnecessary hot spot on - * the top_cgroup reference count, to no avail. - * - * Normally, holding a reference to a cgroup without bumping its - * count is unsafe. The cgroup could go away, or someone could - * attach us to a different cgroup, decrementing the count on - * the first cgroup that we never incremented. But in this case, - * top_cgroup isn't going away, and either task has PF_EXITING set, - * which wards off any cgroup_attach_task() attempts, or task is a failed - * fork, never visible to cgroup_attach_task. + * We set the exiting tasks cgroup to the root cgroup (top_cgroup). We + * call cgroup_exit() while the task is still competent to handle + * notify_on_release(), then leave the task attached to the root cgroup in + * each hierarchy for the remainder of its exit. No need to bother with + * init_css_set refcnting. init_css_set never goes away and we can't race + * with migration path - PF_EXITING is visible to migration path. */ -void cgroup_exit(struct task_struct *tsk, int run_callbacks) +void cgroup_exit(struct task_struct *tsk) { struct cgroup_subsys *ss; struct css_set *cset; + bool put_cset = false; int i; /* - * Unlink from the css_set task list if necessary. - * Optimistically check cg_list before taking - * css_set_lock + * Unlink from @tsk from its css_set. As migration path can't race + * with us, we can check cg_list without grabbing css_set_rwsem. */ if (!list_empty(&tsk->cg_list)) { - write_lock(&css_set_lock); - if (!list_empty(&tsk->cg_list)) - list_del_init(&tsk->cg_list); - write_unlock(&css_set_lock); + down_write(&css_set_rwsem); + list_del_init(&tsk->cg_list); + up_write(&css_set_rwsem); + put_cset = true; } /* Reassign the task to the init_css_set. */ - task_lock(tsk); cset = task_css_set(tsk); RCU_INIT_POINTER(tsk->cgroups, &init_css_set); - if (run_callbacks && need_forkexit_callback) { - /* - * fork/exit callbacks are supported only for builtin - * subsystems, see cgroup_post_fork() for details. - */ - for_each_builtin_subsys(ss, i) { + if (need_forkexit_callback) { + /* see cgroup_post_fork() for details */ + for_each_subsys(ss, i) { if (ss->exit) { struct cgroup_subsys_state *old_css = cset->subsys[i]; struct cgroup_subsys_state *css = task_css(tsk, i); @@ -5118,15 +5081,15 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) } } } - task_unlock(tsk); - put_css_set_taskexit(cset); + if (put_cset) + put_css_set(cset, true); } static void check_for_release(struct cgroup *cgrp) { - if (cgroup_is_releasable(cgrp) && - list_empty(&cgrp->cset_links) && list_empty(&cgrp->children)) { + if (cgroup_is_releasable(cgrp) && list_empty(&cgrp->cset_links) && + !css_has_online_children(&cgrp->self)) { /* * Control Group is currently removeable. If it's not * already queued for a userspace notification, queue @@ -5177,16 +5140,17 @@ static void cgroup_release_agent(struct work_struct *work) while (!list_empty(&release_list)) { char *argv[3], *envp[3]; int i; - char *pathbuf = NULL, *agentbuf = NULL; + char *pathbuf = NULL, *agentbuf = NULL, *path; struct cgroup *cgrp = list_entry(release_list.next, struct cgroup, release_list); list_del_init(&cgrp->release_list); raw_spin_unlock(&release_list_lock); - pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL); + pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); if (!pathbuf) goto continue_free; - if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0) + path = cgroup_path(cgrp, pathbuf, PATH_MAX); + if (!path) goto continue_free; agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL); if (!agentbuf) @@ -5194,7 +5158,7 @@ static void cgroup_release_agent(struct work_struct *work) i = 0; argv[i++] = agentbuf; - argv[i++] = pathbuf; + argv[i++] = path; argv[i] = NULL; i = 0; @@ -5228,11 +5192,7 @@ static int __init cgroup_disable(char *str) if (!*token) continue; - /* - * cgroup_disable, being at boot time, can't know about - * module subsystems, so we don't worry about them. - */ - for_each_builtin_subsys(ss, i) { + for_each_subsys(ss, i) { if (!strcmp(token, ss->name)) { ss->disabled = 1; printk(KERN_INFO "Disabling %s control group" @@ -5246,28 +5206,42 @@ static int __init cgroup_disable(char *str) __setup("cgroup_disable=", cgroup_disable); /** - * css_from_dir - get corresponding css from the dentry of a cgroup dir + * css_tryget_online_from_dir - get corresponding css from a cgroup dentry * @dentry: directory dentry of interest * @ss: subsystem of interest * - * Must be called under cgroup_mutex or RCU read lock. The caller is - * responsible for pinning the returned css if it needs to be accessed - * outside the critical section. + * If @dentry is a directory for a cgroup which has @ss enabled on it, try + * to get the corresponding css and return it. If such css doesn't exist + * or can't be pinned, an ERR_PTR value is returned. */ -struct cgroup_subsys_state *css_from_dir(struct dentry *dentry, - struct cgroup_subsys *ss) +struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, + struct cgroup_subsys *ss) { + struct kernfs_node *kn = kernfs_node_from_dentry(dentry); + struct cgroup_subsys_state *css = NULL; struct cgroup *cgrp; - cgroup_assert_mutex_or_rcu_locked(); - /* is @dentry a cgroup dir? */ - if (!dentry->d_inode || - dentry->d_inode->i_op != &cgroup_dir_inode_operations) + if (dentry->d_sb->s_type != &cgroup_fs_type || !kn || + kernfs_type(kn) != KERNFS_DIR) return ERR_PTR(-EBADF); - cgrp = __d_cgrp(dentry); - return cgroup_css(cgrp, ss) ?: ERR_PTR(-ENOENT); + rcu_read_lock(); + + /* + * This path doesn't originate from kernfs and @kn could already + * have been or be removed at any point. @kn->priv is RCU + * protected for this access. See cgroup_rmdir() for details. + */ + cgrp = rcu_dereference(kn->priv); + if (cgrp) + css = cgroup_css(cgrp, ss); + + if (!css || !css_tryget_online(css)) + css = ERR_PTR(-ENOENT); + + rcu_read_unlock(); + return css; } /** @@ -5280,14 +5254,8 @@ struct cgroup_subsys_state *css_from_dir(struct dentry *dentry, */ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) { - struct cgroup *cgrp; - - cgroup_assert_mutex_or_rcu_locked(); - - cgrp = idr_find(&ss->root->cgroup_idr, id); - if (cgrp) - return cgroup_css(cgrp, ss); - return NULL; + WARN_ON_ONCE(!rcu_read_lock_held()); + return idr_find(&ss->css_idr, id); } #ifdef CONFIG_CGROUP_DEBUG @@ -5334,23 +5302,25 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v) { struct cgrp_cset_link *link; struct css_set *cset; + char *name_buf; - read_lock(&css_set_lock); + name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL); + if (!name_buf) + return -ENOMEM; + + down_read(&css_set_rwsem); rcu_read_lock(); cset = rcu_dereference(current->cgroups); list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { struct cgroup *c = link->cgrp; - const char *name; - if (c->dentry) - name = c->dentry->d_name.name; - else - name = "?"; + cgroup_name(c, name_buf, NAME_MAX + 1); seq_printf(seq, "Root %d group %s\n", - c->root->hierarchy_id, name); + c->root->hierarchy_id, name_buf); } rcu_read_unlock(); - read_unlock(&css_set_lock); + up_read(&css_set_rwsem); + kfree(name_buf); return 0; } @@ -5360,23 +5330,30 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v) struct cgroup_subsys_state *css = seq_css(seq); struct cgrp_cset_link *link; - read_lock(&css_set_lock); + down_read(&css_set_rwsem); list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { struct css_set *cset = link->cset; struct task_struct *task; int count = 0; + seq_printf(seq, "css_set %p\n", cset); + list_for_each_entry(task, &cset->tasks, cg_list) { - if (count++ > MAX_TASKS_SHOWN_PER_CSS) { - seq_puts(seq, " ...\n"); - break; - } else { - seq_printf(seq, " task %d\n", - task_pid_vnr(task)); - } + if (count++ > MAX_TASKS_SHOWN_PER_CSS) + goto overflow; + seq_printf(seq, " task %d\n", task_pid_vnr(task)); + } + + list_for_each_entry(task, &cset->mg_tasks, cg_list) { + if (count++ > MAX_TASKS_SHOWN_PER_CSS) + goto overflow; + seq_printf(seq, " task %d\n", task_pid_vnr(task)); } + continue; + overflow: + seq_puts(seq, " ...\n"); } - read_unlock(&css_set_lock); + up_read(&css_set_rwsem); return 0; } @@ -5419,11 +5396,9 @@ static struct cftype debug_files[] = { { } /* terminate */ }; -struct cgroup_subsys debug_subsys = { - .name = "debug", +struct cgroup_subsys debug_cgrp_subsys = { .css_alloc = debug_css_alloc, .css_free = debug_css_free, - .subsys_id = debug_subsys_id, .base_cftypes = debug_files, }; #endif /* CONFIG_CGROUP_DEBUG */ diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 6c3154e477f..a79e40f9d70 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -21,6 +21,7 @@ #include <linux/uaccess.h> #include <linux/freezer.h> #include <linux/seq_file.h> +#include <linux/mutex.h> /* * A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is @@ -42,9 +43,10 @@ enum freezer_state_flags { struct freezer { struct cgroup_subsys_state css; unsigned int state; - spinlock_t lock; }; +static DEFINE_MUTEX(freezer_mutex); + static inline struct freezer *css_freezer(struct cgroup_subsys_state *css) { return css ? container_of(css, struct freezer, css) : NULL; @@ -52,12 +54,12 @@ static inline struct freezer *css_freezer(struct cgroup_subsys_state *css) static inline struct freezer *task_freezer(struct task_struct *task) { - return css_freezer(task_css(task, freezer_subsys_id)); + return css_freezer(task_css(task, freezer_cgrp_id)); } static struct freezer *parent_freezer(struct freezer *freezer) { - return css_freezer(css_parent(&freezer->css)); + return css_freezer(freezer->css.parent); } bool cgroup_freezing(struct task_struct *task) @@ -71,10 +73,6 @@ bool cgroup_freezing(struct task_struct *task) return ret; } -/* - * cgroups_write_string() limits the size of freezer state strings to - * CGROUP_LOCAL_BUFFER_SIZE - */ static const char *freezer_state_strs(unsigned int state) { if (state & CGROUP_FROZEN) @@ -84,8 +82,6 @@ static const char *freezer_state_strs(unsigned int state) return "THAWED"; }; -struct cgroup_subsys freezer_subsys; - static struct cgroup_subsys_state * freezer_css_alloc(struct cgroup_subsys_state *parent_css) { @@ -95,7 +91,6 @@ freezer_css_alloc(struct cgroup_subsys_state *parent_css) if (!freezer) return ERR_PTR(-ENOMEM); - spin_lock_init(&freezer->lock); return &freezer->css; } @@ -112,14 +107,7 @@ static int freezer_css_online(struct cgroup_subsys_state *css) struct freezer *freezer = css_freezer(css); struct freezer *parent = parent_freezer(freezer); - /* - * The following double locking and freezing state inheritance - * guarantee that @cgroup can never escape ancestors' freezing - * states. See css_for_each_descendant_pre() for details. - */ - if (parent) - spin_lock_irq(&parent->lock); - spin_lock_nested(&freezer->lock, SINGLE_DEPTH_NESTING); + mutex_lock(&freezer_mutex); freezer->state |= CGROUP_FREEZER_ONLINE; @@ -128,10 +116,7 @@ static int freezer_css_online(struct cgroup_subsys_state *css) atomic_inc(&system_freezing_cnt); } - spin_unlock(&freezer->lock); - if (parent) - spin_unlock_irq(&parent->lock); - + mutex_unlock(&freezer_mutex); return 0; } @@ -146,14 +131,14 @@ static void freezer_css_offline(struct cgroup_subsys_state *css) { struct freezer *freezer = css_freezer(css); - spin_lock_irq(&freezer->lock); + mutex_lock(&freezer_mutex); if (freezer->state & CGROUP_FREEZING) atomic_dec(&system_freezing_cnt); freezer->state = 0; - spin_unlock_irq(&freezer->lock); + mutex_unlock(&freezer_mutex); } static void freezer_css_free(struct cgroup_subsys_state *css) @@ -177,7 +162,7 @@ static void freezer_attach(struct cgroup_subsys_state *new_css, struct task_struct *task; bool clear_frozen = false; - spin_lock_irq(&freezer->lock); + mutex_lock(&freezer_mutex); /* * Make the new tasks conform to the current state of @new_css. @@ -189,7 +174,7 @@ static void freezer_attach(struct cgroup_subsys_state *new_css, * current state before executing the following - !frozen tasks may * be visible in a FROZEN cgroup and frozen tasks in a THAWED one. */ - cgroup_taskset_for_each(task, new_css, tset) { + cgroup_taskset_for_each(task, tset) { if (!(freezer->state & CGROUP_FREEZING)) { __thaw_task(task); } else { @@ -199,43 +184,48 @@ static void freezer_attach(struct cgroup_subsys_state *new_css, } } - spin_unlock_irq(&freezer->lock); - - /* - * Propagate FROZEN clearing upwards. We may race with - * update_if_frozen(), but as long as both work bottom-up, either - * update_if_frozen() sees child's FROZEN cleared or we clear the - * parent's FROZEN later. No parent w/ !FROZEN children can be - * left FROZEN. - */ + /* propagate FROZEN clearing upwards */ while (clear_frozen && (freezer = parent_freezer(freezer))) { - spin_lock_irq(&freezer->lock); freezer->state &= ~CGROUP_FROZEN; clear_frozen = freezer->state & CGROUP_FREEZING; - spin_unlock_irq(&freezer->lock); } + + mutex_unlock(&freezer_mutex); } +/** + * freezer_fork - cgroup post fork callback + * @task: a task which has just been forked + * + * @task has just been created and should conform to the current state of + * the cgroup_freezer it belongs to. This function may race against + * freezer_attach(). Losing to freezer_attach() means that we don't have + * to do anything as freezer_attach() will put @task into the appropriate + * state. + */ static void freezer_fork(struct task_struct *task) { struct freezer *freezer; - rcu_read_lock(); - freezer = task_freezer(task); - /* - * The root cgroup is non-freezable, so we can skip the - * following check. + * The root cgroup is non-freezable, so we can skip locking the + * freezer. This is safe regardless of race with task migration. + * If we didn't race or won, skipping is obviously the right thing + * to do. If we lost and root is the new cgroup, noop is still the + * right thing to do. */ - if (!parent_freezer(freezer)) - goto out; + if (task_css_is_root(task, freezer_cgrp_id)) + return; - spin_lock_irq(&freezer->lock); + mutex_lock(&freezer_mutex); + rcu_read_lock(); + + freezer = task_freezer(task); if (freezer->state & CGROUP_FREEZING) freeze_task(task); - spin_unlock_irq(&freezer->lock); -out: + rcu_read_unlock(); + mutex_unlock(&freezer_mutex); } /** @@ -261,22 +251,24 @@ static void update_if_frozen(struct cgroup_subsys_state *css) struct css_task_iter it; struct task_struct *task; - WARN_ON_ONCE(!rcu_read_lock_held()); - - spin_lock_irq(&freezer->lock); + lockdep_assert_held(&freezer_mutex); if (!(freezer->state & CGROUP_FREEZING) || (freezer->state & CGROUP_FROZEN)) - goto out_unlock; + return; /* are all (live) children frozen? */ + rcu_read_lock(); css_for_each_child(pos, css) { struct freezer *child = css_freezer(pos); if ((child->state & CGROUP_FREEZER_ONLINE) && - !(child->state & CGROUP_FROZEN)) - goto out_unlock; + !(child->state & CGROUP_FROZEN)) { + rcu_read_unlock(); + return; + } } + rcu_read_unlock(); /* are all tasks frozen? */ css_task_iter_start(css, &it); @@ -297,21 +289,29 @@ static void update_if_frozen(struct cgroup_subsys_state *css) freezer->state |= CGROUP_FROZEN; out_iter_end: css_task_iter_end(&it); -out_unlock: - spin_unlock_irq(&freezer->lock); } static int freezer_read(struct seq_file *m, void *v) { struct cgroup_subsys_state *css = seq_css(m), *pos; + mutex_lock(&freezer_mutex); rcu_read_lock(); /* update states bottom-up */ - css_for_each_descendant_post(pos, css) + css_for_each_descendant_post(pos, css) { + if (!css_tryget_online(pos)) + continue; + rcu_read_unlock(); + update_if_frozen(pos); + rcu_read_lock(); + css_put(pos); + } + rcu_read_unlock(); + mutex_unlock(&freezer_mutex); seq_puts(m, freezer_state_strs(css_freezer(css)->state)); seq_putc(m, '\n'); @@ -353,7 +353,7 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze, unsigned int state) { /* also synchronizes against task migration, see freezer_attach() */ - lockdep_assert_held(&freezer->lock); + lockdep_assert_held(&freezer_mutex); if (!(freezer->state & CGROUP_FREEZER_ONLINE)) return; @@ -394,47 +394,47 @@ static void freezer_change_state(struct freezer *freezer, bool freeze) * descendant will try to inherit its parent's FREEZING state as * CGROUP_FREEZING_PARENT. */ + mutex_lock(&freezer_mutex); rcu_read_lock(); css_for_each_descendant_pre(pos, &freezer->css) { struct freezer *pos_f = css_freezer(pos); struct freezer *parent = parent_freezer(pos_f); - spin_lock_irq(&pos_f->lock); + if (!css_tryget_online(pos)) + continue; + rcu_read_unlock(); - if (pos_f == freezer) { + if (pos_f == freezer) freezer_apply_state(pos_f, freeze, CGROUP_FREEZING_SELF); - } else { - /* - * Our update to @parent->state is already visible - * which is all we need. No need to lock @parent. - * For more info on synchronization, see - * freezer_post_create(). - */ + else freezer_apply_state(pos_f, parent->state & CGROUP_FREEZING, CGROUP_FREEZING_PARENT); - } - spin_unlock_irq(&pos_f->lock); + rcu_read_lock(); + css_put(pos); } rcu_read_unlock(); + mutex_unlock(&freezer_mutex); } -static int freezer_write(struct cgroup_subsys_state *css, struct cftype *cft, - const char *buffer) +static ssize_t freezer_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { bool freeze; - if (strcmp(buffer, freezer_state_strs(0)) == 0) + buf = strstrip(buf); + + if (strcmp(buf, freezer_state_strs(0)) == 0) freeze = false; - else if (strcmp(buffer, freezer_state_strs(CGROUP_FROZEN)) == 0) + else if (strcmp(buf, freezer_state_strs(CGROUP_FROZEN)) == 0) freeze = true; else return -EINVAL; - freezer_change_state(css_freezer(css), freeze); - return 0; + freezer_change_state(css_freezer(of_css(of)), freeze); + return nbytes; } static u64 freezer_self_freezing_read(struct cgroup_subsys_state *css, @@ -458,7 +458,7 @@ static struct cftype files[] = { .name = "state", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = freezer_read, - .write_string = freezer_write, + .write = freezer_write, }, { .name = "self_freezing", @@ -473,13 +473,11 @@ static struct cftype files[] = { { } /* terminate */ }; -struct cgroup_subsys freezer_subsys = { - .name = "freezer", +struct cgroup_subsys freezer_cgrp_subsys = { .css_alloc = freezer_css_alloc, .css_online = freezer_css_online, .css_offline = freezer_css_offline, .css_free = freezer_css_free, - .subsys_id = freezer_subsys_id, .attach = freezer_attach, .fork = freezer_fork, .base_cftypes = files, diff --git a/kernel/compat.c b/kernel/compat.c index 0a09e481b70..633394f442f 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -30,28 +30,6 @@ #include <asm/uaccess.h> -/* - * Get/set struct timeval with struct timespec on the native side - */ -static int compat_get_timeval_convert(struct timespec *o, - struct compat_timeval __user *i) -{ - long usec; - - if (get_user(o->tv_sec, &i->tv_sec) || - get_user(usec, &i->tv_usec)) - return -EFAULT; - o->tv_nsec = usec * 1000; - return 0; -} - -static int compat_put_timeval_convert(struct compat_timeval __user *o, - struct timeval *i) -{ - return (put_user(i->tv_sec, &o->tv_sec) || - put_user(i->tv_usec, &o->tv_usec)) ? -EFAULT : 0; -} - static int compat_get_timex(struct timex *txc, struct compat_timex __user *utp) { memset(txc, 0, sizeof(struct timex)); @@ -110,13 +88,13 @@ static int compat_put_timex(struct compat_timex __user *utp, struct timex *txc) return 0; } -asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv, - struct timezone __user *tz) +COMPAT_SYSCALL_DEFINE2(gettimeofday, struct compat_timeval __user *, tv, + struct timezone __user *, tz) { if (tv) { struct timeval ktv; do_gettimeofday(&ktv); - if (compat_put_timeval_convert(tv, &ktv)) + if (compat_put_timeval(&ktv, tv)) return -EFAULT; } if (tz) { @@ -127,92 +105,114 @@ asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv, return 0; } -asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv, - struct timezone __user *tz) +COMPAT_SYSCALL_DEFINE2(settimeofday, struct compat_timeval __user *, tv, + struct timezone __user *, tz) { - struct timespec kts; - struct timezone ktz; + struct timeval user_tv; + struct timespec new_ts; + struct timezone new_tz; if (tv) { - if (compat_get_timeval_convert(&kts, tv)) + if (compat_get_timeval(&user_tv, tv)) return -EFAULT; + new_ts.tv_sec = user_tv.tv_sec; + new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC; } if (tz) { - if (copy_from_user(&ktz, tz, sizeof(ktz))) + if (copy_from_user(&new_tz, tz, sizeof(*tz))) return -EFAULT; } - return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL); + return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL); } -int get_compat_timeval(struct timeval *tv, const struct compat_timeval __user *ctv) +static int __compat_get_timeval(struct timeval *tv, const struct compat_timeval __user *ctv) { return (!access_ok(VERIFY_READ, ctv, sizeof(*ctv)) || __get_user(tv->tv_sec, &ctv->tv_sec) || __get_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0; } -EXPORT_SYMBOL_GPL(get_compat_timeval); -int put_compat_timeval(const struct timeval *tv, struct compat_timeval __user *ctv) +static int __compat_put_timeval(const struct timeval *tv, struct compat_timeval __user *ctv) { return (!access_ok(VERIFY_WRITE, ctv, sizeof(*ctv)) || __put_user(tv->tv_sec, &ctv->tv_sec) || __put_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0; } -EXPORT_SYMBOL_GPL(put_compat_timeval); -int get_compat_timespec(struct timespec *ts, const struct compat_timespec __user *cts) +static int __compat_get_timespec(struct timespec *ts, const struct compat_timespec __user *cts) { return (!access_ok(VERIFY_READ, cts, sizeof(*cts)) || __get_user(ts->tv_sec, &cts->tv_sec) || __get_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; } -EXPORT_SYMBOL_GPL(get_compat_timespec); -int put_compat_timespec(const struct timespec *ts, struct compat_timespec __user *cts) +static int __compat_put_timespec(const struct timespec *ts, struct compat_timespec __user *cts) { return (!access_ok(VERIFY_WRITE, cts, sizeof(*cts)) || __put_user(ts->tv_sec, &cts->tv_sec) || __put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; } -EXPORT_SYMBOL_GPL(put_compat_timespec); int compat_get_timeval(struct timeval *tv, const void __user *utv) { if (COMPAT_USE_64BIT_TIME) - return copy_from_user(tv, utv, sizeof *tv) ? -EFAULT : 0; + return copy_from_user(tv, utv, sizeof(*tv)) ? -EFAULT : 0; else - return get_compat_timeval(tv, utv); + return __compat_get_timeval(tv, utv); } EXPORT_SYMBOL_GPL(compat_get_timeval); int compat_put_timeval(const struct timeval *tv, void __user *utv) { if (COMPAT_USE_64BIT_TIME) - return copy_to_user(utv, tv, sizeof *tv) ? -EFAULT : 0; + return copy_to_user(utv, tv, sizeof(*tv)) ? -EFAULT : 0; else - return put_compat_timeval(tv, utv); + return __compat_put_timeval(tv, utv); } EXPORT_SYMBOL_GPL(compat_put_timeval); int compat_get_timespec(struct timespec *ts, const void __user *uts) { if (COMPAT_USE_64BIT_TIME) - return copy_from_user(ts, uts, sizeof *ts) ? -EFAULT : 0; + return copy_from_user(ts, uts, sizeof(*ts)) ? -EFAULT : 0; else - return get_compat_timespec(ts, uts); + return __compat_get_timespec(ts, uts); } EXPORT_SYMBOL_GPL(compat_get_timespec); int compat_put_timespec(const struct timespec *ts, void __user *uts) { if (COMPAT_USE_64BIT_TIME) - return copy_to_user(uts, ts, sizeof *ts) ? -EFAULT : 0; + return copy_to_user(uts, ts, sizeof(*ts)) ? -EFAULT : 0; else - return put_compat_timespec(ts, uts); + return __compat_put_timespec(ts, uts); } EXPORT_SYMBOL_GPL(compat_put_timespec); +int compat_convert_timespec(struct timespec __user **kts, + const void __user *cts) +{ + struct timespec ts; + struct timespec __user *uts; + + if (!cts || COMPAT_USE_64BIT_TIME) { + *kts = (struct timespec __user *)cts; + return 0; + } + + uts = compat_alloc_user_space(sizeof(ts)); + if (!uts) + return -EFAULT; + if (compat_get_timespec(&ts, cts)) + return -EFAULT; + if (copy_to_user(uts, &ts, sizeof(ts))) + return -EFAULT; + + *kts = uts; + return 0; +} + static long compat_nanosleep_restart(struct restart_block *restart) { struct compat_timespec __user *rmtp; @@ -229,21 +229,21 @@ static long compat_nanosleep_restart(struct restart_block *restart) if (ret) { rmtp = restart->nanosleep.compat_rmtp; - if (rmtp && put_compat_timespec(&rmt, rmtp)) + if (rmtp && compat_put_timespec(&rmt, rmtp)) return -EFAULT; } return ret; } -asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp, - struct compat_timespec __user *rmtp) +COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp, + struct compat_timespec __user *, rmtp) { struct timespec tu, rmt; mm_segment_t oldfs; long ret; - if (get_compat_timespec(&tu, rqtp)) + if (compat_get_timespec(&tu, rqtp)) return -EFAULT; if (!timespec_valid(&tu)) @@ -263,7 +263,7 @@ asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp, restart->fn = compat_nanosleep_restart; restart->nanosleep.compat_rmtp = rmtp; - if (rmtp && put_compat_timespec(&rmt, rmtp)) + if (rmtp && compat_put_timespec(&rmt, rmtp)) return -EFAULT; } @@ -328,7 +328,7 @@ static compat_clock_t clock_t_to_compat_clock_t(clock_t x) return compat_jiffies_to_clock_t(clock_t_to_jiffies(x)); } -asmlinkage long compat_sys_times(struct compat_tms __user *tbuf) +COMPAT_SYSCALL_DEFINE1(times, struct compat_tms __user *, tbuf) { if (tbuf) { struct tms tms; @@ -354,7 +354,7 @@ asmlinkage long compat_sys_times(struct compat_tms __user *tbuf) * types that can be passed to put_user()/get_user(). */ -asmlinkage long compat_sys_sigpending(compat_old_sigset_t __user *set) +COMPAT_SYSCALL_DEFINE1(sigpending, compat_old_sigset_t __user *, set) { old_sigset_t s; long ret; @@ -424,8 +424,8 @@ COMPAT_SYSCALL_DEFINE3(sigprocmask, int, how, #endif -asmlinkage long compat_sys_setrlimit(unsigned int resource, - struct compat_rlimit __user *rlim) +COMPAT_SYSCALL_DEFINE2(setrlimit, unsigned int, resource, + struct compat_rlimit __user *, rlim) { struct rlimit r; @@ -443,15 +443,15 @@ asmlinkage long compat_sys_setrlimit(unsigned int resource, #ifdef COMPAT_RLIM_OLD_INFINITY -asmlinkage long compat_sys_old_getrlimit(unsigned int resource, - struct compat_rlimit __user *rlim) +COMPAT_SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, + struct compat_rlimit __user *, rlim) { struct rlimit r; int ret; mm_segment_t old_fs = get_fs(); set_fs(KERNEL_DS); - ret = sys_old_getrlimit(resource, &r); + ret = sys_old_getrlimit(resource, (struct rlimit __user *)&r); set_fs(old_fs); if (!ret) { @@ -470,8 +470,8 @@ asmlinkage long compat_sys_old_getrlimit(unsigned int resource, #endif -asmlinkage long compat_sys_getrlimit(unsigned int resource, - struct compat_rlimit __user *rlim) +COMPAT_SYSCALL_DEFINE2(getrlimit, unsigned int, resource, + struct compat_rlimit __user *, rlim) { struct rlimit r; int ret; @@ -596,9 +596,9 @@ static int compat_get_user_cpu_mask(compat_ulong_t __user *user_mask_ptr, return compat_get_bitmap(k, user_mask_ptr, len * 8); } -asmlinkage long compat_sys_sched_setaffinity(compat_pid_t pid, - unsigned int len, - compat_ulong_t __user *user_mask_ptr) +COMPAT_SYSCALL_DEFINE3(sched_setaffinity, compat_pid_t, pid, + unsigned int, len, + compat_ulong_t __user *, user_mask_ptr) { cpumask_var_t new_mask; int retval; @@ -616,8 +616,8 @@ out: return retval; } -asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len, - compat_ulong_t __user *user_mask_ptr) +COMPAT_SYSCALL_DEFINE3(sched_getaffinity, compat_pid_t, pid, unsigned int, len, + compat_ulong_t __user *, user_mask_ptr) { int ret; cpumask_var_t mask; @@ -647,8 +647,8 @@ asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len, int get_compat_itimerspec(struct itimerspec *dst, const struct compat_itimerspec __user *src) { - if (get_compat_timespec(&dst->it_interval, &src->it_interval) || - get_compat_timespec(&dst->it_value, &src->it_value)) + if (__compat_get_timespec(&dst->it_interval, &src->it_interval) || + __compat_get_timespec(&dst->it_value, &src->it_value)) return -EFAULT; return 0; } @@ -656,15 +656,15 @@ int get_compat_itimerspec(struct itimerspec *dst, int put_compat_itimerspec(struct compat_itimerspec __user *dst, const struct itimerspec *src) { - if (put_compat_timespec(&src->it_interval, &dst->it_interval) || - put_compat_timespec(&src->it_value, &dst->it_value)) + if (__compat_put_timespec(&src->it_interval, &dst->it_interval) || + __compat_put_timespec(&src->it_value, &dst->it_value)) return -EFAULT; return 0; } -long compat_sys_timer_create(clockid_t which_clock, - struct compat_sigevent __user *timer_event_spec, - timer_t __user *created_timer_id) +COMPAT_SYSCALL_DEFINE3(timer_create, clockid_t, which_clock, + struct compat_sigevent __user *, timer_event_spec, + timer_t __user *, created_timer_id) { struct sigevent __user *event = NULL; @@ -680,9 +680,9 @@ long compat_sys_timer_create(clockid_t which_clock, return sys_timer_create(which_clock, event, created_timer_id); } -long compat_sys_timer_settime(timer_t timer_id, int flags, - struct compat_itimerspec __user *new, - struct compat_itimerspec __user *old) +COMPAT_SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, + struct compat_itimerspec __user *, new, + struct compat_itimerspec __user *, old) { long err; mm_segment_t oldfs; @@ -703,8 +703,8 @@ long compat_sys_timer_settime(timer_t timer_id, int flags, return err; } -long compat_sys_timer_gettime(timer_t timer_id, - struct compat_itimerspec __user *setting) +COMPAT_SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, + struct compat_itimerspec __user *, setting) { long err; mm_segment_t oldfs; @@ -720,14 +720,14 @@ long compat_sys_timer_gettime(timer_t timer_id, return err; } -long compat_sys_clock_settime(clockid_t which_clock, - struct compat_timespec __user *tp) +COMPAT_SYSCALL_DEFINE2(clock_settime, clockid_t, which_clock, + struct compat_timespec __user *, tp) { long err; mm_segment_t oldfs; struct timespec ts; - if (get_compat_timespec(&ts, tp)) + if (compat_get_timespec(&ts, tp)) return -EFAULT; oldfs = get_fs(); set_fs(KERNEL_DS); @@ -737,8 +737,8 @@ long compat_sys_clock_settime(clockid_t which_clock, return err; } -long compat_sys_clock_gettime(clockid_t which_clock, - struct compat_timespec __user *tp) +COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock, + struct compat_timespec __user *, tp) { long err; mm_segment_t oldfs; @@ -749,13 +749,13 @@ long compat_sys_clock_gettime(clockid_t which_clock, err = sys_clock_gettime(which_clock, (struct timespec __user *) &ts); set_fs(oldfs); - if (!err && put_compat_timespec(&ts, tp)) + if (!err && compat_put_timespec(&ts, tp)) return -EFAULT; return err; } -long compat_sys_clock_adjtime(clockid_t which_clock, - struct compat_timex __user *utp) +COMPAT_SYSCALL_DEFINE2(clock_adjtime, clockid_t, which_clock, + struct compat_timex __user *, utp) { struct timex txc; mm_segment_t oldfs; @@ -777,8 +777,8 @@ long compat_sys_clock_adjtime(clockid_t which_clock, return ret; } -long compat_sys_clock_getres(clockid_t which_clock, - struct compat_timespec __user *tp) +COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock, + struct compat_timespec __user *, tp) { long err; mm_segment_t oldfs; @@ -789,7 +789,7 @@ long compat_sys_clock_getres(clockid_t which_clock, err = sys_clock_getres(which_clock, (struct timespec __user *) &ts); set_fs(oldfs); - if (!err && tp && put_compat_timespec(&ts, tp)) + if (!err && tp && compat_put_timespec(&ts, tp)) return -EFAULT; return err; } @@ -799,7 +799,7 @@ static long compat_clock_nanosleep_restart(struct restart_block *restart) long err; mm_segment_t oldfs; struct timespec tu; - struct compat_timespec *rmtp = restart->nanosleep.compat_rmtp; + struct compat_timespec __user *rmtp = restart->nanosleep.compat_rmtp; restart->nanosleep.rmtp = (struct timespec __user *) &tu; oldfs = get_fs(); @@ -808,7 +808,7 @@ static long compat_clock_nanosleep_restart(struct restart_block *restart) set_fs(oldfs); if ((err == -ERESTART_RESTARTBLOCK) && rmtp && - put_compat_timespec(&tu, rmtp)) + compat_put_timespec(&tu, rmtp)) return -EFAULT; if (err == -ERESTART_RESTARTBLOCK) { @@ -818,16 +818,16 @@ static long compat_clock_nanosleep_restart(struct restart_block *restart) return err; } -long compat_sys_clock_nanosleep(clockid_t which_clock, int flags, - struct compat_timespec __user *rqtp, - struct compat_timespec __user *rmtp) +COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, + struct compat_timespec __user *, rqtp, + struct compat_timespec __user *, rmtp) { long err; mm_segment_t oldfs; struct timespec in, out; struct restart_block *restart; - if (get_compat_timespec(&in, rqtp)) + if (compat_get_timespec(&in, rqtp)) return -EFAULT; oldfs = get_fs(); @@ -838,7 +838,7 @@ long compat_sys_clock_nanosleep(clockid_t which_clock, int flags, set_fs(oldfs); if ((err == -ERESTART_RESTARTBLOCK) && rmtp && - put_compat_timespec(&out, rmtp)) + compat_put_timespec(&out, rmtp)) return -EFAULT; if (err == -ERESTART_RESTARTBLOCK) { @@ -1010,7 +1010,7 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese, /* compat_time_t is a 32 bit "long" and needs to get converted. */ -asmlinkage long compat_sys_time(compat_time_t __user * tloc) +COMPAT_SYSCALL_DEFINE1(time, compat_time_t __user *, tloc) { compat_time_t i; struct timeval tv; @@ -1026,7 +1026,7 @@ asmlinkage long compat_sys_time(compat_time_t __user * tloc) return i; } -asmlinkage long compat_sys_stime(compat_time_t __user *tptr) +COMPAT_SYSCALL_DEFINE1(stime, compat_time_t __user *, tptr) { struct timespec tv; int err; @@ -1046,7 +1046,7 @@ asmlinkage long compat_sys_stime(compat_time_t __user *tptr) #endif /* __ARCH_WANT_COMPAT_SYS_TIME */ -asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp) +COMPAT_SYSCALL_DEFINE1(adjtimex, struct compat_timex __user *, utp) { struct timex txc; int err, ret; @@ -1065,11 +1065,11 @@ asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp) } #ifdef CONFIG_NUMA -asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_pages, - compat_uptr_t __user *pages32, - const int __user *nodes, - int __user *status, - int flags) +COMPAT_SYSCALL_DEFINE6(move_pages, pid_t, pid, compat_ulong_t, nr_pages, + compat_uptr_t __user *, pages32, + const int __user *, nodes, + int __user *, status, + int, flags) { const void __user * __user *pages; int i; @@ -1085,10 +1085,10 @@ asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_pages, return sys_move_pages(pid, nr_pages, pages, nodes, status, flags); } -asmlinkage long compat_sys_migrate_pages(compat_pid_t pid, - compat_ulong_t maxnode, - const compat_ulong_t __user *old_nodes, - const compat_ulong_t __user *new_nodes) +COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid, + compat_ulong_t, maxnode, + const compat_ulong_t __user *, old_nodes, + const compat_ulong_t __user *, new_nodes) { unsigned long __user *old = NULL; unsigned long __user *new = NULL; @@ -1130,7 +1130,7 @@ COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval, set_fs(KERNEL_DS); ret = sys_sched_rr_get_interval(pid, (struct timespec __user *)&t); set_fs(old_fs); - if (put_compat_timespec(&t, interval)) + if (compat_put_timespec(&t, interval)) return -EFAULT; return ret; } diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 6cb20d2e7ee..5664985c46a 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -19,6 +19,7 @@ #include <linux/sched.h> #include <linux/hardirq.h> #include <linux/export.h> +#include <linux/kprobes.h> #define CREATE_TRACE_POINTS #include <trace/events/context_tracking.h> @@ -104,6 +105,7 @@ void context_tracking_user_enter(void) } local_irq_restore(flags); } +NOKPROBE_SYMBOL(context_tracking_user_enter); #ifdef CONFIG_PREEMPT /** @@ -120,7 +122,7 @@ void context_tracking_user_enter(void) * instead of preempt_schedule() to exit user context if needed before * calling the scheduler. */ -asmlinkage void __sched notrace preempt_schedule_context(void) +asmlinkage __visible void __sched notrace preempt_schedule_context(void) { enum ctx_state prev_ctx; @@ -181,6 +183,7 @@ void context_tracking_user_exit(void) } local_irq_restore(flags); } +NOKPROBE_SYMBOL(context_tracking_user_exit); /** * __context_tracking_task_switch - context switch the syscall callbacks diff --git a/kernel/cpu.c b/kernel/cpu.c index deff2e69376..a343bde710b 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -19,6 +19,8 @@ #include <linux/mutex.h> #include <linux/gfp.h> #include <linux/suspend.h> +#include <linux/lockdep.h> +#include <trace/events/power.h> #include "smpboot.h" @@ -27,18 +29,23 @@ static DEFINE_MUTEX(cpu_add_remove_lock); /* - * The following two API's must be used when attempting - * to serialize the updates to cpu_online_mask, cpu_present_mask. + * The following two APIs (cpu_maps_update_begin/done) must be used when + * attempting to serialize the updates to cpu_online_mask & cpu_present_mask. + * The APIs cpu_notifier_register_begin/done() must be used to protect CPU + * hotplug callback (un)registration performed using __register_cpu_notifier() + * or __unregister_cpu_notifier(). */ void cpu_maps_update_begin(void) { mutex_lock(&cpu_add_remove_lock); } +EXPORT_SYMBOL(cpu_notifier_register_begin); void cpu_maps_update_done(void) { mutex_unlock(&cpu_add_remove_lock); } +EXPORT_SYMBOL(cpu_notifier_register_done); static RAW_NOTIFIER_HEAD(cpu_chain); @@ -57,17 +64,30 @@ static struct { * an ongoing cpu hotplug operation. */ int refcount; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif } cpu_hotplug = { .active_writer = NULL, .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock), .refcount = 0, +#ifdef CONFIG_DEBUG_LOCK_ALLOC + .dep_map = {.name = "cpu_hotplug.lock" }, +#endif }; +/* Lockdep annotations for get/put_online_cpus() and cpu_hotplug_begin/end() */ +#define cpuhp_lock_acquire_read() lock_map_acquire_read(&cpu_hotplug.dep_map) +#define cpuhp_lock_acquire() lock_map_acquire(&cpu_hotplug.dep_map) +#define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map) + void get_online_cpus(void) { might_sleep(); if (cpu_hotplug.active_writer == current) return; + cpuhp_lock_acquire_read(); mutex_lock(&cpu_hotplug.lock); cpu_hotplug.refcount++; mutex_unlock(&cpu_hotplug.lock); @@ -87,6 +107,7 @@ void put_online_cpus(void) if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer)) wake_up_process(cpu_hotplug.active_writer); mutex_unlock(&cpu_hotplug.lock); + cpuhp_lock_release(); } EXPORT_SYMBOL_GPL(put_online_cpus); @@ -117,6 +138,7 @@ void cpu_hotplug_begin(void) { cpu_hotplug.active_writer = current; + cpuhp_lock_acquire(); for (;;) { mutex_lock(&cpu_hotplug.lock); if (likely(!cpu_hotplug.refcount)) @@ -131,6 +153,7 @@ void cpu_hotplug_done(void) { cpu_hotplug.active_writer = NULL; mutex_unlock(&cpu_hotplug.lock); + cpuhp_lock_release(); } /* @@ -166,6 +189,11 @@ int __ref register_cpu_notifier(struct notifier_block *nb) return ret; } +int __ref __register_cpu_notifier(struct notifier_block *nb) +{ + return raw_notifier_chain_register(&cpu_chain, nb); +} + static int __cpu_notify(unsigned long val, void *v, int nr_to_call, int *nr_calls) { @@ -189,6 +217,7 @@ static void cpu_notify_nofail(unsigned long val, void *v) BUG_ON(cpu_notify(val, v)); } EXPORT_SYMBOL(register_cpu_notifier); +EXPORT_SYMBOL(__register_cpu_notifier); void __ref unregister_cpu_notifier(struct notifier_block *nb) { @@ -198,6 +227,12 @@ void __ref unregister_cpu_notifier(struct notifier_block *nb) } EXPORT_SYMBOL(unregister_cpu_notifier); +void __ref __unregister_cpu_notifier(struct notifier_block *nb) +{ + raw_notifier_chain_unregister(&cpu_chain, nb); +} +EXPORT_SYMBOL(__unregister_cpu_notifier); + /** * clear_tasks_mm_cpumask - Safely clear tasks' mm_cpumask for a CPU * @cpu: a CPU id @@ -249,8 +284,7 @@ static inline void check_for_tasks(int cpu) task_cputime(p, &utime, &stime); if (task_cpu(p) == cpu && p->state == TASK_RUNNING && (utime || stime)) - printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d " - "(state = %ld, flags = %x)\n", + pr_warn("Task %s (pid = %d) is on cpu %d (state = %ld, flags = %x)\n", p->comm, task_pid_nr(p), cpu, p->state, p->flags); } @@ -302,8 +336,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) if (err) { nr_calls--; __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL); - printk("%s: attempt to take down CPU %u failed\n", - __func__, cpu); + pr_warn("%s: attempt to take down CPU %u failed\n", + __func__, cpu); goto out_release; } @@ -410,8 +444,8 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen) ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); if (ret) { nr_calls--; - printk(KERN_WARNING "%s: attempt to bring up CPU %u failed\n", - __func__, cpu); + pr_warn("%s: attempt to bring up CPU %u failed\n", + __func__, cpu); goto out_notify; } @@ -441,11 +475,10 @@ int cpu_up(unsigned int cpu) int err = 0; if (!cpu_possible(cpu)) { - printk(KERN_ERR "can't online cpu %d because it is not " - "configured as may-hotadd at boot time\n", cpu); + pr_err("can't online cpu %d because it is not configured as may-hotadd at boot time\n", + cpu); #if defined(CONFIG_IA64) - printk(KERN_ERR "please check additional_cpus= boot " - "parameter\n"); + pr_err("please check additional_cpus= boot parameter\n"); #endif return -EINVAL; } @@ -484,16 +517,17 @@ int disable_nonboot_cpus(void) */ cpumask_clear(frozen_cpus); - printk("Disabling non-boot CPUs ...\n"); + pr_info("Disabling non-boot CPUs ...\n"); for_each_online_cpu(cpu) { if (cpu == first_cpu) continue; + trace_suspend_resume(TPS("CPU_OFF"), cpu, true); error = _cpu_down(cpu, 1); + trace_suspend_resume(TPS("CPU_OFF"), cpu, false); if (!error) cpumask_set_cpu(cpu, frozen_cpus); else { - printk(KERN_ERR "Error taking CPU%d down: %d\n", - cpu, error); + pr_err("Error taking CPU%d down: %d\n", cpu, error); break; } } @@ -503,7 +537,7 @@ int disable_nonboot_cpus(void) /* Make sure the CPUs won't be enabled by someone else */ cpu_hotplug_disabled = 1; } else { - printk(KERN_ERR "Non-boot CPUs are not disabled\n"); + pr_err("Non-boot CPUs are not disabled\n"); } cpu_maps_update_done(); return error; @@ -527,17 +561,19 @@ void __ref enable_nonboot_cpus(void) if (cpumask_empty(frozen_cpus)) goto out; - printk(KERN_INFO "Enabling non-boot CPUs ...\n"); + pr_info("Enabling non-boot CPUs ...\n"); arch_enable_nonboot_cpus_begin(); for_each_cpu(cpu, frozen_cpus) { + trace_suspend_resume(TPS("CPU_ON"), cpu, true); error = _cpu_up(cpu, 1); + trace_suspend_resume(TPS("CPU_ON"), cpu, false); if (!error) { - printk(KERN_INFO "CPU%d is up\n", cpu); + pr_info("CPU%d is up\n", cpu); continue; } - printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error); + pr_warn("Error taking CPU%d up: %d\n", cpu, error); } arch_enable_nonboot_cpus_end(); @@ -692,10 +728,12 @@ void set_cpu_present(unsigned int cpu, bool present) void set_cpu_online(unsigned int cpu, bool online) { - if (online) + if (online) { cpumask_set_cpu(cpu, to_cpumask(cpu_online_bits)); - else + cpumask_set_cpu(cpu, to_cpumask(cpu_active_bits)); + } else { cpumask_clear_cpu(cpu, to_cpumask(cpu_online_bits)); + } } void set_cpu_active(unsigned int cpu, bool active) diff --git a/kernel/cpu/Makefile b/kernel/cpu/Makefile deleted file mode 100644 index 59ab052ef7a..00000000000 --- a/kernel/cpu/Makefile +++ /dev/null @@ -1 +0,0 @@ -obj-y = idle.o diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c deleted file mode 100644 index 277f494c2a9..00000000000 --- a/kernel/cpu/idle.c +++ /dev/null @@ -1,144 +0,0 @@ -/* - * Generic entry point for the idle threads - */ -#include <linux/sched.h> -#include <linux/cpu.h> -#include <linux/tick.h> -#include <linux/mm.h> -#include <linux/stackprotector.h> - -#include <asm/tlb.h> - -#include <trace/events/power.h> - -static int __read_mostly cpu_idle_force_poll; - -void cpu_idle_poll_ctrl(bool enable) -{ - if (enable) { - cpu_idle_force_poll++; - } else { - cpu_idle_force_poll--; - WARN_ON_ONCE(cpu_idle_force_poll < 0); - } -} - -#ifdef CONFIG_GENERIC_IDLE_POLL_SETUP -static int __init cpu_idle_poll_setup(char *__unused) -{ - cpu_idle_force_poll = 1; - return 1; -} -__setup("nohlt", cpu_idle_poll_setup); - -static int __init cpu_idle_nopoll_setup(char *__unused) -{ - cpu_idle_force_poll = 0; - return 1; -} -__setup("hlt", cpu_idle_nopoll_setup); -#endif - -static inline int cpu_idle_poll(void) -{ - rcu_idle_enter(); - trace_cpu_idle_rcuidle(0, smp_processor_id()); - local_irq_enable(); - while (!tif_need_resched()) - cpu_relax(); - trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); - rcu_idle_exit(); - return 1; -} - -/* Weak implementations for optional arch specific functions */ -void __weak arch_cpu_idle_prepare(void) { } -void __weak arch_cpu_idle_enter(void) { } -void __weak arch_cpu_idle_exit(void) { } -void __weak arch_cpu_idle_dead(void) { } -void __weak arch_cpu_idle(void) -{ - cpu_idle_force_poll = 1; - local_irq_enable(); -} - -/* - * Generic idle loop implementation - */ -static void cpu_idle_loop(void) -{ - while (1) { - tick_nohz_idle_enter(); - - while (!need_resched()) { - check_pgt_cache(); - rmb(); - - if (cpu_is_offline(smp_processor_id())) - arch_cpu_idle_dead(); - - local_irq_disable(); - arch_cpu_idle_enter(); - - /* - * In poll mode we reenable interrupts and spin. - * - * Also if we detected in the wakeup from idle - * path that the tick broadcast device expired - * for us, we don't want to go deep idle as we - * know that the IPI is going to arrive right - * away - */ - if (cpu_idle_force_poll || tick_check_broadcast_expired()) { - cpu_idle_poll(); - } else { - if (!current_clr_polling_and_test()) { - stop_critical_timings(); - rcu_idle_enter(); - arch_cpu_idle(); - WARN_ON_ONCE(irqs_disabled()); - rcu_idle_exit(); - start_critical_timings(); - } else { - local_irq_enable(); - } - __current_set_polling(); - } - arch_cpu_idle_exit(); - } - - /* - * Since we fell out of the loop above, we know - * TIF_NEED_RESCHED must be set, propagate it into - * PREEMPT_NEED_RESCHED. - * - * This is required because for polling idle loops we will - * not have had an IPI to fold the state for us. - */ - preempt_set_need_resched(); - tick_nohz_idle_exit(); - schedule_preempt_disabled(); - } -} - -void cpu_startup_entry(enum cpuhp_state state) -{ - /* - * This #ifdef needs to die, but it's too late in the cycle to - * make this generic (arm and sh have never invoked the canary - * init for the non boot cpus!). Will be fixed in 3.11 - */ -#ifdef CONFIG_X86 - /* - * If we're the non-boot CPU, nothing set the stack canary up - * for us. The boot CPU already has it initialized but no harm - * in doing it again. This is a good place for updating it, as - * we wont ever return from this function (so the invalid - * canaries already on the stack wont ever trigger). - */ - boot_init_stack_canary(); -#endif - __current_set_polling(); - arch_cpu_idle_prepare(); - cpu_idle_loop(); -} diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 4410ac6a55f..116a4164720 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -61,12 +61,7 @@ #include <linux/cgroup.h> #include <linux/wait.h> -/* - * Tracks how many cpusets are currently defined in system. - * When there is only one cpuset (the root cpuset) we can - * short circuit some hooks. - */ -int number_of_cpusets __read_mostly; +struct static_key cpusets_enabled_key __read_mostly = STATIC_KEY_INIT_FALSE; /* See "Frequency meter" comments, below. */ @@ -119,12 +114,12 @@ static inline struct cpuset *css_cs(struct cgroup_subsys_state *css) /* Retrieve the cpuset for a task */ static inline struct cpuset *task_cs(struct task_struct *task) { - return css_cs(task_css(task, cpuset_subsys_id)); + return css_cs(task_css(task, cpuset_cgrp_id)); } static inline struct cpuset *parent_cs(struct cpuset *cs) { - return css_cs(css_parent(&cs->css)); + return css_cs(cs->css.parent); } #ifdef CONFIG_NUMA @@ -467,7 +462,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) * be changed to have empty cpus_allowed or mems_allowed. */ ret = -ENOSPC; - if ((cgroup_task_count(cur->css.cgroup) || cur->attach_in_progress)) { + if ((cgroup_has_tasks(cur->css.cgroup) || cur->attach_in_progress)) { if (!cpumask_empty(cur->cpus_allowed) && cpumask_empty(trial->cpus_allowed)) goto out; @@ -611,7 +606,7 @@ static int generate_sched_domains(cpumask_var_t **domains, goto done; } - csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL); + csa = kmalloc(nr_cpusets() * sizeof(cp), GFP_KERNEL); if (!csa) goto done; csn = 0; @@ -696,11 +691,8 @@ restart: if (nslot == ndoms) { static int warnings = 10; if (warnings) { - printk(KERN_WARNING - "rebuild_sched_domains confused:" - " nslot %d, ndoms %d, csn %d, i %d," - " apn %d\n", - nslot, ndoms, csn, i, apn); + pr_warn("rebuild_sched_domains confused: nslot %d, ndoms %d, csn %d, i %d, apn %d\n", + nslot, ndoms, csn, i, apn); warnings--; } continue; @@ -829,55 +821,36 @@ static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs) } /** - * cpuset_change_cpumask - make a task's cpus_allowed the same as its cpuset's - * @tsk: task to test - * @data: cpuset to @tsk belongs to - * - * Called by css_scan_tasks() for each task in a cgroup whose cpus_allowed - * mask needs to be changed. - * - * We don't need to re-check for the cgroup/cpuset membership, since we're - * holding cpuset_mutex at this point. - */ -static void cpuset_change_cpumask(struct task_struct *tsk, void *data) -{ - struct cpuset *cs = data; - struct cpuset *cpus_cs = effective_cpumask_cpuset(cs); - - set_cpus_allowed_ptr(tsk, cpus_cs->cpus_allowed); -} - -/** * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed - * @heap: if NULL, defer allocating heap memory to css_scan_tasks() - * - * Called with cpuset_mutex held * - * The css_scan_tasks() function will scan all the tasks in a cgroup, - * calling callback functions for each. - * - * No return value. It's guaranteed that css_scan_tasks() always returns 0 - * if @heap != NULL. + * Iterate through each task of @cs updating its cpus_allowed to the + * effective cpuset's. As this function is called with cpuset_mutex held, + * cpuset membership stays stable. */ -static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap) +static void update_tasks_cpumask(struct cpuset *cs) { - css_scan_tasks(&cs->css, NULL, cpuset_change_cpumask, cs, heap); + struct cpuset *cpus_cs = effective_cpumask_cpuset(cs); + struct css_task_iter it; + struct task_struct *task; + + css_task_iter_start(&cs->css, &it); + while ((task = css_task_iter_next(&it))) + set_cpus_allowed_ptr(task, cpus_cs->cpus_allowed); + css_task_iter_end(&it); } /* * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy. * @root_cs: the root cpuset of the hierarchy * @update_root: update root cpuset or not? - * @heap: the heap used by css_scan_tasks() * * This will update cpumasks of tasks in @root_cs and all other empty cpusets * which take on cpumask of @root_cs. * * Called with cpuset_mutex held */ -static void update_tasks_cpumask_hier(struct cpuset *root_cs, - bool update_root, struct ptr_heap *heap) +static void update_tasks_cpumask_hier(struct cpuset *root_cs, bool update_root) { struct cpuset *cp; struct cgroup_subsys_state *pos_css; @@ -894,11 +867,11 @@ static void update_tasks_cpumask_hier(struct cpuset *root_cs, continue; } } - if (!css_tryget(&cp->css)) + if (!css_tryget_online(&cp->css)) continue; rcu_read_unlock(); - update_tasks_cpumask(cp, heap); + update_tasks_cpumask(cp); rcu_read_lock(); css_put(&cp->css); @@ -909,12 +882,12 @@ static void update_tasks_cpumask_hier(struct cpuset *root_cs, /** * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it * @cs: the cpuset to consider + * @trialcs: trial cpuset * @buf: buffer of cpu numbers written to this cpuset */ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, const char *buf) { - struct ptr_heap heap; int retval; int is_load_balanced; @@ -947,19 +920,13 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (retval < 0) return retval; - retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL); - if (retval) - return retval; - is_load_balanced = is_sched_load_balance(trialcs); mutex_lock(&callback_mutex); cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); mutex_unlock(&callback_mutex); - update_tasks_cpumask_hier(cs, true, &heap); - - heap_free(&heap); + update_tasks_cpumask_hier(cs, true); if (is_load_balanced) rebuild_sched_domains_locked(); @@ -974,12 +941,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, * Temporarilly set tasks mems_allowed to target nodes of migration, * so that the migration code can allocate pages on these nodes. * - * Call holding cpuset_mutex, so current's cpuset won't change - * during this call, as manage_mutex holds off any cpuset_attach() - * calls. Therefore we don't need to take task_lock around the - * call to guarantee_online_mems(), as we know no one is changing - * our task's cpuset. - * * While the mm_struct we are migrating is typically from some * other task, the task_struct mems_allowed that we are hacking * is for our current task, which must allocate new pages for that @@ -996,8 +957,10 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL); + rcu_read_lock(); mems_cs = effective_nodemask_cpuset(task_cs(tsk)); guarantee_online_mems(mems_cs, &tsk->mems_allowed); + rcu_read_unlock(); } /* @@ -1026,7 +989,7 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, task_lock(tsk); /* * Determine if a loop is necessary if another thread is doing - * get_mems_allowed(). If at least one node remains unchanged and + * read_mems_allowed_begin(). If at least one node remains unchanged and * tsk does not have a mempolicy, then an empty nodemask will not be * possible when mems_allowed is larger than a word. */ @@ -1052,53 +1015,22 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, task_unlock(tsk); } -struct cpuset_change_nodemask_arg { - struct cpuset *cs; - nodemask_t *newmems; -}; - -/* - * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy - * of it to cpuset's new mems_allowed, and migrate pages to new nodes if - * memory_migrate flag is set. Called with cpuset_mutex held. - */ -static void cpuset_change_nodemask(struct task_struct *p, void *data) -{ - struct cpuset_change_nodemask_arg *arg = data; - struct cpuset *cs = arg->cs; - struct mm_struct *mm; - int migrate; - - cpuset_change_task_nodemask(p, arg->newmems); - - mm = get_task_mm(p); - if (!mm) - return; - - migrate = is_memory_migrate(cs); - - mpol_rebind_mm(mm, &cs->mems_allowed); - if (migrate) - cpuset_migrate_mm(mm, &cs->old_mems_allowed, arg->newmems); - mmput(mm); -} - static void *cpuset_being_rebound; /** * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. * @cs: the cpuset in which each task's mems_allowed mask needs to be changed - * @heap: if NULL, defer allocating heap memory to css_scan_tasks() * - * Called with cpuset_mutex held. No return value. It's guaranteed that - * css_scan_tasks() always returns 0 if @heap != NULL. + * Iterate through each task of @cs updating its mems_allowed to the + * effective cpuset's. As this function is called with cpuset_mutex held, + * cpuset membership stays stable. */ -static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap) +static void update_tasks_nodemask(struct cpuset *cs) { static nodemask_t newmems; /* protected by cpuset_mutex */ struct cpuset *mems_cs = effective_nodemask_cpuset(cs); - struct cpuset_change_nodemask_arg arg = { .cs = cs, - .newmems = &newmems }; + struct css_task_iter it; + struct task_struct *task; cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ @@ -1114,7 +1046,25 @@ static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap) * It's ok if we rebind the same mm twice; mpol_rebind_mm() * is idempotent. Also migrate pages in each mm to new nodes. */ - css_scan_tasks(&cs->css, NULL, cpuset_change_nodemask, &arg, heap); + css_task_iter_start(&cs->css, &it); + while ((task = css_task_iter_next(&it))) { + struct mm_struct *mm; + bool migrate; + + cpuset_change_task_nodemask(task, &newmems); + + mm = get_task_mm(task); + if (!mm) + continue; + + migrate = is_memory_migrate(cs); + + mpol_rebind_mm(mm, &cs->mems_allowed); + if (migrate) + cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems); + mmput(mm); + } + css_task_iter_end(&it); /* * All the tasks' nodemasks have been updated, update @@ -1130,15 +1080,13 @@ static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap) * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy. * @cs: the root cpuset of the hierarchy * @update_root: update the root cpuset or not? - * @heap: the heap used by css_scan_tasks() * * This will update nodemasks of tasks in @root_cs and all other empty cpusets * which take on nodemask of @root_cs. * * Called with cpuset_mutex held */ -static void update_tasks_nodemask_hier(struct cpuset *root_cs, - bool update_root, struct ptr_heap *heap) +static void update_tasks_nodemask_hier(struct cpuset *root_cs, bool update_root) { struct cpuset *cp; struct cgroup_subsys_state *pos_css; @@ -1155,11 +1103,11 @@ static void update_tasks_nodemask_hier(struct cpuset *root_cs, continue; } } - if (!css_tryget(&cp->css)) + if (!css_tryget_online(&cp->css)) continue; rcu_read_unlock(); - update_tasks_nodemask(cp, heap); + update_tasks_nodemask(cp); rcu_read_lock(); css_put(&cp->css); @@ -1184,7 +1132,6 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, const char *buf) { int retval; - struct ptr_heap heap; /* * top_cpuset.mems_allowed tracks node_stats[N_MEMORY]; @@ -1223,24 +1170,24 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, if (retval < 0) goto done; - retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL); - if (retval < 0) - goto done; - mutex_lock(&callback_mutex); cs->mems_allowed = trialcs->mems_allowed; mutex_unlock(&callback_mutex); - update_tasks_nodemask_hier(cs, true, &heap); - - heap_free(&heap); + update_tasks_nodemask_hier(cs, true); done: return retval; } int current_cpuset_is_being_rebound(void) { - return task_cs(current) == cpuset_being_rebound; + int ret; + + rcu_read_lock(); + ret = task_cs(current) == cpuset_being_rebound; + rcu_read_unlock(); + + return ret; } static int update_relax_domain_level(struct cpuset *cs, s64 val) @@ -1261,38 +1208,22 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) } /** - * cpuset_change_flag - make a task's spread flags the same as its cpuset's - * @tsk: task to be updated - * @data: cpuset to @tsk belongs to - * - * Called by css_scan_tasks() for each task in a cgroup. - * - * We don't need to re-check for the cgroup/cpuset membership, since we're - * holding cpuset_mutex at this point. - */ -static void cpuset_change_flag(struct task_struct *tsk, void *data) -{ - struct cpuset *cs = data; - - cpuset_update_task_spread_flag(cs, tsk); -} - -/** * update_tasks_flags - update the spread flags of tasks in the cpuset. * @cs: the cpuset in which each task's spread flags needs to be changed - * @heap: if NULL, defer allocating heap memory to css_scan_tasks() * - * Called with cpuset_mutex held - * - * The css_scan_tasks() function will scan all the tasks in a cgroup, - * calling callback functions for each. - * - * No return value. It's guaranteed that css_scan_tasks() always returns 0 - * if @heap != NULL. + * Iterate through each task of @cs updating its spread flags. As this + * function is called with cpuset_mutex held, cpuset membership stays + * stable. */ -static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap) +static void update_tasks_flags(struct cpuset *cs) { - css_scan_tasks(&cs->css, NULL, cpuset_change_flag, cs, heap); + struct css_task_iter it; + struct task_struct *task; + + css_task_iter_start(&cs->css, &it); + while ((task = css_task_iter_next(&it))) + cpuset_update_task_spread_flag(cs, task); + css_task_iter_end(&it); } /* @@ -1310,7 +1241,6 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, struct cpuset *trialcs; int balance_flag_changed; int spread_flag_changed; - struct ptr_heap heap; int err; trialcs = alloc_trial_cpuset(cs); @@ -1326,10 +1256,6 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, if (err < 0) goto out; - err = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL); - if (err < 0) - goto out; - balance_flag_changed = (is_sched_load_balance(cs) != is_sched_load_balance(trialcs)); @@ -1344,8 +1270,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, rebuild_sched_domains_locked(); if (spread_flag_changed) - update_tasks_flags(cs, &heap); - heap_free(&heap); + update_tasks_flags(cs); out: free_trial_cpuset(trialcs); return err; @@ -1449,6 +1374,8 @@ static int fmeter_getrate(struct fmeter *fmp) return val; } +static struct cpuset *cpuset_attach_old_cs; + /* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ static int cpuset_can_attach(struct cgroup_subsys_state *css, struct cgroup_taskset *tset) @@ -1457,6 +1384,9 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css, struct task_struct *task; int ret; + /* used later by cpuset_attach() */ + cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset)); + mutex_lock(&cpuset_mutex); /* @@ -1468,7 +1398,7 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css, (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) goto out_unlock; - cgroup_taskset_for_each(task, css, tset) { + cgroup_taskset_for_each(task, tset) { /* * Kthreads which disallow setaffinity shouldn't be moved * to a new cpuset; we don't want to change their cpu @@ -1520,10 +1450,8 @@ static void cpuset_attach(struct cgroup_subsys_state *css, struct mm_struct *mm; struct task_struct *task; struct task_struct *leader = cgroup_taskset_first(tset); - struct cgroup_subsys_state *oldcss = cgroup_taskset_cur_css(tset, - cpuset_subsys_id); struct cpuset *cs = css_cs(css); - struct cpuset *oldcs = css_cs(oldcss); + struct cpuset *oldcs = cpuset_attach_old_cs; struct cpuset *cpus_cs = effective_cpumask_cpuset(cs); struct cpuset *mems_cs = effective_nodemask_cpuset(cs); @@ -1537,7 +1465,7 @@ static void cpuset_attach(struct cgroup_subsys_state *css, guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to); - cgroup_taskset_for_each(task, css, tset) { + cgroup_taskset_for_each(task, tset) { /* * can_attach beforehand should guarantee that this doesn't * fail. TODO: have a better way to handle failure here @@ -1676,13 +1604,15 @@ out_unlock: /* * Common handling for a write to a "cpus" or "mems" file. */ -static int cpuset_write_resmask(struct cgroup_subsys_state *css, - struct cftype *cft, const char *buf) +static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { - struct cpuset *cs = css_cs(css); + struct cpuset *cs = css_cs(of_css(of)); struct cpuset *trialcs; int retval = -ENODEV; + buf = strstrip(buf); + /* * CPU or memory hotunplug may leave @cs w/o any execution * resources, in which case the hotplug code asynchronously updates @@ -1693,7 +1623,17 @@ static int cpuset_write_resmask(struct cgroup_subsys_state *css, * resources, wait for the previously scheduled operations before * proceeding, so that we don't end up keep removing tasks added * after execution capability is restored. + * + * cpuset_hotplug_work calls back into cgroup core via + * cgroup_transfer_tasks() and waiting for it from a cgroupfs + * operation like this one can lead to a deadlock through kernfs + * active_ref protection. Let's break the protection. Losing the + * protection is okay as we check whether @cs is online after + * grabbing cpuset_mutex anyway. This only happens on the legacy + * hierarchies. */ + css_get(&cs->css); + kernfs_break_active_protection(of->kn); flush_work(&cpuset_hotplug_work); mutex_lock(&cpuset_mutex); @@ -1706,7 +1646,7 @@ static int cpuset_write_resmask(struct cgroup_subsys_state *css, goto out_unlock; } - switch (cft->private) { + switch (of_cft(of)->private) { case FILE_CPULIST: retval = update_cpumask(cs, trialcs, buf); break; @@ -1721,7 +1661,9 @@ static int cpuset_write_resmask(struct cgroup_subsys_state *css, free_trial_cpuset(trialcs); out_unlock: mutex_unlock(&cpuset_mutex); - return retval; + kernfs_unbreak_active_protection(of->kn); + css_put(&cs->css); + return retval ?: nbytes; } /* @@ -1823,7 +1765,7 @@ static struct cftype files[] = { { .name = "cpus", .seq_show = cpuset_common_seq_show, - .write_string = cpuset_write_resmask, + .write = cpuset_write_resmask, .max_write_len = (100U + 6 * NR_CPUS), .private = FILE_CPULIST, }, @@ -1831,7 +1773,7 @@ static struct cftype files[] = { { .name = "mems", .seq_show = cpuset_common_seq_show, - .write_string = cpuset_write_resmask, + .write = cpuset_write_resmask, .max_write_len = (100U + 6 * MAX_NUMNODES), .private = FILE_MEMLIST, }, @@ -1959,7 +1901,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) if (is_spread_slab(parent)) set_bit(CS_SPREAD_SLAB, &cs->flags); - number_of_cpusets++; + cpuset_inc(); if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) goto out_unlock; @@ -2010,7 +1952,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) if (is_sched_load_balance(cs)) update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); - number_of_cpusets--; + cpuset_dec(); clear_bit(CS_ONLINE, &cs->flags); mutex_unlock(&cpuset_mutex); @@ -2024,8 +1966,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css) kfree(cs); } -struct cgroup_subsys cpuset_subsys = { - .name = "cpuset", +struct cgroup_subsys cpuset_cgrp_subsys = { .css_alloc = cpuset_css_alloc, .css_online = cpuset_css_online, .css_offline = cpuset_css_offline, @@ -2033,7 +1974,6 @@ struct cgroup_subsys cpuset_subsys = { .can_attach = cpuset_can_attach, .cancel_attach = cpuset_cancel_attach, .attach = cpuset_attach, - .subsys_id = cpuset_subsys_id, .base_cftypes = files, .early_init = 1, }; @@ -2065,7 +2005,6 @@ int __init cpuset_init(void) if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)) BUG(); - number_of_cpusets = 1; return 0; } @@ -2090,10 +2029,9 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) parent = parent_cs(parent); if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) { - rcu_read_lock(); - printk(KERN_ERR "cpuset: failed to transfer tasks out of empty cpuset %s\n", - cgroup_name(cs->css.cgroup)); - rcu_read_unlock(); + pr_err("cpuset: failed to transfer tasks out of empty cpuset "); + pr_cont_cgroup_name(cs->css.cgroup); + pr_cont("\n"); } } @@ -2141,7 +2079,7 @@ retry: */ if ((sane && cpumask_empty(cs->cpus_allowed)) || (!cpumask_empty(&off_cpus) && !cpumask_empty(cs->cpus_allowed))) - update_tasks_cpumask(cs, NULL); + update_tasks_cpumask(cs); mutex_lock(&callback_mutex); nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems); @@ -2155,7 +2093,7 @@ retry: */ if ((sane && nodes_empty(cs->mems_allowed)) || (!nodes_empty(off_mems) && !nodes_empty(cs->mems_allowed))) - update_tasks_nodemask(cs, NULL); + update_tasks_nodemask(cs); is_empty = cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed); @@ -2217,7 +2155,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) mutex_lock(&callback_mutex); top_cpuset.mems_allowed = new_mems; mutex_unlock(&callback_mutex); - update_tasks_nodemask(&top_cpuset, NULL); + update_tasks_nodemask(&top_cpuset); } mutex_unlock(&cpuset_mutex); @@ -2229,7 +2167,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) rcu_read_lock(); cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { - if (cs == &top_cpuset || !css_tryget(&cs->css)) + if (cs == &top_cpuset || !css_tryget_online(&cs->css)) continue; rcu_read_unlock(); @@ -2309,10 +2247,10 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) struct cpuset *cpus_cs; mutex_lock(&callback_mutex); - task_lock(tsk); + rcu_read_lock(); cpus_cs = effective_cpumask_cpuset(task_cs(tsk)); guarantee_online_cpus(cpus_cs, pmask); - task_unlock(tsk); + rcu_read_unlock(); mutex_unlock(&callback_mutex); } @@ -2365,10 +2303,10 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk) nodemask_t mask; mutex_lock(&callback_mutex); - task_lock(tsk); + rcu_read_lock(); mems_cs = effective_nodemask_cpuset(task_cs(tsk)); guarantee_online_mems(mems_cs, &mask); - task_unlock(tsk); + rcu_read_unlock(); mutex_unlock(&callback_mutex); return mask; @@ -2484,11 +2422,11 @@ int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask) /* Not hardwall and node outside mems_allowed: scan up cpusets */ mutex_lock(&callback_mutex); - task_lock(current); + rcu_read_lock(); cs = nearest_hardwall_ancestor(task_cs(current)); - task_unlock(current); - allowed = node_isset(node, cs->mems_allowed); + rcu_read_unlock(); + mutex_unlock(&callback_mutex); return allowed; } @@ -2610,30 +2548,30 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, /** * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed - * @task: pointer to task_struct of some task. + * @tsk: pointer to task_struct of some task. * * Description: Prints @task's name, cpuset name, and cached copy of its - * mems_allowed to the kernel log. Must hold task_lock(task) to allow - * dereferencing task_cs(task). + * mems_allowed to the kernel log. */ void cpuset_print_task_mems_allowed(struct task_struct *tsk) { /* Statically allocated to prevent using excess stack. */ static char cpuset_nodelist[CPUSET_NODELIST_LEN]; static DEFINE_SPINLOCK(cpuset_buffer_lock); + struct cgroup *cgrp; - struct cgroup *cgrp = task_cs(tsk)->css.cgroup; - - rcu_read_lock(); spin_lock(&cpuset_buffer_lock); + rcu_read_lock(); + cgrp = task_cs(tsk)->css.cgroup; nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN, tsk->mems_allowed); - printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n", - tsk->comm, cgroup_name(cgrp), cpuset_nodelist); + pr_info("%s cpuset=", tsk->comm); + pr_cont_cgroup_name(cgrp); + pr_cont(" mems_allowed=%s\n", cpuset_nodelist); - spin_unlock(&cpuset_buffer_lock); rcu_read_unlock(); + spin_unlock(&cpuset_buffer_lock); } /* @@ -2664,9 +2602,9 @@ int cpuset_memory_pressure_enabled __read_mostly; void __cpuset_memory_pressure_bump(void) { - task_lock(current); + rcu_read_lock(); fmeter_markevent(&task_cs(current)->fmeter); - task_unlock(current); + rcu_read_unlock(); } #ifdef CONFIG_PROC_PID_CPUSET @@ -2683,12 +2621,12 @@ int proc_cpuset_show(struct seq_file *m, void *unused_v) { struct pid *pid; struct task_struct *tsk; - char *buf; + char *buf, *p; struct cgroup_subsys_state *css; int retval; retval = -ENOMEM; - buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + buf = kmalloc(PATH_MAX, GFP_KERNEL); if (!buf) goto out; @@ -2698,14 +2636,16 @@ int proc_cpuset_show(struct seq_file *m, void *unused_v) if (!tsk) goto out_free; + retval = -ENAMETOOLONG; rcu_read_lock(); - css = task_css(tsk, cpuset_subsys_id); - retval = cgroup_path(css->cgroup, buf, PAGE_SIZE); + css = task_css(tsk, cpuset_cgrp_id); + p = cgroup_path(css->cgroup, buf, PATH_MAX); rcu_read_unlock(); - if (retval < 0) + if (!p) goto out_put_task; - seq_puts(m, buf); + seq_puts(m, p); seq_putc(m, '\n'); + retval = 0; out_put_task: put_task_struct(tsk); out_free: @@ -2718,10 +2658,10 @@ out: /* Display task mems_allowed in /proc/<pid>/status file. */ void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) { - seq_printf(m, "Mems_allowed:\t"); + seq_puts(m, "Mems_allowed:\t"); seq_nodemask(m, &task->mems_allowed); - seq_printf(m, "\n"); - seq_printf(m, "Mems_allowed_list:\t"); + seq_puts(m, "\n"); + seq_puts(m, "Mems_allowed_list:\t"); seq_nodemask_list(m, &task->mems_allowed); - seq_printf(m, "\n"); + seq_puts(m, "\n"); } diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 334b3980ffc..1adf62b39b9 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -49,6 +49,7 @@ #include <linux/pid.h> #include <linux/smp.h> #include <linux/mm.h> +#include <linux/vmacache.h> #include <linux/rcupdate.h> #include <asm/cacheflush.h> @@ -224,10 +225,17 @@ static void kgdb_flush_swbreak_addr(unsigned long addr) if (!CACHE_FLUSH_IS_SAFE) return; - if (current->mm && current->mm->mmap_cache) { - flush_cache_range(current->mm->mmap_cache, - addr, addr + BREAK_INSTR_SIZE); + if (current->mm) { + int i; + + for (i = 0; i < VMACACHE_SIZE; i++) { + if (!current->vmacache[i]) + continue; + flush_cache_range(current->vmacache[i], + addr, addr + BREAK_INSTR_SIZE); + } } + /* Force flush instruction cache if it was outside the mm */ flush_icache_range(addr, addr + BREAK_INSTR_SIZE); } @@ -526,7 +534,7 @@ return_normal: kgdb_info[cpu].exception_state &= ~(DCPU_WANT_MASTER | DCPU_IS_SLAVE); kgdb_info[cpu].enter_kgdb--; - smp_mb__before_atomic_dec(); + smp_mb__before_atomic(); atomic_dec(&slaves_in_kgdb); dbg_touch_watchdogs(); local_irq_restore(flags); @@ -654,7 +662,7 @@ kgdb_restore: kgdb_info[cpu].exception_state &= ~(DCPU_WANT_MASTER | DCPU_IS_SLAVE); kgdb_info[cpu].enter_kgdb--; - smp_mb__before_atomic_dec(); + smp_mb__before_atomic(); atomic_dec(&masters_in_kgdb); /* Free kgdb_active */ atomic_set(&kgdb_active, -1); @@ -1035,7 +1043,7 @@ int dbg_io_get_char(void) * otherwise as a quick means to stop program execution and "break" into * the debugger. */ -void kgdb_breakpoint(void) +noinline void kgdb_breakpoint(void) { atomic_inc(&kgdb_setting_breakpoint); wmb(); /* Sync point before breakpoint */ diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c index b03e0e814e4..fe15fff5df5 100644 --- a/kernel/debug/kdb/kdb_bt.c +++ b/kernel/debug/kdb/kdb_bt.c @@ -21,7 +21,7 @@ static void kdb_show_stack(struct task_struct *p, void *addr) { int old_lvl = console_loglevel; - console_loglevel = 15; + console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH; kdb_trap_printk++; kdb_set_current_task(p); if (addr) { diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 14ff4849262..7c70812caea 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -710,7 +710,7 @@ kdb_printit: } if (logging) { saved_loglevel = console_loglevel; - console_loglevel = 0; + console_loglevel = CONSOLE_LOGLEVEL_SILENT; printk(KERN_INFO "%s", kdb_buffer); } diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 0b097c8a1e5..2f7c760305c 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -1091,7 +1091,7 @@ static int kdb_reboot(int argc, const char **argv) static void kdb_dumpregs(struct pt_regs *regs) { int old_lvl = console_loglevel; - console_loglevel = 15; + console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH; kdb_trap_printk++; show_regs(regs); kdb_trap_printk--; diff --git a/kernel/events/core.c b/kernel/events/core.c index 56003c6edfd..6b17ac1b0c2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -39,6 +39,8 @@ #include <linux/hw_breakpoint.h> #include <linux/mm_types.h> #include <linux/cgroup.h> +#include <linux/module.h> +#include <linux/mman.h> #include "internal.h" @@ -231,11 +233,29 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, #define NR_ACCUMULATED_SAMPLES 128 static DEFINE_PER_CPU(u64, running_sample_length); -void perf_sample_event_took(u64 sample_len_ns) +static void perf_duration_warn(struct irq_work *w) { + u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns); u64 avg_local_sample_len; u64 local_samples_len; + + local_samples_len = __get_cpu_var(running_sample_length); + avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES; + + printk_ratelimited(KERN_WARNING + "perf interrupt took too long (%lld > %lld), lowering " + "kernel.perf_event_max_sample_rate to %d\n", + avg_local_sample_len, allowed_ns >> 1, + sysctl_perf_event_sample_rate); +} + +static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn); + +void perf_sample_event_took(u64 sample_len_ns) +{ u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns); + u64 avg_local_sample_len; + u64 local_samples_len; if (allowed_ns == 0) return; @@ -263,13 +283,14 @@ void perf_sample_event_took(u64 sample_len_ns) sysctl_perf_event_sample_rate = max_samples_per_tick * HZ; perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; - printk_ratelimited(KERN_WARNING - "perf samples too long (%lld > %lld), lowering " - "kernel.perf_event_max_sample_rate to %d\n", - avg_local_sample_len, allowed_ns, - sysctl_perf_event_sample_rate); - update_perf_cpu_limits(); + + if (!irq_work_queue(&perf_duration_work)) { + early_printk("perf interrupt took too long (%lld > %lld), lowering " + "kernel.perf_event_max_sample_rate to %d\n", + avg_local_sample_len, allowed_ns >> 1, + sysctl_perf_event_sample_rate); + } } static atomic64_t perf_event_id; @@ -342,7 +363,7 @@ struct perf_cgroup { static inline struct perf_cgroup * perf_cgroup_from_task(struct task_struct *task) { - return container_of(task_css(task, perf_subsys_id), + return container_of(task_css(task, perf_event_cgrp_id), struct perf_cgroup, css); } @@ -370,11 +391,6 @@ perf_cgroup_match(struct perf_event *event) event->cgrp->css.cgroup); } -static inline bool perf_tryget_cgroup(struct perf_event *event) -{ - return css_tryget(&event->cgrp->css); -} - static inline void perf_put_cgroup(struct perf_event *event) { css_put(&event->cgrp->css); @@ -593,9 +609,8 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, if (!f.file) return -EBADF; - rcu_read_lock(); - - css = css_from_dir(f.file->f_dentry, &perf_subsys); + css = css_tryget_online_from_dir(f.file->f_dentry, + &perf_event_cgrp_subsys); if (IS_ERR(css)) { ret = PTR_ERR(css); goto out; @@ -604,13 +619,6 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, cgrp = container_of(css, struct perf_cgroup, css); event->cgrp = cgrp; - /* must be done before we fput() the file */ - if (!perf_tryget_cgroup(event)) { - event->cgrp = NULL; - ret = -ENOENT; - goto out; - } - /* * all events in a group must monitor * the same cgroup because a task belongs @@ -621,7 +629,6 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, ret = -EINVAL; } out: - rcu_read_unlock(); fdput(f); return ret; } @@ -1439,6 +1446,11 @@ group_sched_out(struct perf_event *group_event, cpuctx->exclusive = 0; } +struct remove_event { + struct perf_event *event; + bool detach_group; +}; + /* * Cross CPU call to remove a performance event * @@ -1447,12 +1459,15 @@ group_sched_out(struct perf_event *group_event, */ static int __perf_remove_from_context(void *info) { - struct perf_event *event = info; + struct remove_event *re = info; + struct perf_event *event = re->event; struct perf_event_context *ctx = event->ctx; struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); raw_spin_lock(&ctx->lock); event_sched_out(event, cpuctx, ctx); + if (re->detach_group) + perf_group_detach(event); list_del_event(event, ctx); if (!ctx->nr_events && cpuctx->task_ctx == ctx) { ctx->is_active = 0; @@ -1477,10 +1492,14 @@ static int __perf_remove_from_context(void *info) * When called from perf_event_exit_task, it's OK because the * context has been detached from its task. */ -static void perf_remove_from_context(struct perf_event *event) +static void perf_remove_from_context(struct perf_event *event, bool detach_group) { struct perf_event_context *ctx = event->ctx; struct task_struct *task = ctx->task; + struct remove_event re = { + .event = event, + .detach_group = detach_group, + }; lockdep_assert_held(&ctx->mutex); @@ -1489,12 +1508,12 @@ static void perf_remove_from_context(struct perf_event *event) * Per cpu events are removed via an smp call and * the removal is always successful. */ - cpu_function_call(event->cpu, __perf_remove_from_context, event); + cpu_function_call(event->cpu, __perf_remove_from_context, &re); return; } retry: - if (!task_function_call(task, __perf_remove_from_context, event)) + if (!task_function_call(task, __perf_remove_from_context, &re)) return; raw_spin_lock_irq(&ctx->lock); @@ -1511,6 +1530,8 @@ retry: * Since the task isn't running, its safe to remove the event, us * holding the ctx->lock ensures the task won't get scheduled in. */ + if (detach_group) + perf_group_detach(event); list_del_event(event, ctx); raw_spin_unlock_irq(&ctx->lock); } @@ -1659,6 +1680,8 @@ event_sched_in(struct perf_event *event, u64 tstamp = perf_event_time(event); int ret = 0; + lockdep_assert_held(&ctx->lock); + if (event->state <= PERF_EVENT_STATE_OFF) return 0; @@ -1714,7 +1737,7 @@ group_sched_in(struct perf_event *group_event, struct perf_event_context *ctx) { struct perf_event *event, *partial_group = NULL; - struct pmu *pmu = group_event->pmu; + struct pmu *pmu = ctx->pmu; u64 now = ctx->time; bool simulate = false; @@ -2297,7 +2320,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, next_parent = rcu_dereference(next_ctx->parent_ctx); /* If neither context have a parent context; they cannot be clones. */ - if (!parent && !next_parent) + if (!parent || !next_parent) goto unlock; if (next_parent == ctx || next_ctx == parent || next_parent == parent) { @@ -2563,8 +2586,6 @@ static void perf_branch_stack_sched_in(struct task_struct *prev, if (cpuctx->ctx.nr_branch_stack > 0 && pmu->flush_branch_stack) { - pmu = cpuctx->ctx.pmu; - perf_ctx_lock(cpuctx, cpuctx->task_ctx); perf_pmu_disable(pmu); @@ -2954,6 +2975,22 @@ out: local_irq_restore(flags); } +void perf_event_exec(void) +{ + struct perf_event_context *ctx; + int ctxn; + + rcu_read_lock(); + for_each_task_context_nr(ctxn) { + ctx = current->perf_event_ctxp[ctxn]; + if (!ctx) + continue; + + perf_event_enable_on_exec(ctx); + } + rcu_read_unlock(); +} + /* * Cross CPU call to read the hardware event */ @@ -3176,7 +3213,8 @@ static void free_event_rcu(struct rcu_head *head) } static void ring_buffer_put(struct ring_buffer *rb); -static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb); +static void ring_buffer_attach(struct perf_event *event, + struct ring_buffer *rb); static void unaccount_event_cpu(struct perf_event *event, int cpu) { @@ -3227,17 +3265,19 @@ static void __free_event(struct perf_event *event) if (event->ctx) put_ctx(event->ctx); + if (event->pmu) + module_put(event->pmu->module); + call_rcu(&event->rcu_head, free_event_rcu); } -static void free_event(struct perf_event *event) + +static void _free_event(struct perf_event *event) { irq_work_sync(&event->pending); unaccount_event(event); if (event->rb) { - struct ring_buffer *rb; - /* * Can happen when we close an event with re-directed output. * @@ -3245,57 +3285,38 @@ static void free_event(struct perf_event *event) * over us; possibly making our ring_buffer_put() the last. */ mutex_lock(&event->mmap_mutex); - rb = event->rb; - if (rb) { - rcu_assign_pointer(event->rb, NULL); - ring_buffer_detach(event, rb); - ring_buffer_put(rb); /* could be last */ - } + ring_buffer_attach(event, NULL); mutex_unlock(&event->mmap_mutex); } if (is_cgroup_event(event)) perf_detach_cgroup(event); - __free_event(event); } -int perf_event_release_kernel(struct perf_event *event) +/* + * Used to free events which have a known refcount of 1, such as in error paths + * where the event isn't exposed yet and inherited events. + */ +static void free_event(struct perf_event *event) { - struct perf_event_context *ctx = event->ctx; - - WARN_ON_ONCE(ctx->parent_ctx); - /* - * There are two ways this annotation is useful: - * - * 1) there is a lock recursion from perf_event_exit_task - * see the comment there. - * - * 2) there is a lock-inversion with mmap_sem through - * perf_event_read_group(), which takes faults while - * holding ctx->mutex, however this is called after - * the last filedesc died, so there is no possibility - * to trigger the AB-BA case. - */ - mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING); - raw_spin_lock_irq(&ctx->lock); - perf_group_detach(event); - raw_spin_unlock_irq(&ctx->lock); - perf_remove_from_context(event); - mutex_unlock(&ctx->mutex); - - free_event(event); + if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1, + "unexpected event refcount: %ld; ptr=%p\n", + atomic_long_read(&event->refcount), event)) { + /* leak to avoid use-after-free */ + return; + } - return 0; + _free_event(event); } -EXPORT_SYMBOL_GPL(perf_event_release_kernel); /* * Called when the last reference to the file is gone. */ static void put_event(struct perf_event *event) { + struct perf_event_context *ctx = event->ctx; struct task_struct *owner; if (!atomic_long_dec_and_test(&event->refcount)) @@ -3334,8 +3355,32 @@ static void put_event(struct perf_event *event) put_task_struct(owner); } - perf_event_release_kernel(event); + WARN_ON_ONCE(ctx->parent_ctx); + /* + * There are two ways this annotation is useful: + * + * 1) there is a lock recursion from perf_event_exit_task + * see the comment there. + * + * 2) there is a lock-inversion with mmap_sem through + * perf_event_read_group(), which takes faults while + * holding ctx->mutex, however this is called after + * the last filedesc died, so there is no possibility + * to trigger the AB-BA case. + */ + mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING); + perf_remove_from_context(event, true); + mutex_unlock(&ctx->mutex); + + _free_event(event); +} + +int perf_event_release_kernel(struct perf_event *event) +{ + put_event(event); + return 0; } +EXPORT_SYMBOL_GPL(perf_event_release_kernel); static int perf_release(struct inode *inode, struct file *file) { @@ -3837,28 +3882,47 @@ unlock: static void ring_buffer_attach(struct perf_event *event, struct ring_buffer *rb) { + struct ring_buffer *old_rb = NULL; unsigned long flags; - if (!list_empty(&event->rb_entry)) - return; + if (event->rb) { + /* + * Should be impossible, we set this when removing + * event->rb_entry and wait/clear when adding event->rb_entry. + */ + WARN_ON_ONCE(event->rcu_pending); - spin_lock_irqsave(&rb->event_lock, flags); - if (list_empty(&event->rb_entry)) - list_add(&event->rb_entry, &rb->event_list); - spin_unlock_irqrestore(&rb->event_lock, flags); -} + old_rb = event->rb; + event->rcu_batches = get_state_synchronize_rcu(); + event->rcu_pending = 1; -static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb) -{ - unsigned long flags; + spin_lock_irqsave(&old_rb->event_lock, flags); + list_del_rcu(&event->rb_entry); + spin_unlock_irqrestore(&old_rb->event_lock, flags); + } - if (list_empty(&event->rb_entry)) - return; + if (event->rcu_pending && rb) { + cond_synchronize_rcu(event->rcu_batches); + event->rcu_pending = 0; + } - spin_lock_irqsave(&rb->event_lock, flags); - list_del_init(&event->rb_entry); - wake_up_all(&event->waitq); - spin_unlock_irqrestore(&rb->event_lock, flags); + if (rb) { + spin_lock_irqsave(&rb->event_lock, flags); + list_add_rcu(&event->rb_entry, &rb->event_list); + spin_unlock_irqrestore(&rb->event_lock, flags); + } + + rcu_assign_pointer(event->rb, rb); + + if (old_rb) { + ring_buffer_put(old_rb); + /* + * Since we detached before setting the new rb, so that we + * could attach the new rb, we could have missed a wakeup. + * Provide it now. + */ + wake_up_all(&event->waitq); + } } static void ring_buffer_wakeup(struct perf_event *event) @@ -3927,7 +3991,7 @@ static void perf_mmap_close(struct vm_area_struct *vma) { struct perf_event *event = vma->vm_file->private_data; - struct ring_buffer *rb = event->rb; + struct ring_buffer *rb = ring_buffer_get(event); struct user_struct *mmap_user = rb->mmap_user; int mmap_locked = rb->mmap_locked; unsigned long size = perf_data_size(rb); @@ -3935,18 +3999,14 @@ static void perf_mmap_close(struct vm_area_struct *vma) atomic_dec(&rb->mmap_count); if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) - return; + goto out_put; - /* Detach current event from the buffer. */ - rcu_assign_pointer(event->rb, NULL); - ring_buffer_detach(event, rb); + ring_buffer_attach(event, NULL); mutex_unlock(&event->mmap_mutex); /* If there's still other mmap()s of this buffer, we're done. */ - if (atomic_read(&rb->mmap_count)) { - ring_buffer_put(rb); /* can't be last */ - return; - } + if (atomic_read(&rb->mmap_count)) + goto out_put; /* * No other mmap()s, detach from all other events that might redirect @@ -3976,11 +4036,9 @@ again: * still restart the iteration to make sure we're not now * iterating the wrong list. */ - if (event->rb == rb) { - rcu_assign_pointer(event->rb, NULL); - ring_buffer_detach(event, rb); - ring_buffer_put(rb); /* can't be last, we still have one */ - } + if (event->rb == rb) + ring_buffer_attach(event, NULL); + mutex_unlock(&event->mmap_mutex); put_event(event); @@ -4005,6 +4063,7 @@ again: vma->vm_mm->pinned_vm -= mmap_locked; free_uid(mmap_user); +out_put: ring_buffer_put(rb); /* could be last */ } @@ -4122,7 +4181,6 @@ again: vma->vm_mm->pinned_vm += extra; ring_buffer_attach(event, rb); - rcu_assign_pointer(event->rb, rb); perf_event_init_userpage(event); perf_event_update_userpage(event); @@ -5034,21 +5092,9 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) NULL); } -void perf_event_comm(struct task_struct *task) +void perf_event_comm(struct task_struct *task, bool exec) { struct perf_comm_event comm_event; - struct perf_event_context *ctx; - int ctxn; - - rcu_read_lock(); - for_each_task_context_nr(ctxn) { - ctx = task->perf_event_ctxp[ctxn]; - if (!ctx) - continue; - - perf_event_enable_on_exec(ctx); - } - rcu_read_unlock(); if (!atomic_read(&nr_comm_events)) return; @@ -5060,7 +5106,7 @@ void perf_event_comm(struct task_struct *task) .event_id = { .header = { .type = PERF_RECORD_COMM, - .misc = 0, + .misc = exec ? PERF_RECORD_MISC_COMM_EXEC : 0, /* .size */ }, /* .pid */ @@ -5083,6 +5129,7 @@ struct perf_mmap_event { int maj, min; u64 ino; u64 ino_generation; + u32 prot, flags; struct { struct perf_event_header header; @@ -5124,6 +5171,8 @@ static void perf_event_mmap_output(struct perf_event *event, mmap_event->event_id.header.size += sizeof(mmap_event->min); mmap_event->event_id.header.size += sizeof(mmap_event->ino); mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation); + mmap_event->event_id.header.size += sizeof(mmap_event->prot); + mmap_event->event_id.header.size += sizeof(mmap_event->flags); } perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); @@ -5142,6 +5191,8 @@ static void perf_event_mmap_output(struct perf_event *event, perf_output_put(&handle, mmap_event->min); perf_output_put(&handle, mmap_event->ino); perf_output_put(&handle, mmap_event->ino_generation); + perf_output_put(&handle, mmap_event->prot); + perf_output_put(&handle, mmap_event->flags); } __output_copy(&handle, mmap_event->file_name, @@ -5160,6 +5211,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) struct file *file = vma->vm_file; int maj = 0, min = 0; u64 ino = 0, gen = 0; + u32 prot = 0, flags = 0; unsigned int size; char tmp[16]; char *buf = NULL; @@ -5190,6 +5242,28 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) gen = inode->i_generation; maj = MAJOR(dev); min = MINOR(dev); + + if (vma->vm_flags & VM_READ) + prot |= PROT_READ; + if (vma->vm_flags & VM_WRITE) + prot |= PROT_WRITE; + if (vma->vm_flags & VM_EXEC) + prot |= PROT_EXEC; + + if (vma->vm_flags & VM_MAYSHARE) + flags = MAP_SHARED; + else + flags = MAP_PRIVATE; + + if (vma->vm_flags & VM_DENYWRITE) + flags |= MAP_DENYWRITE; + if (vma->vm_flags & VM_MAYEXEC) + flags |= MAP_EXECUTABLE; + if (vma->vm_flags & VM_LOCKED) + flags |= MAP_LOCKED; + if (vma->vm_flags & VM_HUGETLB) + flags |= MAP_HUGETLB; + goto got_name; } else { name = (char *)arch_vma_name(vma); @@ -5230,6 +5304,8 @@ got_name: mmap_event->min = min; mmap_event->ino = ino; mmap_event->ino_generation = gen; + mmap_event->prot = prot; + mmap_event->flags = flags; if (!(vma->vm_flags & VM_EXEC)) mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA; @@ -5270,6 +5346,8 @@ void perf_event_mmap(struct vm_area_struct *vma) /* .min (attr_mmap2 only) */ /* .ino (attr_mmap2 only) */ /* .ino_generation (attr_mmap2 only) */ + /* .prot (attr_mmap2 only) */ + /* .flags (attr_mmap2 only) */ }; perf_event_mmap_event(&mmap_event); @@ -5406,6 +5484,9 @@ struct swevent_htable { /* Recursion avoidance in each contexts */ int recursion[PERF_NR_CONTEXTS]; + + /* Keeps track of cpu being initialized/exited */ + bool online; }; static DEFINE_PER_CPU(struct swevent_htable, swevent_htable); @@ -5652,8 +5733,14 @@ static int perf_swevent_add(struct perf_event *event, int flags) hwc->state = !(flags & PERF_EF_START); head = find_swevent_head(swhash, event); - if (WARN_ON_ONCE(!head)) + if (!head) { + /* + * We can race with cpu hotplug code. Do not + * WARN if the cpu just got unplugged. + */ + WARN_ON_ONCE(swhash->online); return -EINVAL; + } hlist_add_head_rcu(&event->hlist_entry, head); @@ -6294,7 +6381,7 @@ static int perf_event_idx_default(struct perf_event *event) * Ensures all contexts with the same task_ctx_nr have the same * pmu_cpu_context too. */ -static void *find_pmu_context(int ctxn) +static struct perf_cpu_context __percpu *find_pmu_context(int ctxn) { struct pmu *pmu; @@ -6549,6 +6636,7 @@ free_pdc: free_percpu(pmu->pmu_disable_count); goto unlock; } +EXPORT_SYMBOL_GPL(perf_pmu_register); void perf_pmu_unregister(struct pmu *pmu) { @@ -6570,6 +6658,7 @@ void perf_pmu_unregister(struct pmu *pmu) put_device(pmu->dev); free_pmu_context(pmu); } +EXPORT_SYMBOL_GPL(perf_pmu_unregister); struct pmu *perf_init_event(struct perf_event *event) { @@ -6583,6 +6672,10 @@ struct pmu *perf_init_event(struct perf_event *event) pmu = idr_find(&pmu_idr, event->attr.type); rcu_read_unlock(); if (pmu) { + if (!try_module_get(pmu->module)) { + pmu = ERR_PTR(-ENODEV); + goto unlock; + } event->pmu = pmu; ret = pmu->event_init(event); if (ret) @@ -6591,6 +6684,10 @@ struct pmu *perf_init_event(struct perf_event *event) } list_for_each_entry_rcu(pmu, &pmus, entry) { + if (!try_module_get(pmu->module)) { + pmu = ERR_PTR(-ENODEV); + goto unlock; + } event->pmu = pmu; ret = pmu->event_init(event); if (!ret) @@ -6769,6 +6866,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, err_pmu: if (event->destroy) event->destroy(event); + module_put(pmu->module); err_ns: if (event->ns) put_pid_ns(event->ns); @@ -6832,10 +6930,6 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, if (ret) return -EFAULT; - /* disabled for now */ - if (attr->mmap2) - return -EINVAL; - if (attr->__reserved_1) return -EINVAL; @@ -6912,7 +7006,7 @@ err_size: static int perf_event_set_output(struct perf_event *event, struct perf_event *output_event) { - struct ring_buffer *rb = NULL, *old_rb = NULL; + struct ring_buffer *rb = NULL; int ret = -EINVAL; if (!output_event) @@ -6940,8 +7034,6 @@ set: if (atomic_read(&event->mmap_count)) goto unlock; - old_rb = event->rb; - if (output_event) { /* get the rb we want to redirect to */ rb = ring_buffer_get(output_event); @@ -6949,23 +7041,7 @@ set: goto unlock; } - if (old_rb) - ring_buffer_detach(event, old_rb); - - if (rb) - ring_buffer_attach(event, rb); - - rcu_assign_pointer(event->rb, rb); - - if (old_rb) { - ring_buffer_put(old_rb); - /* - * Since we detached before setting the new rb, so that we - * could attach the new rb, we could have missed a wakeup. - * Provide it now. - */ - wake_up_all(&event->waitq); - } + ring_buffer_attach(event, rb); ret = 0; unlock: @@ -7016,6 +7092,9 @@ SYSCALL_DEFINE5(perf_event_open, if (attr.freq) { if (attr.sample_freq > sysctl_perf_event_sample_rate) return -EINVAL; + } else { + if (attr.sample_period & (1ULL << 63)) + return -EINVAL; } /* @@ -7053,20 +7132,33 @@ SYSCALL_DEFINE5(perf_event_open, } } + if (task && group_leader && + group_leader->attr.inherit != attr.inherit) { + err = -EINVAL; + goto err_task; + } + get_online_cpus(); event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, NULL, NULL); if (IS_ERR(event)) { err = PTR_ERR(event); - goto err_task; + goto err_cpus; } if (flags & PERF_FLAG_PID_CGROUP) { err = perf_cgroup_connect(pid, event, &attr, group_leader); if (err) { __free_event(event); - goto err_task; + goto err_cpus; + } + } + + if (is_sampling_event(event)) { + if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) { + err = -ENOTSUPP; + goto err_alloc; } } @@ -7163,7 +7255,7 @@ SYSCALL_DEFINE5(perf_event_open, struct perf_event_context *gctx = group_leader->ctx; mutex_lock(&gctx->mutex); - perf_remove_from_context(group_leader); + perf_remove_from_context(group_leader, false); /* * Removing from the context ends up with disabled @@ -7173,7 +7265,7 @@ SYSCALL_DEFINE5(perf_event_open, perf_event__state_init(group_leader); list_for_each_entry(sibling, &group_leader->sibling_list, group_entry) { - perf_remove_from_context(sibling); + perf_remove_from_context(sibling, false); perf_event__state_init(sibling); put_ctx(gctx); } @@ -7228,8 +7320,9 @@ err_context: put_ctx(ctx); err_alloc: free_event(event); -err_task: +err_cpus: put_online_cpus(); +err_task: if (task) put_task_struct(task); err_group_fd: @@ -7303,7 +7396,7 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) mutex_lock(&src_ctx->mutex); list_for_each_entry_safe(event, tmp, &src_ctx->event_list, event_entry) { - perf_remove_from_context(event); + perf_remove_from_context(event, false); unaccount_event_cpu(event, src_cpu); put_ctx(src_ctx); list_add(&event->migrate_entry, &events); @@ -7365,13 +7458,19 @@ __perf_event_exit_task(struct perf_event *child_event, struct perf_event_context *child_ctx, struct task_struct *child) { - if (child_event->parent) { - raw_spin_lock_irq(&child_ctx->lock); - perf_group_detach(child_event); - raw_spin_unlock_irq(&child_ctx->lock); - } - - perf_remove_from_context(child_event); + /* + * Do not destroy the 'original' grouping; because of the context + * switch optimization the original events could've ended up in a + * random child task. + * + * If we were to destroy the original group, all group related + * operations would cease to function properly after this random + * child dies. + * + * Do destroy all inherited groups, we don't care about those + * and being thorough is better. + */ + perf_remove_from_context(child_event, !!child_event->parent); /* * It can happen that the parent exits first, and has events @@ -7386,8 +7485,8 @@ __perf_event_exit_task(struct perf_event *child_event, static void perf_event_exit_task_context(struct task_struct *child, int ctxn) { - struct perf_event *child_event, *tmp; - struct perf_event_context *child_ctx; + struct perf_event *child_event, *next; + struct perf_event_context *child_ctx, *parent_ctx; unsigned long flags; if (likely(!child->perf_event_ctxp[ctxn])) { @@ -7412,6 +7511,15 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) raw_spin_lock(&child_ctx->lock); task_ctx_sched_out(child_ctx); child->perf_event_ctxp[ctxn] = NULL; + + /* + * In order to avoid freeing: child_ctx->parent_ctx->task + * under perf_event_context::lock, grab another reference. + */ + parent_ctx = child_ctx->parent_ctx; + if (parent_ctx) + get_ctx(parent_ctx); + /* * If this context is a clone; unclone it so it can't get * swapped to another process while we're removing all @@ -7422,6 +7530,13 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) raw_spin_unlock_irqrestore(&child_ctx->lock, flags); /* + * Now that we no longer hold perf_event_context::lock, drop + * our extra child_ctx->parent_ctx reference. + */ + if (parent_ctx) + put_ctx(parent_ctx); + + /* * Report the task dead after unscheduling the events so that we * won't get any samples after PERF_RECORD_EXIT. We can however still * get a few PERF_RECORD_READ events. @@ -7440,24 +7555,9 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) */ mutex_lock(&child_ctx->mutex); -again: - list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups, - group_entry) - __perf_event_exit_task(child_event, child_ctx, child); - - list_for_each_entry_safe(child_event, tmp, &child_ctx->flexible_groups, - group_entry) + list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry) __perf_event_exit_task(child_event, child_ctx, child); - /* - * If the last event was a group event, it will have appended all - * its siblings to the list, but we obtained 'tmp' before that which - * will still point to the list head terminating the iteration. - */ - if (!list_empty(&child_ctx->pinned_groups) || - !list_empty(&child_ctx->flexible_groups)) - goto again; - mutex_unlock(&child_ctx->mutex); put_ctx(child_ctx); @@ -7722,6 +7822,8 @@ int perf_event_init_context(struct task_struct *child, int ctxn) * swapped under us. */ parent_ctx = perf_pin_task_context(parent, ctxn); + if (!parent_ctx) + return 0; /* * No need to check if parent_ctx != NULL here; since we saw @@ -7833,6 +7935,7 @@ static void perf_event_init_cpu(int cpu) struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); mutex_lock(&swhash->hlist_mutex); + swhash->online = true; if (swhash->hlist_refcount > 0) { struct swevent_hlist *hlist; @@ -7855,15 +7958,15 @@ static void perf_pmu_rotate_stop(struct pmu *pmu) static void __perf_event_exit_context(void *__info) { + struct remove_event re = { .detach_group = false }; struct perf_event_context *ctx = __info; - struct perf_event *event, *tmp; perf_pmu_rotate_stop(ctx->pmu); - list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) - __perf_remove_from_context(event); - list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry) - __perf_remove_from_context(event); + rcu_read_lock(); + list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry) + __perf_remove_from_context(&re); + rcu_read_unlock(); } static void perf_event_exit_cpu_context(int cpu) @@ -7887,11 +7990,12 @@ static void perf_event_exit_cpu(int cpu) { struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); + perf_event_exit_cpu_context(cpu); + mutex_lock(&swhash->hlist_mutex); + swhash->online = false; swevent_hlist_release(swhash); mutex_unlock(&swhash->hlist_mutex); - - perf_event_exit_cpu_context(cpu); } #else static inline void perf_event_exit_cpu(int cpu) { } @@ -8036,7 +8140,7 @@ static void perf_cgroup_attach(struct cgroup_subsys_state *css, { struct task_struct *task; - cgroup_taskset_for_each(task, css, tset) + cgroup_taskset_for_each(task, tset) task_function_call(task, __perf_cgroup_move, task); } @@ -8055,9 +8159,7 @@ static void perf_cgroup_exit(struct cgroup_subsys_state *css, task_function_call(task, __perf_cgroup_move, task); } -struct cgroup_subsys perf_subsys = { - .name = "perf_event", - .subsys_id = perf_subsys_id, +struct cgroup_subsys perf_event_cgrp_subsys = { .css_alloc = perf_cgroup_css_alloc, .css_free = perf_cgroup_css_free, .exit = perf_cgroup_exit, diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 307d87c0991..6f3254e8c13 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -36,6 +36,7 @@ #include "../../mm/internal.h" /* munlock_vma_page */ #include <linux/percpu-rwsem.h> #include <linux/task_work.h> +#include <linux/shmem_fs.h> #include <linux/uprobes.h> @@ -60,8 +61,6 @@ static struct percpu_rw_semaphore dup_mmap_sem; /* Have a copy of original instruction */ #define UPROBE_COPY_INSN 0 -/* Can skip singlestep */ -#define UPROBE_SKIP_SSTEP 1 struct uprobe { struct rb_node rb_node; /* node in the rb tree */ @@ -129,7 +128,7 @@ struct xol_area { */ static bool valid_vma(struct vm_area_struct *vma, bool is_register) { - vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_SHARED; + vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE; if (is_register) flags |= VM_WRITE; @@ -281,18 +280,13 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t * supported by that architecture then we need to modify is_trap_at_addr and * uprobe_write_opcode accordingly. This would never be a problem for archs * that have fixed length instructions. - */ - -/* + * * uprobe_write_opcode - write the opcode at a given virtual address. * @mm: the probed process address space. * @vaddr: the virtual address to store the opcode. * @opcode: opcode to be written at @vaddr. * - * Called with mm->mmap_sem held (for read and with a reference to - * mm). - * - * For mm @mm, write the opcode at @vaddr. + * Called with mm->mmap_sem held for write. * Return 0 (success) or a negative errno. */ int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr, @@ -312,21 +306,25 @@ retry: if (ret <= 0) goto put_old; + ret = anon_vma_prepare(vma); + if (ret) + goto put_old; + ret = -ENOMEM; new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr); if (!new_page) goto put_old; - __SetPageUptodate(new_page); + if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL)) + goto put_new; + __SetPageUptodate(new_page); copy_highpage(new_page, old_page); copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); - ret = anon_vma_prepare(vma); - if (ret) - goto put_new; - ret = __replace_page(vma, vaddr, old_page, new_page); + if (ret) + mem_cgroup_uncharge_page(new_page); put_new: page_cache_release(new_page); @@ -491,12 +489,9 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) uprobe->offset = offset; init_rwsem(&uprobe->register_rwsem); init_rwsem(&uprobe->consumer_rwsem); - /* For now assume that the instruction need not be single-stepped */ - __set_bit(UPROBE_SKIP_SSTEP, &uprobe->flags); /* add to uprobes_tree, sorted on inode:offset */ cur_uprobe = insert_uprobe(uprobe); - /* a uprobe exists for this inode:offset combination */ if (cur_uprobe) { kfree(uprobe); @@ -542,14 +537,15 @@ static int __copy_insn(struct address_space *mapping, struct file *filp, void *insn, int nbytes, loff_t offset) { struct page *page; - - if (!mapping->a_ops->readpage) - return -EIO; /* - * Ensure that the page that has the original instruction is - * populated and in page-cache. + * Ensure that the page that has the original instruction is populated + * and in page-cache. If ->readpage == NULL it must be shmem_mapping(), + * see uprobe_register(). */ - page = read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT, filp); + if (mapping->a_ops->readpage) + page = read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT, filp); + else + page = shmem_read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT); if (IS_ERR(page)) return PTR_ERR(page); @@ -850,7 +846,7 @@ static void __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *u { int err; - if (!consumer_del(uprobe, uc)) /* WARN? */ + if (WARN_ON(!consumer_del(uprobe, uc))) return; err = register_for_each_vma(uprobe, NULL); @@ -885,6 +881,9 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer * if (!uc->handler && !uc->ret_handler) return -EINVAL; + /* copy_insn() uses read_mapping_page() or shmem_read_mapping_page() */ + if (!inode->i_mapping->a_ops->readpage && !shmem_mapping(inode->i_mapping)) + return -EIO; /* Racy, just to catch the obvious mistakes */ if (offset > i_size_read(inode)) return -EINVAL; @@ -928,7 +927,7 @@ int uprobe_apply(struct inode *inode, loff_t offset, int ret = -ENOENT; uprobe = find_uprobe(inode, offset); - if (!uprobe) + if (WARN_ON(!uprobe)) return ret; down_write(&uprobe->register_rwsem); @@ -953,7 +952,7 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume struct uprobe *uprobe; uprobe = find_uprobe(inode, offset); - if (!uprobe) + if (WARN_ON(!uprobe)) return; down_write(&uprobe->register_rwsem); @@ -1296,14 +1295,8 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe) if (unlikely(!xol_vaddr)) return 0; - /* Initialize the slot */ - copy_to_page(area->page, xol_vaddr, - &uprobe->arch.ixol, sizeof(uprobe->arch.ixol)); - /* - * We probably need flush_icache_user_range() but it needs vma. - * This should work on supported architectures too. - */ - flush_dcache_page(area->page); + arch_uprobe_copy_ixol(area->page, xol_vaddr, + &uprobe->arch.ixol, sizeof(uprobe->arch.ixol)); return xol_vaddr; } @@ -1346,6 +1339,21 @@ static void xol_free_insn_slot(struct task_struct *tsk) } } +void __weak arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, + void *src, unsigned long len) +{ + /* Initialize the slot */ + copy_to_page(page, vaddr, src, len); + + /* + * We probably need flush_icache_user_range() but it needs vma. + * This should work on most of architectures by default. If + * architecture needs to do something different it can define + * its own version of the function. + */ + flush_dcache_page(page); +} + /** * uprobe_get_swbp_addr - compute address of swbp given post-swbp regs * @regs: Reflects the saved state of the task after it has hit a breakpoint @@ -1357,6 +1365,16 @@ unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs) return instruction_pointer(regs) - UPROBE_SWBP_INSN_SIZE; } +unsigned long uprobe_get_trap_addr(struct pt_regs *regs) +{ + struct uprobe_task *utask = current->utask; + + if (unlikely(utask && utask->active_uprobe)) + return utask->vaddr; + + return instruction_pointer(regs); +} + /* * Called with no locks held. * Called in context of a exiting or a exec-ing thread. @@ -1628,20 +1646,6 @@ bool uprobe_deny_signal(void) return true; } -/* - * Avoid singlestepping the original instruction if the original instruction - * is a NOP or can be emulated. - */ -static bool can_skip_sstep(struct uprobe *uprobe, struct pt_regs *regs) -{ - if (test_bit(UPROBE_SKIP_SSTEP, &uprobe->flags)) { - if (arch_uprobe_skip_sstep(&uprobe->arch, regs)) - return true; - clear_bit(UPROBE_SKIP_SSTEP, &uprobe->flags); - } - return false; -} - static void mmf_recalc_uprobes(struct mm_struct *mm) { struct vm_area_struct *vma; @@ -1804,6 +1808,11 @@ static bool handle_trampoline(struct pt_regs *regs) return true; } +bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs) +{ + return false; +} + /* * Run handler and ask thread to singlestep. * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. @@ -1858,14 +1867,18 @@ static void handle_swbp(struct pt_regs *regs) if (!get_utask()) goto out; + if (arch_uprobe_ignore(&uprobe->arch, regs)) + goto out; + handler_chain(uprobe, regs); - if (can_skip_sstep(uprobe, regs)) + + if (arch_uprobe_skip_sstep(&uprobe->arch, regs)) goto out; if (!pre_ssout(uprobe, regs, bp_vaddr)) return; - /* can_skip_sstep() succeeded, or restart if can't singlestep */ + /* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */ out: put_uprobe(uprobe); } @@ -1877,10 +1890,11 @@ out: static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs) { struct uprobe *uprobe; + int err = 0; uprobe = utask->active_uprobe; if (utask->state == UTASK_SSTEP_ACK) - arch_uprobe_post_xol(&uprobe->arch, regs); + err = arch_uprobe_post_xol(&uprobe->arch, regs); else if (utask->state == UTASK_SSTEP_TRAPPED) arch_uprobe_abort_xol(&uprobe->arch, regs); else @@ -1894,6 +1908,11 @@ static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs) spin_lock_irq(¤t->sighand->siglock); recalc_sigpending(); /* see uprobe_deny_signal() */ spin_unlock_irq(¤t->sighand->siglock); + + if (unlikely(err)) { + uprobe_warn(current, "execute the probed insn, sending SIGILL."); + force_sig_info(SIGILL, SEND_SIG_FORCED, current); + } } /* diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c index 0dbeae37422..83d4382f569 100644 --- a/kernel/exec_domain.c +++ b/kernel/exec_domain.c @@ -37,7 +37,7 @@ static unsigned long ident_map[32] = { struct exec_domain default_exec_domain = { .name = "Linux", /* name */ .handler = default_handler, /* lcall7 causes a seg fault. */ - .pers_low = 0, /* PER_LINUX personality. */ + .pers_low = 0, /* PER_LINUX personality. */ .pers_high = 0, /* PER_LINUX personality. */ .signal_map = ident_map, /* Identity map signals. */ .signal_invmap = ident_map, /* - both ways. */ @@ -83,7 +83,7 @@ lookup_exec_domain(unsigned int personality) ep = &default_exec_domain; out: read_unlock(&exec_domains_lock); - return (ep); + return ep; } int @@ -110,8 +110,9 @@ register_exec_domain(struct exec_domain *ep) out: write_unlock(&exec_domains_lock); - return (err); + return err; } +EXPORT_SYMBOL(register_exec_domain); int unregister_exec_domain(struct exec_domain *ep) @@ -133,6 +134,7 @@ unregister: write_unlock(&exec_domains_lock); return 0; } +EXPORT_SYMBOL(unregister_exec_domain); int __set_personality(unsigned int personality) { @@ -144,6 +146,7 @@ int __set_personality(unsigned int personality) return 0; } +EXPORT_SYMBOL(__set_personality); #ifdef CONFIG_PROC_FS static int execdomains_proc_show(struct seq_file *m, void *v) @@ -188,8 +191,3 @@ SYSCALL_DEFINE1(personality, unsigned int, personality) return old; } - - -EXPORT_SYMBOL(register_exec_domain); -EXPORT_SYMBOL(unregister_exec_domain); -EXPORT_SYMBOL(__set_personality); diff --git a/kernel/exit.c b/kernel/exit.c index 1e77fc64531..e5c4668f179 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -313,46 +313,7 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent) } } -/* - * Let kernel threads use this to say that they allow a certain signal. - * Must not be used if kthread was cloned with CLONE_SIGHAND. - */ -int allow_signal(int sig) -{ - if (!valid_signal(sig) || sig < 1) - return -EINVAL; - - spin_lock_irq(¤t->sighand->siglock); - /* This is only needed for daemonize()'ed kthreads */ - sigdelset(¤t->blocked, sig); - /* - * Kernel threads handle their own signals. Let the signal code - * know it'll be handled, so that they don't get converted to - * SIGKILL or just silently dropped. - */ - current->sighand->action[(sig)-1].sa.sa_handler = (void __user *)2; - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); - return 0; -} - -EXPORT_SYMBOL(allow_signal); - -int disallow_signal(int sig) -{ - if (!valid_signal(sig) || sig < 1) - return -EINVAL; - - spin_lock_irq(¤t->sighand->siglock); - current->sighand->action[(sig)-1].sa.sa_handler = SIG_IGN; - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); - return 0; -} - -EXPORT_SYMBOL(disallow_signal); - -#ifdef CONFIG_MM_OWNER +#ifdef CONFIG_MEMCG /* * A task is exiting. If it owned this mm, find a new owner for the mm. */ @@ -395,14 +356,18 @@ retry: } /* - * Search through everything else. We should not get - * here often + * Search through everything else, we should not get here often. */ - do_each_thread(g, c) { - if (c->mm == mm) - goto assign_new_owner; - } while_each_thread(g, c); - + for_each_process(g) { + if (g->flags & PF_KTHREAD) + continue; + for_each_thread(g, c) { + if (c->mm == mm) + goto assign_new_owner; + if (c->mm) + break; + } + } read_unlock(&tasklist_lock); /* * We found no owner yet mm_users > 1: this implies that we are @@ -434,7 +399,7 @@ assign_new_owner: task_unlock(c); put_task_struct(c); } -#endif /* CONFIG_MM_OWNER */ +#endif /* CONFIG_MEMCG */ /* * Turn us into a lazy TLB process if we @@ -570,7 +535,7 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p, if (same_thread_group(p->real_parent, father)) return; - /* We don't want people slaying init. */ + /* We don't want people slaying init. */ p->exit_signal = SIGCHLD; /* If it has exited notify the new parent about this child's death. */ @@ -784,9 +749,10 @@ void do_exit(long code) exit_shm(tsk); exit_files(tsk); exit_fs(tsk); + if (group_dead) + disassociate_ctty(1); exit_task_namespaces(tsk); exit_task_work(tsk); - check_stack_usage(); exit_thread(); /* @@ -797,21 +763,17 @@ void do_exit(long code) */ perf_event_exit_task(tsk); - cgroup_exit(tsk, 1); - - if (group_dead) - disassociate_ctty(1); + cgroup_exit(tsk); module_put(task_thread_info(tsk)->exec_domain->module); - proc_exit_connector(tsk); - /* * FIXME: do that only when needed, using sched_exit tracepoint */ flush_ptrace_hw_breakpoint(tsk); exit_notify(tsk, group_dead); + proc_exit_connector(tsk); #ifdef CONFIG_NUMA task_lock(tsk); mpol_put(tsk->mempolicy); @@ -844,6 +806,7 @@ void do_exit(long code) validate_creds_for_do_exit(tsk); + check_stack_usage(); preempt_disable(); if (tsk->nr_dirtied) __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied); @@ -1038,17 +1001,13 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) return wait_noreap_copyout(wo, p, pid, uid, why, status); } + traced = ptrace_reparented(p); /* - * Try to move the task's state to DEAD - * only one thread is allowed to do this: + * Move the task's state to DEAD/TRACE, only one thread can do this. */ - state = xchg(&p->exit_state, EXIT_DEAD); - if (state != EXIT_ZOMBIE) { - BUG_ON(state != EXIT_DEAD); + state = traced && thread_group_leader(p) ? EXIT_TRACE : EXIT_DEAD; + if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE) return 0; - } - - traced = ptrace_reparented(p); /* * It can be ptraced but not reparented, check * thread_group_leader() to filter out sub-threads. @@ -1109,7 +1068,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) /* * Now we are sure this task is interesting, and no other - * thread can reap it because we set its state to EXIT_DEAD. + * thread can reap it because we its state == DEAD/TRACE. */ read_unlock(&tasklist_lock); @@ -1146,22 +1105,19 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) if (!retval) retval = pid; - if (traced) { + if (state == EXIT_TRACE) { write_lock_irq(&tasklist_lock); /* We dropped tasklist, ptracer could die and untrace */ ptrace_unlink(p); - /* - * If this is not a sub-thread, notify the parent. - * If parent wants a zombie, don't release it now. - */ - if (thread_group_leader(p) && - !do_notify_parent(p, p->exit_signal)) { - p->exit_state = EXIT_ZOMBIE; - p = NULL; - } + + /* If parent wants a zombie, don't release it now */ + state = EXIT_ZOMBIE; + if (do_notify_parent(p, p->exit_signal)) + state = EXIT_DEAD; + p->exit_state = state; write_unlock_irq(&tasklist_lock); } - if (p != NULL) + if (state == EXIT_DEAD) release_task(p); return retval; @@ -1338,7 +1294,12 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) static int wait_consider_task(struct wait_opts *wo, int ptrace, struct task_struct *p) { - int ret = eligible_child(wo, p); + int ret; + + if (unlikely(p->exit_state == EXIT_DEAD)) + return 0; + + ret = eligible_child(wo, p); if (!ret) return ret; @@ -1356,33 +1317,44 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, return 0; } - /* dead body doesn't have much to contribute */ - if (unlikely(p->exit_state == EXIT_DEAD)) { + if (unlikely(p->exit_state == EXIT_TRACE)) { /* - * But do not ignore this task until the tracer does - * wait_task_zombie()->do_notify_parent(). + * ptrace == 0 means we are the natural parent. In this case + * we should clear notask_error, debugger will notify us. */ - if (likely(!ptrace) && unlikely(ptrace_reparented(p))) + if (likely(!ptrace)) wo->notask_error = 0; return 0; } - /* slay zombie? */ - if (p->exit_state == EXIT_ZOMBIE) { + if (likely(!ptrace) && unlikely(p->ptrace)) { /* - * A zombie ptracee is only visible to its ptracer. - * Notification and reaping will be cascaded to the real - * parent when the ptracer detaches. + * If it is traced by its real parent's group, just pretend + * the caller is ptrace_do_wait() and reap this child if it + * is zombie. + * + * This also hides group stop state from real parent; otherwise + * a single stop can be reported twice as group and ptrace stop. + * If a ptracer wants to distinguish these two events for its + * own children it should create a separate process which takes + * the role of real parent. */ - if (likely(!ptrace) && unlikely(p->ptrace)) { - /* it will become visible, clear notask_error */ - wo->notask_error = 0; - return 0; - } + if (!ptrace_reparented(p)) + ptrace = 1; + } + /* slay zombie? */ + if (p->exit_state == EXIT_ZOMBIE) { /* we don't reap group leaders with subthreads */ - if (!delay_group_leader(p)) - return wait_task_zombie(wo, p); + if (!delay_group_leader(p)) { + /* + * A zombie ptracee is only visible to its ptracer. + * Notification and reaping will be cascaded to the + * real parent when the ptracer detaches. + */ + if (unlikely(ptrace) || likely(!p->ptrace)) + return wait_task_zombie(wo, p); + } /* * Allow access to stopped/continued state via zombie by @@ -1408,19 +1380,6 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, wo->notask_error = 0; } else { /* - * If @p is ptraced by a task in its real parent's group, - * hide group stop/continued state when looking at @p as - * the real parent; otherwise, a single stop can be - * reported twice as group and ptrace stops. - * - * If a ptracer wants to distinguish the two events for its - * own children, it should create a separate process which - * takes the role of real parent. - */ - if (likely(!ptrace) && p->ptrace && !ptrace_reparented(p)) - return 0; - - /* * @p is alive and it's gonna stop, continue or exit, so * there always is something to wait for. */ diff --git a/kernel/extable.c b/kernel/extable.c index 763faf037ec..d8a6446adbc 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -36,7 +36,7 @@ extern struct exception_table_entry __start___ex_table[]; extern struct exception_table_entry __stop___ex_table[]; /* Cleared by build time tools if the table is already sorted. */ -u32 __initdata main_extable_sort_needed = 1; +u32 __initdata __visible main_extable_sort_needed = 1; /* Sort the kernel's built-in exception table */ void __init sort_main_extable(void) diff --git a/kernel/fork.c b/kernel/fork.c index a17621c6cd4..6a13c46cd87 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -28,6 +28,8 @@ #include <linux/mman.h> #include <linux/mmu_notifier.h> #include <linux/fs.h> +#include <linux/mm.h> +#include <linux/vmacache.h> #include <linux/nsproxy.h> #include <linux/capability.h> #include <linux/cpu.h> @@ -71,6 +73,7 @@ #include <linux/signalfd.h> #include <linux/uprobes.h> #include <linux/aio.h> +#include <linux/compiler.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> @@ -147,15 +150,15 @@ void __weak arch_release_thread_info(struct thread_info *ti) static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, int node) { - struct page *page = alloc_pages_node(node, THREADINFO_GFP_ACCOUNTED, - THREAD_SIZE_ORDER); + struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP, + THREAD_SIZE_ORDER); return page ? page_address(page) : NULL; } static inline void free_thread_info(struct thread_info *ti) { - free_memcg_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER); + free_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER); } # else static struct kmem_cache *thread_info_cache; @@ -237,6 +240,7 @@ void __put_task_struct(struct task_struct *tsk) WARN_ON(atomic_read(&tsk->usage)); WARN_ON(tsk == current); + task_numa_free(tsk); security_task_free(tsk); exit_creds(tsk); delayacct_tsk_free(tsk); @@ -283,7 +287,7 @@ void __init fork_init(unsigned long mempages) init_task.signal->rlim[RLIMIT_NPROC]; } -int __attribute__((weak)) arch_dup_task_struct(struct task_struct *dst, +int __weak arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { *dst = *src; @@ -363,7 +367,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) mm->locked_vm = 0; mm->mmap = NULL; - mm->mmap_cache = NULL; + mm->vmacache_seqnum = 0; mm->map_count = 0; cpumask_clear(mm_cpumask(mm)); mm->mm_rb = RB_ROOT; @@ -529,8 +533,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) atomic_set(&mm->mm_count, 1); init_rwsem(&mm->mmap_sem); INIT_LIST_HEAD(&mm->mmlist); - mm->flags = (current->mm) ? - (current->mm->flags & MMF_INIT_MASK) : default_dump_filter; mm->core_state = NULL; atomic_long_set(&mm->nr_ptes, 0); memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); @@ -539,8 +541,15 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) mm_init_owner(mm, p); clear_tlb_flush_pending(mm); - if (likely(!mm_alloc_pgd(mm))) { + if (current->mm) { + mm->flags = current->mm->flags & MMF_INIT_MASK; + mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK; + } else { + mm->flags = default_dump_filter; mm->def_flags = 0; + } + + if (likely(!mm_alloc_pgd(mm))) { mmu_notifier_mm_init(mm); return mm; } @@ -876,6 +885,9 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) if (!oldmm) return 0; + /* initialize the new vmacache entries */ + vmacache_flush(tsk); + if (clone_flags & CLONE_VM) { atomic_inc(&oldmm->mm_users); mm = oldmm; @@ -1069,15 +1081,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) return 0; } -static void copy_flags(unsigned long clone_flags, struct task_struct *p) -{ - unsigned long new_flags = p->flags; - - new_flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER); - new_flags |= PF_FORKNOEXEC; - p->flags = new_flags; -} - SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr) { current->clear_child_tid = tidptr; @@ -1096,12 +1099,12 @@ static void rt_mutex_init_task(struct task_struct *p) #endif } -#ifdef CONFIG_MM_OWNER +#ifdef CONFIG_MEMCG void mm_init_owner(struct mm_struct *mm, struct task_struct *p) { mm->owner = p; } -#endif /* CONFIG_MM_OWNER */ +#endif /* CONFIG_MEMCG */ /* * Initialize POSIX timer handling for a single task. @@ -1227,7 +1230,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, goto bad_fork_cleanup_count; delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ - copy_flags(clone_flags, p); + p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER); + p->flags |= PF_FORKNOEXEC; INIT_LIST_HEAD(&p->children); INIT_LIST_HEAD(&p->sibling); rcu_copy_process(p); @@ -1271,9 +1275,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (IS_ERR(p->mempolicy)) { retval = PTR_ERR(p->mempolicy); p->mempolicy = NULL; - goto bad_fork_cleanup_cgroup; + goto bad_fork_cleanup_threadgroup_lock; } - mpol_fix_fork_child_flag(p); #endif #ifdef CONFIG_CPUSETS p->cpuset_mem_spread_rotor = NUMA_NO_NODE; @@ -1484,7 +1487,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, total_forks++; spin_unlock(¤t->sighand->siglock); + syscall_tracepoint_update(p); write_unlock_irq(&tasklist_lock); + proc_fork_connector(p); cgroup_post_fork(p); if (clone_flags & CLONE_THREAD) @@ -1524,11 +1529,10 @@ bad_fork_cleanup_policy: perf_event_free_task(p); #ifdef CONFIG_NUMA mpol_put(p->mempolicy); -bad_fork_cleanup_cgroup: +bad_fork_cleanup_threadgroup_lock: #endif if (clone_flags & CLONE_THREAD) threadgroup_change_end(current); - cgroup_exit(p, 0); delayacct_tsk_free(p); module_put(task_thread_info(p)->exec_domain->module); bad_fork_cleanup_count: @@ -1604,10 +1608,12 @@ long do_fork(unsigned long clone_flags, */ if (!IS_ERR(p)) { struct completion vfork; + struct pid *pid; trace_sched_process_fork(current, p); - nr = task_pid_vnr(p); + pid = get_task_pid(p, PIDTYPE_PID); + nr = pid_vnr(pid); if (clone_flags & CLONE_PARENT_SETTID) put_user(nr, parent_tidptr); @@ -1622,12 +1628,14 @@ long do_fork(unsigned long clone_flags, /* forking complete and child started to run, tell ptracer */ if (unlikely(trace)) - ptrace_event(trace, nr); + ptrace_event_pid(trace, pid); if (clone_flags & CLONE_VFORK) { if (!wait_for_vfork_done(p, &vfork)) - ptrace_event(PTRACE_EVENT_VFORK_DONE, nr); + ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid); } + + put_pid(pid); } else { nr = PTR_ERR(p); } diff --git a/kernel/futex.c b/kernel/futex.c index 44a1261cb9f..b632b5f3f09 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -70,7 +70,10 @@ #include "locking/rtmutex_common.h" /* - * Basic futex operation and ordering guarantees: + * READ this before attempting to hack on futexes! + * + * Basic futex operation and ordering guarantees + * ============================================= * * The waiter reads the futex value in user space and calls * futex_wait(). This function computes the hash bucket and acquires @@ -119,7 +122,7 @@ * sys_futex(WAIT, futex, val); * futex_wait(futex, val); * - * waiters++; + * waiters++; (a) * mb(); (A) <-- paired with -. * | * lock(hash_bucket(futex)); | @@ -135,14 +138,14 @@ * unlock(hash_bucket(futex)); * schedule(); if (waiters) * lock(hash_bucket(futex)); - * wake_waiters(futex); - * unlock(hash_bucket(futex)); + * else wake_waiters(futex); + * waiters--; (b) unlock(hash_bucket(futex)); * - * Where (A) orders the waiters increment and the futex value read -- this - * is guaranteed by the head counter in the hb spinlock; and where (B) - * orders the write to futex and the waiters read -- this is done by the - * barriers in get_futex_key_refs(), through either ihold or atomic_inc, - * depending on the futex type. + * Where (A) orders the waiters increment and the futex value read through + * atomic operations (see hb_waiters_inc) and where (B) orders the write + * to futex and the waiters read -- this is done by the barriers in + * get_futex_key_refs(), through either ihold or atomic_inc, depending on the + * futex type. * * This yields the following case (where X:=waiters, Y:=futex): * @@ -155,9 +158,22 @@ * Which guarantees that x==0 && y==0 is impossible; which translates back into * the guarantee that we cannot both miss the futex variable change and the * enqueue. + * + * Note that a new waiter is accounted for in (a) even when it is possible that + * the wait call can return error, in which case we backtrack from it in (b). + * Refer to the comment in queue_lock(). + * + * Similarly, in order to account for waiters being requeued on another + * address we always increment the waiters for the destination bucket before + * acquiring the lock. It then decrements them again after releasing it - + * the code that actually moves the futex(es) between hash buckets (requeue_futex) + * will do the additional required waiter count housekeeping. This is done for + * double_lock_hb() and double_unlock_hb(), respectively. */ +#ifndef CONFIG_HAVE_FUTEX_CMPXCHG int __read_mostly futex_cmpxchg_enabled; +#endif /* * Futex flags used to encode options to functions and preserve them across @@ -234,6 +250,7 @@ static const struct futex_q futex_q_init = { * waiting on a futex. */ struct futex_hash_bucket { + atomic_t waiters; spinlock_t lock; struct plist_head chain; } ____cacheline_aligned_in_smp; @@ -250,25 +267,40 @@ static inline void futex_get_mm(union futex_key *key) * get_futex_key() implies a full barrier. This is relied upon * as full barrier (B), see the ordering comment above. */ - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); } -static inline bool hb_waiters_pending(struct futex_hash_bucket *hb) +/* + * Reflects a new waiter being added to the waitqueue. + */ +static inline void hb_waiters_inc(struct futex_hash_bucket *hb) { #ifdef CONFIG_SMP + atomic_inc(&hb->waiters); /* - * Tasks trying to enter the critical region are most likely - * potential waiters that will be added to the plist. Ensure - * that wakers won't miss to-be-slept tasks in the window between - * the wait call and the actual plist_add. + * Full barrier (A), see the ordering comment above. */ - if (spin_is_locked(&hb->lock)) - return true; - smp_rmb(); /* Make sure we check the lock state first */ + smp_mb__after_atomic(); +#endif +} - return !plist_head_empty(&hb->chain); +/* + * Reflects a waiter being removed from the waitqueue by wakeup + * paths. + */ +static inline void hb_waiters_dec(struct futex_hash_bucket *hb) +{ +#ifdef CONFIG_SMP + atomic_dec(&hb->waiters); +#endif +} + +static inline int hb_waiters_pending(struct futex_hash_bucket *hb) +{ +#ifdef CONFIG_SMP + return atomic_read(&hb->waiters); #else - return true; + return 1; #endif } @@ -711,6 +743,55 @@ void exit_pi_state_list(struct task_struct *curr) raw_spin_unlock_irq(&curr->pi_lock); } +/* + * We need to check the following states: + * + * Waiter | pi_state | pi->owner | uTID | uODIED | ? + * + * [1] NULL | --- | --- | 0 | 0/1 | Valid + * [2] NULL | --- | --- | >0 | 0/1 | Valid + * + * [3] Found | NULL | -- | Any | 0/1 | Invalid + * + * [4] Found | Found | NULL | 0 | 1 | Valid + * [5] Found | Found | NULL | >0 | 1 | Invalid + * + * [6] Found | Found | task | 0 | 1 | Valid + * + * [7] Found | Found | NULL | Any | 0 | Invalid + * + * [8] Found | Found | task | ==taskTID | 0/1 | Valid + * [9] Found | Found | task | 0 | 0 | Invalid + * [10] Found | Found | task | !=taskTID | 0/1 | Invalid + * + * [1] Indicates that the kernel can acquire the futex atomically. We + * came came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit. + * + * [2] Valid, if TID does not belong to a kernel thread. If no matching + * thread is found then it indicates that the owner TID has died. + * + * [3] Invalid. The waiter is queued on a non PI futex + * + * [4] Valid state after exit_robust_list(), which sets the user space + * value to FUTEX_WAITERS | FUTEX_OWNER_DIED. + * + * [5] The user space value got manipulated between exit_robust_list() + * and exit_pi_state_list() + * + * [6] Valid state after exit_pi_state_list() which sets the new owner in + * the pi_state but cannot access the user space value. + * + * [7] pi_state->owner can only be NULL when the OWNER_DIED bit is set. + * + * [8] Owner and user space value match + * + * [9] There is no transient state which sets the user space TID to 0 + * except exit_robust_list(), but this is indicated by the + * FUTEX_OWNER_DIED bit. See [4] + * + * [10] There is no transient state which leaves owner and user space + * TID out of sync. + */ static int lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, union futex_key *key, struct futex_pi_state **ps) @@ -723,12 +804,13 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, plist_for_each_entry_safe(this, next, &hb->chain, list) { if (match_futex(&this->key, key)) { /* - * Another waiter already exists - bump up - * the refcount and return its pi_state: + * Sanity check the waiter before increasing + * the refcount and attaching to it. */ pi_state = this->pi_state; /* - * Userspace might have messed up non-PI and PI futexes + * Userspace might have messed up non-PI and + * PI futexes [3] */ if (unlikely(!pi_state)) return -EINVAL; @@ -736,34 +818,70 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, WARN_ON(!atomic_read(&pi_state->refcount)); /* - * When pi_state->owner is NULL then the owner died - * and another waiter is on the fly. pi_state->owner - * is fixed up by the task which acquires - * pi_state->rt_mutex. - * - * We do not check for pid == 0 which can happen when - * the owner died and robust_list_exit() cleared the - * TID. + * Handle the owner died case: */ - if (pid && pi_state->owner) { + if (uval & FUTEX_OWNER_DIED) { + /* + * exit_pi_state_list sets owner to NULL and + * wakes the topmost waiter. The task which + * acquires the pi_state->rt_mutex will fixup + * owner. + */ + if (!pi_state->owner) { + /* + * No pi state owner, but the user + * space TID is not 0. Inconsistent + * state. [5] + */ + if (pid) + return -EINVAL; + /* + * Take a ref on the state and + * return. [4] + */ + goto out_state; + } + + /* + * If TID is 0, then either the dying owner + * has not yet executed exit_pi_state_list() + * or some waiter acquired the rtmutex in the + * pi state, but did not yet fixup the TID in + * user space. + * + * Take a ref on the state and return. [6] + */ + if (!pid) + goto out_state; + } else { /* - * Bail out if user space manipulated the - * futex value. + * If the owner died bit is not set, + * then the pi_state must have an + * owner. [7] */ - if (pid != task_pid_vnr(pi_state->owner)) + if (!pi_state->owner) return -EINVAL; } + /* + * Bail out if user space manipulated the + * futex value. If pi state exists then the + * owner TID must be the same as the user + * space TID. [9/10] + */ + if (pid != task_pid_vnr(pi_state->owner)) + return -EINVAL; + + out_state: atomic_inc(&pi_state->refcount); *ps = pi_state; - return 0; } } /* * We are the first waiter - try to look up the real owner and attach - * the new pi_state to it, but bail out when TID = 0 + * the new pi_state to it, but bail out when TID = 0 [1] */ if (!pid) return -ESRCH; @@ -771,6 +889,11 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, if (!p) return -ESRCH; + if (!p->mm) { + put_task_struct(p); + return -EPERM; + } + /* * We need to look at the task state flags to figure out, * whether the task is exiting. To protect against the do_exit @@ -791,6 +914,9 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, return ret; } + /* + * No existing pi state. First waiter. [2] + */ pi_state = alloc_pi_state(); /* @@ -862,10 +988,18 @@ retry: return -EDEADLK; /* - * Surprise - we got the lock. Just return to userspace: + * Surprise - we got the lock, but we do not trust user space at all. */ - if (unlikely(!curval)) - return 1; + if (unlikely(!curval)) { + /* + * We verify whether there is kernel state for this + * futex. If not, we can safely assume, that the 0 -> + * TID transition is correct. If state exists, we do + * not bother to fixup the user space state as it was + * corrupted already. + */ + return futex_top_waiter(hb, key) ? -EINVAL : 1; + } uval = curval; @@ -954,6 +1088,7 @@ static void __unqueue_futex(struct futex_q *q) hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock); plist_del(&q->list, &hb->chain); + hb_waiters_dec(hb); } /* @@ -995,6 +1130,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) struct task_struct *new_owner; struct futex_pi_state *pi_state = this->pi_state; u32 uninitialized_var(curval), newval; + int ret = 0; if (!pi_state) return -EINVAL; @@ -1018,23 +1154,19 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) new_owner = this->task; /* - * We pass it to the next owner. (The WAITERS bit is always - * kept enabled while there is PI state around. We must also - * preserve the owner died bit.) + * We pass it to the next owner. The WAITERS bit is always + * kept enabled while there is PI state around. We cleanup the + * owner died bit, because we are the owner. */ - if (!(uval & FUTEX_OWNER_DIED)) { - int ret = 0; - - newval = FUTEX_WAITERS | task_pid_vnr(new_owner); + newval = FUTEX_WAITERS | task_pid_vnr(new_owner); - if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) - ret = -EFAULT; - else if (curval != uval) - ret = -EINVAL; - if (ret) { - raw_spin_unlock(&pi_state->pi_mutex.wait_lock); - return ret; - } + if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) + ret = -EFAULT; + else if (curval != uval) + ret = -EINVAL; + if (ret) { + raw_spin_unlock(&pi_state->pi_mutex.wait_lock); + return ret; } raw_spin_lock_irq(&pi_state->owner->pi_lock); @@ -1257,7 +1389,9 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, */ if (likely(&hb1->chain != &hb2->chain)) { plist_del(&q->list, &hb1->chain); + hb_waiters_dec(hb1); plist_add(&q->list, &hb2->chain); + hb_waiters_inc(hb2); q->lock_ptr = &hb2->lock; } get_futex_key_refs(key2); @@ -1312,7 +1446,7 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, * * Return: * 0 - failed to acquire the lock atomically; - * 1 - acquired the lock; + * >0 - acquired the lock, return value is vpid of the top_waiter * <0 - error */ static int futex_proxy_trylock_atomic(u32 __user *pifutex, @@ -1323,7 +1457,7 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex, { struct futex_q *top_waiter = NULL; u32 curval; - int ret; + int ret, vpid; if (get_futex_value_locked(&curval, pifutex)) return -EFAULT; @@ -1351,11 +1485,13 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex, * the contended case or if set_waiters is 1. The pi_state is returned * in ps in contended cases. */ + vpid = task_pid_vnr(top_waiter->task); ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task, set_waiters); - if (ret == 1) + if (ret == 1) { requeue_pi_wake_futex(top_waiter, key2, hb2); - + return vpid; + } return ret; } @@ -1386,10 +1522,16 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, struct futex_pi_state *pi_state = NULL; struct futex_hash_bucket *hb1, *hb2; struct futex_q *this, *next; - u32 curval2; if (requeue_pi) { /* + * Requeue PI only works on two distinct uaddrs. This + * check is only valid for private futexes. See below. + */ + if (uaddr1 == uaddr2) + return -EINVAL; + + /* * requeue_pi requires a pi_state, try to allocate it now * without any locks in case it fails. */ @@ -1427,10 +1569,20 @@ retry: if (unlikely(ret != 0)) goto out_put_key1; + /* + * The check above which compares uaddrs is not sufficient for + * shared futexes. We need to compare the keys: + */ + if (requeue_pi && match_futex(&key1, &key2)) { + ret = -EINVAL; + goto out_put_keys; + } + hb1 = hash_futex(&key1); hb2 = hash_futex(&key2); retry_private: + hb_waiters_inc(hb2); double_lock_hb(hb1, hb2); if (likely(cmpval != NULL)) { @@ -1440,6 +1592,7 @@ retry_private: if (unlikely(ret)) { double_unlock_hb(hb1, hb2); + hb_waiters_dec(hb2); ret = get_user(curval, uaddr1); if (ret) @@ -1472,16 +1625,25 @@ retry_private: * At this point the top_waiter has either taken uaddr2 or is * waiting on it. If the former, then the pi_state will not * exist yet, look it up one more time to ensure we have a - * reference to it. + * reference to it. If the lock was taken, ret contains the + * vpid of the top waiter task. */ - if (ret == 1) { + if (ret > 0) { WARN_ON(pi_state); drop_count++; task_count++; - ret = get_futex_value_locked(&curval2, uaddr2); - if (!ret) - ret = lookup_pi_state(curval2, hb2, &key2, - &pi_state); + /* + * If we acquired the lock, then the user + * space value of uaddr2 should be vpid. It + * cannot be changed by the top waiter as it + * is blocked on hb2 lock if it tries to do + * so. If something fiddled with it behind our + * back the pi state lookup might unearth + * it. So we rather use the known value than + * rereading and handing potential crap to + * lookup_pi_state. + */ + ret = lookup_pi_state(ret, hb2, &key2, &pi_state); } switch (ret) { @@ -1489,6 +1651,7 @@ retry_private: break; case -EFAULT: double_unlock_hb(hb1, hb2); + hb_waiters_dec(hb2); put_futex_key(&key2); put_futex_key(&key1); ret = fault_in_user_writeable(uaddr2); @@ -1498,6 +1661,7 @@ retry_private: case -EAGAIN: /* The owner was exiting, try again. */ double_unlock_hb(hb1, hb2); + hb_waiters_dec(hb2); put_futex_key(&key2); put_futex_key(&key1); cond_resched(); @@ -1573,6 +1737,7 @@ retry_private: out_unlock: double_unlock_hb(hb1, hb2); + hb_waiters_dec(hb2); /* * drop_futex_key_refs() must be called outside the spinlocks. During @@ -1600,6 +1765,17 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) struct futex_hash_bucket *hb; hb = hash_futex(&q->key); + + /* + * Increment the counter before taking the lock so that + * a potential waker won't miss a to-be-slept task that is + * waiting for the spinlock. This is safe as all queue_lock() + * users end up calling queue_me(). Similarly, for housekeeping, + * decrement the counter at queue_unlock() when some error has + * occurred and we don't end up adding the task to the list. + */ + hb_waiters_inc(hb); + q->lock_ptr = &hb->lock; spin_lock(&hb->lock); /* implies MB (A) */ @@ -1611,6 +1787,7 @@ queue_unlock(struct futex_hash_bucket *hb) __releases(&hb->lock) { spin_unlock(&hb->lock); + hb_waiters_dec(hb); } /** @@ -2249,9 +2426,10 @@ retry: /* * To avoid races, try to do the TID -> 0 atomic transition * again. If it succeeds then we can return without waking - * anyone else up: + * anyone else up. We only try this if neither the waiters nor + * the owner died bit are set. */ - if (!(uval & FUTEX_OWNER_DIED) && + if (!(uval & ~FUTEX_TID_MASK) && cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0)) goto pi_faulted; /* @@ -2281,11 +2459,9 @@ retry: /* * No waiters - kernel unlocks the futex: */ - if (!(uval & FUTEX_OWNER_DIED)) { - ret = unlock_futex_pi(uaddr, uval); - if (ret == -EFAULT) - goto pi_faulted; - } + ret = unlock_futex_pi(uaddr, uval); + if (ret == -EFAULT) + goto pi_faulted; out_unlock: spin_unlock(&hb->lock); @@ -2342,6 +2518,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, * Unqueue the futex_q and determine which it was. */ plist_del(&q->list, &hb->chain); + hb_waiters_dec(hb); /* Handle spurious wakeups gracefully */ ret = -EWOULDBLOCK; @@ -2446,6 +2623,15 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, if (ret) goto out_key2; + /* + * The check above which compares uaddrs is not sufficient for + * shared futexes. We need to compare the keys: + */ + if (match_futex(&q.key, &key2)) { + ret = -EINVAL; + goto out_put_keys; + } + /* Queue the futex_q, drop the hb lock, wait for wakeup. */ futex_wait_queue_me(hb, &q, to); @@ -2843,9 +3029,28 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); } -static int __init futex_init(void) +static void __init futex_detect_cmpxchg(void) { +#ifndef CONFIG_HAVE_FUTEX_CMPXCHG u32 curval; + + /* + * This will fail and we want it. Some arch implementations do + * runtime detection of the futex_atomic_cmpxchg_inatomic() + * functionality. We want to know that before we call in any + * of the complex code paths. Also we want to prevent + * registration of robust lists in that case. NULL is + * guaranteed to fault and we get -EFAULT on functional + * implementation, the non-functional ones will return + * -ENOSYS. + */ + if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT) + futex_cmpxchg_enabled = 1; +#endif +} + +static int __init futex_init(void) +{ unsigned int futex_shift; unsigned long i; @@ -2861,20 +3066,11 @@ static int __init futex_init(void) &futex_shift, NULL, futex_hashsize, futex_hashsize); futex_hashsize = 1UL << futex_shift; - /* - * This will fail and we want it. Some arch implementations do - * runtime detection of the futex_atomic_cmpxchg_inatomic() - * functionality. We want to know that before we call in any - * of the complex code paths. Also we want to prevent - * registration of robust lists in that case. NULL is - * guaranteed to fault and we get -EFAULT on functional - * implementation, the non-functional ones will return - * -ENOSYS. - */ - if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT) - futex_cmpxchg_enabled = 1; + + futex_detect_cmpxchg(); for (i = 0; i < futex_hashsize; i++) { + atomic_set(&futex_queues[i].waiters, 0); plist_head_init(&futex_queues[i].chain); spin_lock_init(&futex_queues[i].lock); } diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index f9f44fd4d34..55c8c9349cf 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -183,7 +183,7 @@ COMPAT_SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || cmd == FUTEX_WAIT_BITSET || cmd == FUTEX_WAIT_REQUEUE_PI)) { - if (get_compat_timespec(&ts, utime)) + if (compat_get_timespec(&ts, utime)) return -EFAULT; if (!timespec_valid(&ts)) return -EINVAL; diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c index f45b75b713c..b358a802fd1 100644 --- a/kernel/gcov/base.c +++ b/kernel/gcov/base.c @@ -85,6 +85,12 @@ void __gcov_merge_ior(gcov_type *counters, unsigned int n_counters) } EXPORT_SYMBOL(__gcov_merge_ior); +void __gcov_merge_time_profile(gcov_type *counters, unsigned int n_counters) +{ + /* Unused. */ +} +EXPORT_SYMBOL(__gcov_merge_time_profile); + /** * gcov_enable_events - enable event reporting through gcov_event() * diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c index 2c6e4631c81..826ba9fb5e3 100644 --- a/kernel/gcov/gcc_4_7.c +++ b/kernel/gcov/gcc_4_7.c @@ -18,7 +18,12 @@ #include <linux/vmalloc.h> #include "gcov.h" +#if __GNUC__ == 4 && __GNUC_MINOR__ >= 9 +#define GCOV_COUNTERS 9 +#else #define GCOV_COUNTERS 8 +#endif + #define GCOV_TAG_FUNCTION_LENGTH 3 static struct gcov_info *gcov_info_head; diff --git a/kernel/groups.c b/kernel/groups.c index 90cf1c38c8e..451698f86cf 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -157,17 +157,13 @@ int groups_search(const struct group_info *group_info, kgid_t grp) * set_groups - Change a group subscription in a set of credentials * @new: The newly prepared set of credentials to alter * @group_info: The group list to install - * - * Validate a group subscription and, if valid, insert it into a set - * of credentials. */ -int set_groups(struct cred *new, struct group_info *group_info) +void set_groups(struct cred *new, struct group_info *group_info) { put_group_info(new->group_info); groups_sort(group_info); get_group_info(group_info); new->group_info = group_info; - return 0; } EXPORT_SYMBOL(set_groups); @@ -182,18 +178,12 @@ EXPORT_SYMBOL(set_groups); int set_current_groups(struct group_info *group_info) { struct cred *new; - int ret; new = prepare_creds(); if (!new) return -ENOMEM; - ret = set_groups(new, group_info); - if (ret < 0) { - abort_creds(new); - return ret; - } - + set_groups(new, group_info); return commit_creds(new); } diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 09094361dce..3ab28993f6e 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -168,19 +168,6 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, } } - -/* - * Get the preferred target CPU for NOHZ - */ -static int hrtimer_get_target(int this_cpu, int pinned) -{ -#ifdef CONFIG_NO_HZ_COMMON - if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) - return get_nohz_timer_target(); -#endif - return this_cpu; -} - /* * With HIGHRES=y we do not migrate the timer when it is expiring * before the next event on the target cpu because we cannot reprogram @@ -214,7 +201,7 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, struct hrtimer_clock_base *new_base; struct hrtimer_cpu_base *new_cpu_base; int this_cpu = smp_processor_id(); - int cpu = hrtimer_get_target(this_cpu, pinned); + int cpu = get_nohz_timer_target(pinned); int basenum = base->index; again: @@ -247,6 +234,11 @@ again: goto again; } timer->base = new_base; + } else { + if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) { + cpu = this_cpu; + goto again; + } } return new_base; } @@ -582,6 +574,23 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) cpu_base->expires_next.tv64 = expires_next.tv64; + /* + * If a hang was detected in the last timer interrupt then we + * leave the hang delay active in the hardware. We want the + * system to make progress. That also prevents the following + * scenario: + * T1 expires 50ms from now + * T2 expires 5s from now + * + * T1 is removed, so this code is called and would reprogram + * the hardware to 5s from now. Any hrtimer_start after that + * will not reprogram the hardware due to hang_detected being + * set. So we'd effectivly block all timers until the T2 event + * fires. + */ + if (cpu_base->hang_detected) + return; + if (cpu_base->expires_next.tv64 != KTIME_MAX) tick_program_event(cpu_base->expires_next, 1); } @@ -981,11 +990,8 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, /* Remove an active timer from the queue: */ ret = remove_hrtimer(timer, base); - /* Switch the timer base, if necessary: */ - new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED); - if (mode & HRTIMER_MODE_REL) { - tim = ktime_add_safe(tim, new_base->get_time()); + tim = ktime_add_safe(tim, base->get_time()); /* * CONFIG_TIME_LOW_RES is a temporary way for architectures * to signal that they simply return xtime in @@ -1000,6 +1006,9 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, hrtimer_set_expires_range_ns(timer, tim, delta_ns); + /* Switch the timer base, if necessary: */ + new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED); + timer_stats_hrtimer_set_start_info(timer); leftmost = enqueue_hrtimer(timer, new_base); @@ -1030,6 +1039,7 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, return ret; } +EXPORT_SYMBOL_GPL(__hrtimer_start_range_ns); /** * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 0b9c169d577..06db12434d7 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -52,8 +52,10 @@ unsigned int __read_mostly sysctl_hung_task_panic = static int __init hung_task_panic_setup(char *str) { - sysctl_hung_task_panic = simple_strtoul(str, NULL, 0); + int rc = kstrtouint(str, 0, &sysctl_hung_task_panic); + if (rc) + return rc; return 1; } __setup("hung_task_panic=", hung_task_panic_setup); @@ -246,5 +248,4 @@ static int __init hung_task_init(void) return 0; } - -module_init(hung_task_init); +subsys_initcall(hung_task_init); diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 4a1fef09f65..d269cecdfbf 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -5,6 +5,10 @@ menu "IRQ subsystem" config MAY_HAVE_SPARSE_IRQ bool +# Legacy support, required for itanic +config GENERIC_IRQ_LEGACY + bool + # Enable the generic irq autoprobe mechanism config GENERIC_IRQ_PROBE bool @@ -17,6 +21,11 @@ config GENERIC_IRQ_SHOW config GENERIC_IRQ_SHOW_LEVEL bool +# Facility to allocate a hardware interrupt. This is legacy support +# and should not be used in new code. Use irq domains instead. +config GENERIC_IRQ_LEGACY_ALLOC_HWIRQ + bool + # Support for delayed migration from interrupt context config GENERIC_PENDING_IRQ bool @@ -40,6 +49,7 @@ config IRQ_EDGE_EOI_HANDLER # Generic configurable interrupt chip implementation config GENERIC_IRQ_CHIP bool + select IRQ_DOMAIN # Generic irq_domain hw <--> linux irq number translation config IRQ_DOMAIN diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index dc04c166c54..a2b28a2fd7b 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -40,10 +40,9 @@ int irq_set_chip(unsigned int irq, struct irq_chip *chip) irq_put_desc_unlock(desc, flags); /* * For !CONFIG_SPARSE_IRQ make the irq show up in - * allocated_irqs. For the CONFIG_SPARSE_IRQ case, it is - * already marked, and this call is harmless. + * allocated_irqs. */ - irq_reserve_irq(irq); + irq_mark_irq(irq); return 0; } EXPORT_SYMBOL(irq_set_chip); @@ -281,6 +280,19 @@ void unmask_irq(struct irq_desc *desc) } } +void unmask_threaded_irq(struct irq_desc *desc) +{ + struct irq_chip *chip = desc->irq_data.chip; + + if (chip->flags & IRQCHIP_EOI_THREADED) + chip->irq_eoi(&desc->irq_data); + + if (chip->irq_unmask) { + chip->irq_unmask(&desc->irq_data); + irq_state_clr_masked(desc); + } +} + /* * handle_nested_irq - Handle a nested irq from a irq thread * @irq: the interrupt number @@ -435,6 +447,27 @@ static inline void preflow_handler(struct irq_desc *desc) static inline void preflow_handler(struct irq_desc *desc) { } #endif +static void cond_unmask_eoi_irq(struct irq_desc *desc, struct irq_chip *chip) +{ + if (!(desc->istate & IRQS_ONESHOT)) { + chip->irq_eoi(&desc->irq_data); + return; + } + /* + * We need to unmask in the following cases: + * - Oneshot irq which did not wake the thread (caused by a + * spurious interrupt or a primary handler handling it + * completely). + */ + if (!irqd_irq_disabled(&desc->irq_data) && + irqd_irq_masked(&desc->irq_data) && !desc->threads_oneshot) { + chip->irq_eoi(&desc->irq_data); + unmask_irq(desc); + } else if (!(chip->flags & IRQCHIP_EOI_THREADED)) { + chip->irq_eoi(&desc->irq_data); + } +} + /** * handle_fasteoi_irq - irq handler for transparent controllers * @irq: the interrupt number @@ -448,6 +481,8 @@ static inline void preflow_handler(struct irq_desc *desc) { } void handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) { + struct irq_chip *chip = desc->irq_data.chip; + raw_spin_lock(&desc->lock); if (unlikely(irqd_irq_inprogress(&desc->irq_data))) @@ -473,18 +508,14 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) preflow_handler(desc); handle_irq_event(desc); - if (desc->istate & IRQS_ONESHOT) - cond_unmask_irq(desc); + cond_unmask_eoi_irq(desc, chip); -out_eoi: - desc->irq_data.chip->irq_eoi(&desc->irq_data); -out_unlock: raw_spin_unlock(&desc->lock); return; out: - if (!(desc->irq_data.chip->flags & IRQCHIP_EOI_IF_HANDLED)) - goto out_eoi; - goto out_unlock; + if (!(chip->flags & IRQCHIP_EOI_IF_HANDLED)) + chip->irq_eoi(&desc->irq_data); + raw_spin_unlock(&desc->lock); } /** diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c index bd8e788d71e..1ef0606797c 100644 --- a/kernel/irq/devres.c +++ b/kernel/irq/devres.c @@ -73,6 +73,51 @@ int devm_request_threaded_irq(struct device *dev, unsigned int irq, EXPORT_SYMBOL(devm_request_threaded_irq); /** + * devm_request_any_context_irq - allocate an interrupt line for a managed device + * @dev: device to request interrupt for + * @irq: Interrupt line to allocate + * @handler: Function to be called when the IRQ occurs + * @thread_fn: function to be called in a threaded interrupt context. NULL + * for devices which handle everything in @handler + * @irqflags: Interrupt type flags + * @devname: An ascii name for the claiming device + * @dev_id: A cookie passed back to the handler function + * + * Except for the extra @dev argument, this function takes the + * same arguments and performs the same function as + * request_any_context_irq(). IRQs requested with this function will be + * automatically freed on driver detach. + * + * If an IRQ allocated with this function needs to be freed + * separately, devm_free_irq() must be used. + */ +int devm_request_any_context_irq(struct device *dev, unsigned int irq, + irq_handler_t handler, unsigned long irqflags, + const char *devname, void *dev_id) +{ + struct irq_devres *dr; + int rc; + + dr = devres_alloc(devm_irq_release, sizeof(struct irq_devres), + GFP_KERNEL); + if (!dr) + return -ENOMEM; + + rc = request_any_context_irq(irq, handler, irqflags, devname, dev_id); + if (rc) { + devres_free(dr); + return rc; + } + + dr->irq = irq; + dr->dev_id = dev_id; + devres_add(dev, dr); + + return 0; +} +EXPORT_SYMBOL(devm_request_any_context_irq); + +/** * devm_free_irq - free an interrupt * @dev: device to free interrupt for * @irq: Interrupt line to free diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 131ca176b49..63548027085 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -41,6 +41,7 @@ irqreturn_t no_action(int cpl, void *dev_id) { return IRQ_NONE; } +EXPORT_SYMBOL_GPL(no_action); static void warn_no_thread(unsigned int irq, struct irqaction *action) { @@ -51,7 +52,7 @@ static void warn_no_thread(unsigned int irq, struct irqaction *action) "but no thread function available.", irq, action->name); } -static void irq_wake_thread(struct irq_desc *desc, struct irqaction *action) +void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action) { /* * In case the thread crashed and was killed we just pretend that @@ -157,7 +158,7 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) break; } - irq_wake_thread(desc, action); + __irq_wake_thread(desc, action); /* Fall through to add to randomness */ case IRQ_HANDLED: diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 001fa5bab49..099ea2e0eb8 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -6,6 +6,7 @@ * of this file for your non core code. */ #include <linux/irqdesc.h> +#include <linux/kernel_stat.h> #ifdef CONFIG_SPARSE_IRQ # define IRQ_BITMAP_BITS (NR_IRQS + 8196) @@ -32,7 +33,7 @@ enum { }; /* - * Bit masks for desc->state + * Bit masks for desc->core_internal_state__do_not_mess_with_it * * IRQS_AUTODETECT - autodetection in progress * IRQS_SPURIOUS_DISABLED - was disabled due to spurious interrupt @@ -73,6 +74,13 @@ extern void irq_percpu_enable(struct irq_desc *desc, unsigned int cpu); extern void irq_percpu_disable(struct irq_desc *desc, unsigned int cpu); extern void mask_irq(struct irq_desc *desc); extern void unmask_irq(struct irq_desc *desc); +extern void unmask_threaded_irq(struct irq_desc *desc); + +#ifdef CONFIG_SPARSE_IRQ +static inline void irq_mark_irq(unsigned int irq) { } +#else +extern void irq_mark_irq(unsigned int irq); +#endif extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); @@ -82,6 +90,7 @@ irqreturn_t handle_irq_event(struct irq_desc *desc); /* Resending of interrupts :*/ void check_irq_resend(struct irq_desc *desc, unsigned int irq); bool irq_wait_for_poll(struct irq_desc *desc); +void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action); #ifdef CONFIG_PROC_FS extern void register_irq_proc(unsigned int irq, struct irq_desc *desc); @@ -179,3 +188,9 @@ static inline bool irqd_has_set(struct irq_data *d, unsigned int mask) { return d->state_use_accessors & mask; } + +static inline void kstat_incr_irqs_this_cpu(unsigned int irq, struct irq_desc *desc) +{ + __this_cpu_inc(*desc->kstat_irqs); + __this_cpu_inc(kstat.irqs_sum); +} diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 192a302d6cf..1487a123db5 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -274,10 +274,16 @@ struct irq_desc *irq_to_desc(unsigned int irq) { return (irq < NR_IRQS) ? irq_desc + irq : NULL; } +EXPORT_SYMBOL(irq_to_desc); static void free_desc(unsigned int irq) { - dynamic_irq_cleanup(irq); + struct irq_desc *desc = irq_to_desc(irq); + unsigned long flags; + + raw_spin_lock_irqsave(&desc->lock, flags); + desc_set_defaults(irq, desc, desc_node(desc), NULL); + raw_spin_unlock_irqrestore(&desc->lock, flags); } static inline int alloc_descs(unsigned int start, unsigned int cnt, int node, @@ -298,6 +304,20 @@ static int irq_expand_nr_irqs(unsigned int nr) return -ENOMEM; } +void irq_mark_irq(unsigned int irq) +{ + mutex_lock(&sparse_irq_lock); + bitmap_set(allocated_irqs, irq, 1); + mutex_unlock(&sparse_irq_lock); +} + +#ifdef CONFIG_GENERIC_IRQ_LEGACY +void irq_init_desc(unsigned int irq) +{ + free_desc(irq); +} +#endif + #endif /* !CONFIG_SPARSE_IRQ */ /** @@ -362,6 +382,13 @@ __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node, if (from > irq) return -EINVAL; from = irq; + } else { + /* + * For interrupts which are freely allocated the + * architecture can force a lower bound to the @from + * argument. x86 uses this to exclude the GSI space. + */ + from = arch_dynirq_lower_bound(from); } mutex_lock(&sparse_irq_lock); @@ -388,30 +415,56 @@ err: } EXPORT_SYMBOL_GPL(__irq_alloc_descs); +#ifdef CONFIG_GENERIC_IRQ_LEGACY_ALLOC_HWIRQ /** - * irq_reserve_irqs - mark irqs allocated - * @from: mark from irq number - * @cnt: number of irqs to mark + * irq_alloc_hwirqs - Allocate an irq descriptor and initialize the hardware + * @cnt: number of interrupts to allocate + * @node: node on which to allocate * - * Returns 0 on success or an appropriate error code + * Returns an interrupt number > 0 or 0, if the allocation fails. */ -int irq_reserve_irqs(unsigned int from, unsigned int cnt) +unsigned int irq_alloc_hwirqs(int cnt, int node) { - unsigned int start; - int ret = 0; + int i, irq = __irq_alloc_descs(-1, 0, cnt, node, NULL); - if (!cnt || (from + cnt) > nr_irqs) - return -EINVAL; + if (irq < 0) + return 0; - mutex_lock(&sparse_irq_lock); - start = bitmap_find_next_zero_area(allocated_irqs, nr_irqs, from, cnt, 0); - if (start == from) - bitmap_set(allocated_irqs, start, cnt); - else - ret = -EEXIST; - mutex_unlock(&sparse_irq_lock); - return ret; + for (i = irq; cnt > 0; i++, cnt--) { + if (arch_setup_hwirq(i, node)) + goto err; + irq_clear_status_flags(i, _IRQ_NOREQUEST); + } + return irq; + +err: + for (i--; i >= irq; i--) { + irq_set_status_flags(i, _IRQ_NOREQUEST | _IRQ_NOPROBE); + arch_teardown_hwirq(i); + } + irq_free_descs(irq, cnt); + return 0; +} +EXPORT_SYMBOL_GPL(irq_alloc_hwirqs); + +/** + * irq_free_hwirqs - Free irq descriptor and cleanup the hardware + * @from: Free from irq number + * @cnt: number of interrupts to free + * + */ +void irq_free_hwirqs(unsigned int from, int cnt) +{ + int i, j; + + for (i = from, j = cnt; j > 0; i++, j--) { + irq_set_status_flags(i, _IRQ_NOREQUEST | _IRQ_NOPROBE); + arch_teardown_hwirq(i); + } + irq_free_descs(from, cnt); } +EXPORT_SYMBOL_GPL(irq_free_hwirqs); +#endif /** * irq_get_next_irq - get next allocated irq number @@ -474,18 +527,9 @@ int irq_set_percpu_devid(unsigned int irq) return 0; } -/** - * dynamic_irq_cleanup - cleanup a dynamically allocated irq - * @irq: irq number to initialize - */ -void dynamic_irq_cleanup(unsigned int irq) +void kstat_incr_irq_this_cpu(unsigned int irq) { - struct irq_desc *desc = irq_to_desc(irq); - unsigned long flags; - - raw_spin_lock_irqsave(&desc->lock, flags); - desc_set_defaults(irq, desc, desc_node(desc), NULL); - raw_spin_unlock_irqrestore(&desc->lock, flags); + kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq)); } unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index cf68bb36fe5..eb5e10e32e0 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -10,6 +10,7 @@ #include <linux/mutex.h> #include <linux/of.h> #include <linux/of_address.h> +#include <linux/of_irq.h> #include <linux/topology.h> #include <linux/seq_file.h> #include <linux/slab.h> @@ -26,14 +27,14 @@ static struct irq_domain *irq_default_domain; * __irq_domain_add() - Allocate a new irq_domain data structure * @of_node: optional device-tree node of the interrupt controller * @size: Size of linear map; 0 for radix mapping only + * @hwirq_max: Maximum number of interrupts supported by controller * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no * direct mapping * @ops: map/unmap domain callbacks * @host_data: Controller private data pointer * - * Allocates and initialize and irq_domain structure. Caller is expected to - * register allocated irq_domain with irq_domain_register(). Returns pointer - * to IRQ domain, or NULL on failure. + * Allocates and initialize and irq_domain structure. + * Returns pointer to IRQ domain, or NULL on failure. */ struct irq_domain *__irq_domain_add(struct device_node *of_node, int size, irq_hw_number_t hwirq_max, int direct_max, diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 481a13c43b1..3dc6a61bf06 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -32,24 +32,10 @@ static int __init setup_forced_irqthreads(char *arg) early_param("threadirqs", setup_forced_irqthreads); #endif -/** - * synchronize_irq - wait for pending IRQ handlers (on other CPUs) - * @irq: interrupt number to wait for - * - * This function waits for any pending IRQ handlers for this interrupt - * to complete before returning. If you use this function while - * holding a resource the IRQ handler may need you will deadlock. - * - * This function may be called - with care - from IRQ context. - */ -void synchronize_irq(unsigned int irq) +static void __synchronize_hardirq(struct irq_desc *desc) { - struct irq_desc *desc = irq_to_desc(irq); bool inprogress; - if (!desc) - return; - do { unsigned long flags; @@ -67,12 +53,56 @@ void synchronize_irq(unsigned int irq) /* Oops, that failed? */ } while (inprogress); +} - /* - * We made sure that no hardirq handler is running. Now verify - * that no threaded handlers are active. - */ - wait_event(desc->wait_for_threads, !atomic_read(&desc->threads_active)); +/** + * synchronize_hardirq - wait for pending hard IRQ handlers (on other CPUs) + * @irq: interrupt number to wait for + * + * This function waits for any pending hard IRQ handlers for this + * interrupt to complete before returning. If you use this + * function while holding a resource the IRQ handler may need you + * will deadlock. It does not take associated threaded handlers + * into account. + * + * Do not use this for shutdown scenarios where you must be sure + * that all parts (hardirq and threaded handler) have completed. + * + * This function may be called - with care - from IRQ context. + */ +void synchronize_hardirq(unsigned int irq) +{ + struct irq_desc *desc = irq_to_desc(irq); + + if (desc) + __synchronize_hardirq(desc); +} +EXPORT_SYMBOL(synchronize_hardirq); + +/** + * synchronize_irq - wait for pending IRQ handlers (on other CPUs) + * @irq: interrupt number to wait for + * + * This function waits for any pending IRQ handlers for this interrupt + * to complete before returning. If you use this function while + * holding a resource the IRQ handler may need you will deadlock. + * + * This function may be called - with care - from IRQ context. + */ +void synchronize_irq(unsigned int irq) +{ + struct irq_desc *desc = irq_to_desc(irq); + + if (desc) { + __synchronize_hardirq(desc); + /* + * We made sure that no hardirq handler is + * running. Now verify that no threaded handlers are + * active. + */ + wait_event(desc->wait_for_threads, + !atomic_read(&desc->threads_active)); + } } EXPORT_SYMBOL(synchronize_irq); @@ -150,7 +180,7 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, struct irq_chip *chip = irq_data_get_irq_chip(data); int ret; - ret = chip->irq_set_affinity(data, mask, false); + ret = chip->irq_set_affinity(data, mask, force); switch (ret) { case IRQ_SET_MASK_OK: cpumask_copy(data->affinity, mask); @@ -162,7 +192,8 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, return ret; } -int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask) +int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask, + bool force) { struct irq_chip *chip = irq_data_get_irq_chip(data); struct irq_desc *desc = irq_data_to_desc(data); @@ -172,7 +203,7 @@ int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask) return -EINVAL; if (irq_can_move_pcntxt(data)) { - ret = irq_do_set_affinity(data, mask, false); + ret = irq_do_set_affinity(data, mask, force); } else { irqd_set_move_pending(data); irq_copy_pending(desc, mask); @@ -187,13 +218,7 @@ int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask) return ret; } -/** - * irq_set_affinity - Set the irq affinity of a given irq - * @irq: Interrupt to set affinity - * @mask: cpumask - * - */ -int irq_set_affinity(unsigned int irq, const struct cpumask *mask) +int __irq_set_affinity(unsigned int irq, const struct cpumask *mask, bool force) { struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; @@ -203,7 +228,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *mask) return -EINVAL; raw_spin_lock_irqsave(&desc->lock, flags); - ret = __irq_set_affinity_locked(irq_desc_get_irq_data(desc), mask); + ret = irq_set_affinity_locked(irq_desc_get_irq_data(desc), mask, force); raw_spin_unlock_irqrestore(&desc->lock, flags); return ret; } @@ -718,7 +743,7 @@ again: if (!desc->threads_oneshot && !irqd_irq_disabled(&desc->irq_data) && irqd_irq_masked(&desc->irq_data)) - unmask_irq(desc); + unmask_threaded_irq(desc); out_unlock: raw_spin_unlock_irq(&desc->lock); @@ -727,7 +752,7 @@ out_unlock: #ifdef CONFIG_SMP /* - * Check whether we need to chasnge the affinity of the interrupt thread. + * Check whether we need to change the affinity of the interrupt thread. */ static void irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) @@ -802,8 +827,7 @@ static irqreturn_t irq_thread_fn(struct irq_desc *desc, static void wake_threads_waitq(struct irq_desc *desc) { - if (atomic_dec_and_test(&desc->threads_active) && - waitqueue_active(&desc->wait_for_threads)) + if (atomic_dec_and_test(&desc->threads_active)) wake_up(&desc->wait_for_threads); } @@ -862,8 +886,8 @@ static int irq_thread(void *data) irq_thread_check_affinity(desc, action); action_ret = handler_fn(desc, action); - if (!noirqdebug) - note_interrupt(action->irq, desc, action_ret); + if (action_ret == IRQ_HANDLED) + atomic_inc(&desc->threads_handled); wake_threads_waitq(desc); } @@ -881,6 +905,33 @@ static int irq_thread(void *data) return 0; } +/** + * irq_wake_thread - wake the irq thread for the action identified by dev_id + * @irq: Interrupt line + * @dev_id: Device identity for which the thread should be woken + * + */ +void irq_wake_thread(unsigned int irq, void *dev_id) +{ + struct irq_desc *desc = irq_to_desc(irq); + struct irqaction *action; + unsigned long flags; + + if (!desc || WARN_ON(irq_settings_is_per_cpu_devid(desc))) + return; + + raw_spin_lock_irqsave(&desc->lock, flags); + for (action = desc->action; action; action = action->next) { + if (action->dev_id == dev_id) { + if (action->thread) + __irq_wake_thread(desc, action); + break; + } + } + raw_spin_unlock_irqrestore(&desc->lock, flags); +} +EXPORT_SYMBOL_GPL(irq_wake_thread); + static void irq_setup_forced_threading(struct irqaction *new) { if (!force_irqthreads) @@ -897,6 +948,23 @@ static void irq_setup_forced_threading(struct irqaction *new) } } +static int irq_request_resources(struct irq_desc *desc) +{ + struct irq_data *d = &desc->irq_data; + struct irq_chip *c = d->chip; + + return c->irq_request_resources ? c->irq_request_resources(d) : 0; +} + +static void irq_release_resources(struct irq_desc *desc) +{ + struct irq_data *d = &desc->irq_data; + struct irq_chip *c = d->chip; + + if (c->irq_release_resources) + c->irq_release_resources(d); +} + /* * Internal function to register an irqaction - typically used to * allocate special interrupts that are part of the architecture. @@ -1092,6 +1160,13 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) } if (!shared) { + ret = irq_request_resources(desc); + if (ret) { + pr_err("Failed to request resources for %s (irq %d) on irqchip %s\n", + new->name, irq, desc->irq_data.chip->name); + goto out_mask; + } + init_waitqueue_head(&desc->wait_for_threads); /* Setup the type (level, edge polarity) if configured: */ @@ -1262,8 +1337,10 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) *action_ptr = action->next; /* If this was the last handler, shut down the IRQ line: */ - if (!desc->action) + if (!desc->action) { irq_shutdown(desc); + irq_release_resources(desc); + } #ifdef CONFIG_SMP /* make sure affinity_hint is cleaned up */ diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 36f6ee181b0..ac1ba2f1103 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -324,15 +324,15 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc) #ifdef CONFIG_SMP /* create /proc/irq/<irq>/smp_affinity */ - proc_create_data("smp_affinity", 0600, desc->dir, + proc_create_data("smp_affinity", 0644, desc->dir, &irq_affinity_proc_fops, (void *)(long)irq); /* create /proc/irq/<irq>/affinity_hint */ - proc_create_data("affinity_hint", 0400, desc->dir, + proc_create_data("affinity_hint", 0444, desc->dir, &irq_affinity_hint_proc_fops, (void *)(long)irq); /* create /proc/irq/<irq>/smp_affinity_list */ - proc_create_data("smp_affinity_list", 0600, desc->dir, + proc_create_data("smp_affinity_list", 0644, desc->dir, &irq_affinity_list_proc_fops, (void *)(long)irq); proc_create_data("node", 0444, desc->dir, @@ -372,7 +372,7 @@ void unregister_handler_proc(unsigned int irq, struct irqaction *action) static void register_default_affinity_proc(void) { #ifdef CONFIG_SMP - proc_create("irq/default_smp_affinity", 0600, NULL, + proc_create("irq/default_smp_affinity", 0644, NULL, &default_affinity_proc_fops); #endif } diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index a1d8cc63b56..e2514b0e439 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -270,6 +270,8 @@ try_misrouted_irq(unsigned int irq, struct irq_desc *desc, return action && (action->flags & IRQF_IRQPOLL); } +#define SPURIOUS_DEFERRED 0x80000000 + void note_interrupt(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret) { @@ -277,15 +279,111 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, irq_settings_is_polled(desc)) return; - /* we get here again via the threaded handler */ - if (action_ret == IRQ_WAKE_THREAD) - return; - if (bad_action_ret(action_ret)) { report_bad_irq(irq, desc, action_ret); return; } + /* + * We cannot call note_interrupt from the threaded handler + * because we need to look at the compound of all handlers + * (primary and threaded). Aside of that in the threaded + * shared case we have no serialization against an incoming + * hardware interrupt while we are dealing with a threaded + * result. + * + * So in case a thread is woken, we just note the fact and + * defer the analysis to the next hardware interrupt. + * + * The threaded handlers store whether they sucessfully + * handled an interrupt and we check whether that number + * changed versus the last invocation. + * + * We could handle all interrupts with the delayed by one + * mechanism, but for the non forced threaded case we'd just + * add pointless overhead to the straight hardirq interrupts + * for the sake of a few lines less code. + */ + if (action_ret & IRQ_WAKE_THREAD) { + /* + * There is a thread woken. Check whether one of the + * shared primary handlers returned IRQ_HANDLED. If + * not we defer the spurious detection to the next + * interrupt. + */ + if (action_ret == IRQ_WAKE_THREAD) { + int handled; + /* + * We use bit 31 of thread_handled_last to + * denote the deferred spurious detection + * active. No locking necessary as + * thread_handled_last is only accessed here + * and we have the guarantee that hard + * interrupts are not reentrant. + */ + if (!(desc->threads_handled_last & SPURIOUS_DEFERRED)) { + desc->threads_handled_last |= SPURIOUS_DEFERRED; + return; + } + /* + * Check whether one of the threaded handlers + * returned IRQ_HANDLED since the last + * interrupt happened. + * + * For simplicity we just set bit 31, as it is + * set in threads_handled_last as well. So we + * avoid extra masking. And we really do not + * care about the high bits of the handled + * count. We just care about the count being + * different than the one we saw before. + */ + handled = atomic_read(&desc->threads_handled); + handled |= SPURIOUS_DEFERRED; + if (handled != desc->threads_handled_last) { + action_ret = IRQ_HANDLED; + /* + * Note: We keep the SPURIOUS_DEFERRED + * bit set. We are handling the + * previous invocation right now. + * Keep it for the current one, so the + * next hardware interrupt will + * account for it. + */ + desc->threads_handled_last = handled; + } else { + /* + * None of the threaded handlers felt + * responsible for the last interrupt + * + * We keep the SPURIOUS_DEFERRED bit + * set in threads_handled_last as we + * need to account for the current + * interrupt as well. + */ + action_ret = IRQ_NONE; + } + } else { + /* + * One of the primary handlers returned + * IRQ_HANDLED. So we don't care about the + * threaded handlers on the same line. Clear + * the deferred detection bit. + * + * In theory we could/should check whether the + * deferred bit is set and take the result of + * the previous run into account here as + * well. But it's really not worth the + * trouble. If every other interrupt is + * handled we never trigger the spurious + * detector. And if this is just the one out + * of 100k unhandled ones which is handled + * then we merily delay the spurious detection + * by one hard interrupt. Not a real problem. + */ + desc->threads_handled_last &= ~SPURIOUS_DEFERRED; + } + } + if (unlikely(action_ret == IRQ_NONE)) { /* * If we are seeing only the odd spurious IRQ caused by diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 55fcce6065c..a82170e2fa7 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -61,11 +61,11 @@ void __weak arch_irq_work_raise(void) * * Can be re-enqueued while the callback is still in progress. */ -void irq_work_queue(struct irq_work *work) +bool irq_work_queue(struct irq_work *work) { /* Only queue if not already pending */ if (!irq_work_claim(work)) - return; + return false; /* Queue the entry and raise the IPI if needed. */ preempt_disable(); @@ -83,6 +83,8 @@ void irq_work_queue(struct irq_work *work) } preempt_enable(); + + return true; } EXPORT_SYMBOL_GPL(irq_work_queue); diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 3127ad52cdb..cb0cf37dac3 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -23,6 +23,7 @@ #include <linux/mm.h> #include <linux/ctype.h> #include <linux/slab.h> +#include <linux/compiler.h> #include <asm/sections.h> @@ -36,8 +37,8 @@ * These will be re-linked against their real values * during the second link stage. */ -extern const unsigned long kallsyms_addresses[] __attribute__((weak)); -extern const u8 kallsyms_names[] __attribute__((weak)); +extern const unsigned long kallsyms_addresses[] __weak; +extern const u8 kallsyms_names[] __weak; /* * Tell the compiler that the count isn't in the small data section if the arch @@ -46,10 +47,10 @@ extern const u8 kallsyms_names[] __attribute__((weak)); extern const unsigned long kallsyms_num_syms __attribute__((weak, section(".rodata"))); -extern const u8 kallsyms_token_table[] __attribute__((weak)); -extern const u16 kallsyms_token_index[] __attribute__((weak)); +extern const u8 kallsyms_token_table[] __weak; +extern const u16 kallsyms_token_index[] __weak; -extern const unsigned long kallsyms_markers[] __attribute__((weak)); +extern const unsigned long kallsyms_markers[] __weak; static inline int is_kernel_inittext(unsigned long addr) { diff --git a/kernel/kexec.c b/kernel/kexec.c index 60bafbed06a..4b8f0c92588 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -32,6 +32,8 @@ #include <linux/vmalloc.h> #include <linux/swap.h> #include <linux/syscore_ops.h> +#include <linux/compiler.h> +#include <linux/hugetlb.h> #include <asm/page.h> #include <asm/uaccess.h> @@ -124,8 +126,8 @@ static struct page *kimage_alloc_page(struct kimage *image, unsigned long dest); static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, - unsigned long nr_segments, - struct kexec_segment __user *segments) + unsigned long nr_segments, + struct kexec_segment __user *segments) { size_t segment_bytes; struct kimage *image; @@ -256,13 +258,13 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, image->control_code_page = kimage_alloc_control_pages(image, get_order(KEXEC_CONTROL_PAGE_SIZE)); if (!image->control_code_page) { - printk(KERN_ERR "Could not allocate control_code_buffer\n"); + pr_err("Could not allocate control_code_buffer\n"); goto out_free; } image->swap_page = kimage_alloc_control_pages(image, 0); if (!image->swap_page) { - printk(KERN_ERR "Could not allocate swap buffer\n"); + pr_err("Could not allocate swap buffer\n"); goto out_free; } @@ -331,7 +333,7 @@ static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry, image->control_code_page = kimage_alloc_control_pages(image, get_order(KEXEC_CONTROL_PAGE_SIZE)); if (!image->control_code_page) { - printk(KERN_ERR "Could not allocate control_code_buffer\n"); + pr_err("Could not allocate control_code_buffer\n"); goto out_free; } @@ -620,8 +622,8 @@ static void kimage_terminate(struct kimage *image) #define for_each_kimage_entry(image, ptr, entry) \ for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \ - ptr = (entry & IND_INDIRECTION)? \ - phys_to_virt((entry & PAGE_MASK)): ptr +1) + ptr = (entry & IND_INDIRECTION) ? \ + phys_to_virt((entry & PAGE_MASK)) : ptr + 1) static void kimage_free_entry(kimage_entry_t entry) { @@ -649,8 +651,7 @@ static void kimage_free(struct kimage *image) * done with it. */ ind = entry; - } - else if (entry & IND_SOURCE) + } else if (entry & IND_SOURCE) kimage_free_entry(entry); } /* Free the final indirection page */ @@ -773,8 +774,7 @@ static struct page *kimage_alloc_page(struct kimage *image, addr = old_addr; page = old_page; break; - } - else { + } else { /* Place the page on the destination list I * will use it later. */ @@ -1039,10 +1039,10 @@ void __weak crash_unmap_reserved_pages(void) {} #ifdef CONFIG_COMPAT -asmlinkage long compat_sys_kexec_load(unsigned long entry, - unsigned long nr_segments, - struct compat_kexec_segment __user *segments, - unsigned long flags) +COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry, + compat_ulong_t, nr_segments, + struct compat_kexec_segment __user *, segments, + compat_ulong_t, flags) { struct compat_kexec_segment in; struct kexec_segment out, __user *ksegments; @@ -1058,7 +1058,7 @@ asmlinkage long compat_sys_kexec_load(unsigned long entry, return -EINVAL; ksegments = compat_alloc_user_space(nr_segments * sizeof(out)); - for (i=0; i < nr_segments; i++) { + for (i = 0; i < nr_segments; i++) { result = copy_from_user(&in, &segments[i], sizeof(in)); if (result) return -EFAULT; @@ -1213,14 +1213,14 @@ void crash_save_cpu(struct pt_regs *regs, int cpu) * squirrelled away. ELF notes happen to provide * all of that, so there is no need to invent something new. */ - buf = (u32*)per_cpu_ptr(crash_notes, cpu); + buf = (u32 *)per_cpu_ptr(crash_notes, cpu); if (!buf) return; memset(&prstatus, 0, sizeof(prstatus)); prstatus.pr_pid = current->pid; elf_core_copy_kernel_regs(&prstatus.pr_reg, regs); buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, - &prstatus, sizeof(prstatus)); + &prstatus, sizeof(prstatus)); final_note(buf); } @@ -1229,13 +1229,12 @@ static int __init crash_notes_memory_init(void) /* Allocate memory for saving cpu registers. */ crash_notes = alloc_percpu(note_buf_t); if (!crash_notes) { - printk("Kexec: Memory allocation for saving cpu register" - " states failed\n"); + pr_warn("Kexec: Memory allocation for saving cpu register states failed\n"); return -ENOMEM; } return 0; } -module_init(crash_notes_memory_init) +subsys_initcall(crash_notes_memory_init); /* @@ -1252,10 +1251,10 @@ module_init(crash_notes_memory_init) * * The function returns 0 on success and -EINVAL on failure. */ -static int __init parse_crashkernel_mem(char *cmdline, - unsigned long long system_ram, - unsigned long long *crash_size, - unsigned long long *crash_base) +static int __init parse_crashkernel_mem(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base) { char *cur = cmdline, *tmp; @@ -1266,12 +1265,12 @@ static int __init parse_crashkernel_mem(char *cmdline, /* get the start of the range */ start = memparse(cur, &tmp); if (cur == tmp) { - pr_warning("crashkernel: Memory value expected\n"); + pr_warn("crashkernel: Memory value expected\n"); return -EINVAL; } cur = tmp; if (*cur != '-') { - pr_warning("crashkernel: '-' expected\n"); + pr_warn("crashkernel: '-' expected\n"); return -EINVAL; } cur++; @@ -1280,31 +1279,30 @@ static int __init parse_crashkernel_mem(char *cmdline, if (*cur != ':') { end = memparse(cur, &tmp); if (cur == tmp) { - pr_warning("crashkernel: Memory " - "value expected\n"); + pr_warn("crashkernel: Memory value expected\n"); return -EINVAL; } cur = tmp; if (end <= start) { - pr_warning("crashkernel: end <= start\n"); + pr_warn("crashkernel: end <= start\n"); return -EINVAL; } } if (*cur != ':') { - pr_warning("crashkernel: ':' expected\n"); + pr_warn("crashkernel: ':' expected\n"); return -EINVAL; } cur++; size = memparse(cur, &tmp); if (cur == tmp) { - pr_warning("Memory value expected\n"); + pr_warn("Memory value expected\n"); return -EINVAL; } cur = tmp; if (size >= system_ram) { - pr_warning("crashkernel: invalid size\n"); + pr_warn("crashkernel: invalid size\n"); return -EINVAL; } @@ -1322,8 +1320,7 @@ static int __init parse_crashkernel_mem(char *cmdline, cur++; *crash_base = memparse(cur, &tmp); if (cur == tmp) { - pr_warning("Memory value expected " - "after '@'\n"); + pr_warn("Memory value expected after '@'\n"); return -EINVAL; } } @@ -1335,26 +1332,26 @@ static int __init parse_crashkernel_mem(char *cmdline, /* * That function parses "simple" (old) crashkernel command lines like * - * crashkernel=size[@offset] + * crashkernel=size[@offset] * * It returns 0 on success and -EINVAL on failure. */ -static int __init parse_crashkernel_simple(char *cmdline, - unsigned long long *crash_size, - unsigned long long *crash_base) +static int __init parse_crashkernel_simple(char *cmdline, + unsigned long long *crash_size, + unsigned long long *crash_base) { char *cur = cmdline; *crash_size = memparse(cmdline, &cur); if (cmdline == cur) { - pr_warning("crashkernel: memory value expected\n"); + pr_warn("crashkernel: memory value expected\n"); return -EINVAL; } if (*cur == '@') *crash_base = memparse(cur+1, &cur); else if (*cur != ' ' && *cur != '\0') { - pr_warning("crashkernel: unrecognized char\n"); + pr_warn("crashkernel: unrecognized char\n"); return -EINVAL; } @@ -1551,10 +1548,10 @@ void vmcoreinfo_append_str(const char *fmt, ...) * provide an empty default implementation here -- architecture * code may override this */ -void __attribute__ ((weak)) arch_crash_save_vmcoreinfo(void) +void __weak arch_crash_save_vmcoreinfo(void) {} -unsigned long __attribute__ ((weak)) paddr_vmcoreinfo_note(void) +unsigned long __weak paddr_vmcoreinfo_note(void) { return __pa((unsigned long)(char *)&vmcoreinfo_note); } @@ -1621,7 +1618,11 @@ static int __init crash_save_vmcoreinfo_init(void) #ifdef CONFIG_MEMORY_FAILURE VMCOREINFO_NUMBER(PG_hwpoison); #endif + VMCOREINFO_NUMBER(PG_head_mask); VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); +#ifdef CONFIG_HUGETLBFS + VMCOREINFO_SYMBOL(free_huge_page); +#endif arch_crash_save_vmcoreinfo(); update_vmcoreinfo_note(); @@ -1629,7 +1630,7 @@ static int __init crash_save_vmcoreinfo_init(void) return 0; } -module_init(crash_save_vmcoreinfo_init) +subsys_initcall(crash_save_vmcoreinfo_init); /* * Move into place and start executing a preloaded standalone @@ -1682,7 +1683,15 @@ int kernel_kexec(void) kexec_in_progress = true; kernel_restart_prepare(NULL); migrate_to_reboot_cpu(); - printk(KERN_EMERG "Starting new kernel\n"); + + /* + * migrate_to_reboot_cpu() disables CPU hotplug assuming that + * no further code needs to use CPU hotplug (which is true in + * the reboot case). However, the kexec path depends on using + * CPU hotplug again; so re-enable it here. + */ + cpu_hotplug_enable(); + pr_emerg("Starting new kernel\n"); machine_shutdown(); } diff --git a/kernel/kmod.c b/kernel/kmod.c index b086006c59e..8637e041a24 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -239,7 +239,7 @@ static int ____call_usermodehelper(void *data) commit_creds(new); - retval = do_execve(sub_info->path, + retval = do_execve(getname_kernel(sub_info->path), (const char __user *const __user *)sub_info->argv, (const char __user *const __user *)sub_info->envp); if (!retval) @@ -285,10 +285,7 @@ static int wait_for_helper(void *data) pid_t pid; /* If SIGCLD is ignored sys_wait4 won't populate the status. */ - spin_lock_irq(¤t->sighand->siglock); - current->sighand->action[SIGCHLD-1].sa.sa_handler = SIG_DFL; - spin_unlock_irq(¤t->sighand->siglock); - + kernel_sigaction(SIGCHLD, SIG_DFL); pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD); if (pid < 0) { sub_info->retval = pid; @@ -498,7 +495,7 @@ int __usermodehelper_disable(enum umh_disable_depth depth) static void helper_lock(void) { atomic_inc(&running_helpers); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); } static void helper_unlock(void) diff --git a/kernel/kprobes.c b/kernel/kprobes.c index ceeadfcabb7..734e9a7d280 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -86,21 +86,8 @@ static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash) return &(kretprobe_table_locks[hash].lock); } -/* - * Normally, functions that we'd want to prohibit kprobes in, are marked - * __kprobes. But, there are cases where such functions already belong to - * a different section (__sched for preempt_schedule) - * - * For such cases, we now have a blacklist - */ -static struct kprobe_blackpoint kprobe_blacklist[] = { - {"preempt_schedule",}, - {"native_get_debugreg",}, - {"irq_entries_start",}, - {"common_interrupt",}, - {"mcount",}, /* mcount can be called from everywhere */ - {NULL} /* Terminator */ -}; +/* Blacklist -- list of struct kprobe_blacklist_entry */ +static LIST_HEAD(kprobe_blacklist); #ifdef __ARCH_WANT_KPROBES_INSN_SLOT /* @@ -151,13 +138,13 @@ struct kprobe_insn_cache kprobe_insn_slots = { .insn_size = MAX_INSN_SIZE, .nr_garbage = 0, }; -static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c); +static int collect_garbage_slots(struct kprobe_insn_cache *c); /** * __get_insn_slot() - Find a slot on an executable page for an instruction. * We allocate an executable page if there's no room on existing ones. */ -kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c) +kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c) { struct kprobe_insn_page *kip; kprobe_opcode_t *slot = NULL; @@ -214,7 +201,7 @@ out: } /* Return 1 if all garbages are collected, otherwise 0. */ -static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx) +static int collect_one_slot(struct kprobe_insn_page *kip, int idx) { kip->slot_used[idx] = SLOT_CLEAN; kip->nused--; @@ -235,7 +222,7 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx) return 0; } -static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c) +static int collect_garbage_slots(struct kprobe_insn_cache *c) { struct kprobe_insn_page *kip, *next; @@ -257,8 +244,8 @@ static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c) return 0; } -void __kprobes __free_insn_slot(struct kprobe_insn_cache *c, - kprobe_opcode_t *slot, int dirty) +void __free_insn_slot(struct kprobe_insn_cache *c, + kprobe_opcode_t *slot, int dirty) { struct kprobe_insn_page *kip; @@ -314,7 +301,7 @@ static inline void reset_kprobe_instance(void) * OR * - with preemption disabled - from arch/xxx/kernel/kprobes.c */ -struct kprobe __kprobes *get_kprobe(void *addr) +struct kprobe *get_kprobe(void *addr) { struct hlist_head *head; struct kprobe *p; @@ -327,8 +314,9 @@ struct kprobe __kprobes *get_kprobe(void *addr) return NULL; } +NOKPROBE_SYMBOL(get_kprobe); -static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs); +static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs); /* Return true if the kprobe is an aggregator */ static inline int kprobe_aggrprobe(struct kprobe *p) @@ -360,7 +348,7 @@ static bool kprobes_allow_optimization; * Call all pre_handler on the list, but ignores its return value. * This must be called from arch-dep optimized caller. */ -void __kprobes opt_pre_handler(struct kprobe *p, struct pt_regs *regs) +void opt_pre_handler(struct kprobe *p, struct pt_regs *regs) { struct kprobe *kp; @@ -372,9 +360,10 @@ void __kprobes opt_pre_handler(struct kprobe *p, struct pt_regs *regs) reset_kprobe_instance(); } } +NOKPROBE_SYMBOL(opt_pre_handler); /* Free optimized instructions and optimized_kprobe */ -static __kprobes void free_aggr_kprobe(struct kprobe *p) +static void free_aggr_kprobe(struct kprobe *p) { struct optimized_kprobe *op; @@ -412,7 +401,7 @@ static inline int kprobe_disarmed(struct kprobe *p) } /* Return true(!0) if the probe is queued on (un)optimizing lists */ -static int __kprobes kprobe_queued(struct kprobe *p) +static int kprobe_queued(struct kprobe *p) { struct optimized_kprobe *op; @@ -428,7 +417,7 @@ static int __kprobes kprobe_queued(struct kprobe *p) * Return an optimized kprobe whose optimizing code replaces * instructions including addr (exclude breakpoint). */ -static struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr) +static struct kprobe *get_optimized_kprobe(unsigned long addr) { int i; struct kprobe *p = NULL; @@ -460,7 +449,7 @@ static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer); * Optimize (replace a breakpoint with a jump) kprobes listed on * optimizing_list. */ -static __kprobes void do_optimize_kprobes(void) +static void do_optimize_kprobes(void) { /* Optimization never be done when disarmed */ if (kprobes_all_disarmed || !kprobes_allow_optimization || @@ -488,7 +477,7 @@ static __kprobes void do_optimize_kprobes(void) * Unoptimize (replace a jump with a breakpoint and remove the breakpoint * if need) kprobes listed on unoptimizing_list. */ -static __kprobes void do_unoptimize_kprobes(void) +static void do_unoptimize_kprobes(void) { struct optimized_kprobe *op, *tmp; @@ -520,7 +509,7 @@ static __kprobes void do_unoptimize_kprobes(void) } /* Reclaim all kprobes on the free_list */ -static __kprobes void do_free_cleaned_kprobes(void) +static void do_free_cleaned_kprobes(void) { struct optimized_kprobe *op, *tmp; @@ -532,13 +521,13 @@ static __kprobes void do_free_cleaned_kprobes(void) } /* Start optimizer after OPTIMIZE_DELAY passed */ -static __kprobes void kick_kprobe_optimizer(void) +static void kick_kprobe_optimizer(void) { schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY); } /* Kprobe jump optimizer */ -static __kprobes void kprobe_optimizer(struct work_struct *work) +static void kprobe_optimizer(struct work_struct *work) { mutex_lock(&kprobe_mutex); /* Lock modules while optimizing kprobes */ @@ -574,7 +563,7 @@ static __kprobes void kprobe_optimizer(struct work_struct *work) } /* Wait for completing optimization and unoptimization */ -static __kprobes void wait_for_kprobe_optimizer(void) +static void wait_for_kprobe_optimizer(void) { mutex_lock(&kprobe_mutex); @@ -593,7 +582,7 @@ static __kprobes void wait_for_kprobe_optimizer(void) } /* Optimize kprobe if p is ready to be optimized */ -static __kprobes void optimize_kprobe(struct kprobe *p) +static void optimize_kprobe(struct kprobe *p) { struct optimized_kprobe *op; @@ -627,7 +616,7 @@ static __kprobes void optimize_kprobe(struct kprobe *p) } /* Short cut to direct unoptimizing */ -static __kprobes void force_unoptimize_kprobe(struct optimized_kprobe *op) +static void force_unoptimize_kprobe(struct optimized_kprobe *op) { get_online_cpus(); arch_unoptimize_kprobe(op); @@ -637,7 +626,7 @@ static __kprobes void force_unoptimize_kprobe(struct optimized_kprobe *op) } /* Unoptimize a kprobe if p is optimized */ -static __kprobes void unoptimize_kprobe(struct kprobe *p, bool force) +static void unoptimize_kprobe(struct kprobe *p, bool force) { struct optimized_kprobe *op; @@ -697,7 +686,7 @@ static void reuse_unused_kprobe(struct kprobe *ap) } /* Remove optimized instructions */ -static void __kprobes kill_optimized_kprobe(struct kprobe *p) +static void kill_optimized_kprobe(struct kprobe *p) { struct optimized_kprobe *op; @@ -723,7 +712,7 @@ static void __kprobes kill_optimized_kprobe(struct kprobe *p) } /* Try to prepare optimized instructions */ -static __kprobes void prepare_optimized_kprobe(struct kprobe *p) +static void prepare_optimized_kprobe(struct kprobe *p) { struct optimized_kprobe *op; @@ -732,7 +721,7 @@ static __kprobes void prepare_optimized_kprobe(struct kprobe *p) } /* Allocate new optimized_kprobe and try to prepare optimized instructions */ -static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p) +static struct kprobe *alloc_aggr_kprobe(struct kprobe *p) { struct optimized_kprobe *op; @@ -747,13 +736,13 @@ static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p) return &op->kp; } -static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p); +static void init_aggr_kprobe(struct kprobe *ap, struct kprobe *p); /* * Prepare an optimized_kprobe and optimize it * NOTE: p must be a normal registered kprobe */ -static __kprobes void try_to_optimize_kprobe(struct kprobe *p) +static void try_to_optimize_kprobe(struct kprobe *p) { struct kprobe *ap; struct optimized_kprobe *op; @@ -787,7 +776,7 @@ out: } #ifdef CONFIG_SYSCTL -static void __kprobes optimize_all_kprobes(void) +static void optimize_all_kprobes(void) { struct hlist_head *head; struct kprobe *p; @@ -810,7 +799,7 @@ out: mutex_unlock(&kprobe_mutex); } -static void __kprobes unoptimize_all_kprobes(void) +static void unoptimize_all_kprobes(void) { struct hlist_head *head; struct kprobe *p; @@ -861,7 +850,7 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write, #endif /* CONFIG_SYSCTL */ /* Put a breakpoint for a probe. Must be called with text_mutex locked */ -static void __kprobes __arm_kprobe(struct kprobe *p) +static void __arm_kprobe(struct kprobe *p) { struct kprobe *_p; @@ -876,7 +865,7 @@ static void __kprobes __arm_kprobe(struct kprobe *p) } /* Remove the breakpoint of a probe. Must be called with text_mutex locked */ -static void __kprobes __disarm_kprobe(struct kprobe *p, bool reopt) +static void __disarm_kprobe(struct kprobe *p, bool reopt) { struct kprobe *_p; @@ -911,13 +900,13 @@ static void reuse_unused_kprobe(struct kprobe *ap) BUG_ON(kprobe_unused(ap)); } -static __kprobes void free_aggr_kprobe(struct kprobe *p) +static void free_aggr_kprobe(struct kprobe *p) { arch_remove_kprobe(p); kfree(p); } -static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p) +static struct kprobe *alloc_aggr_kprobe(struct kprobe *p) { return kzalloc(sizeof(struct kprobe), GFP_KERNEL); } @@ -931,7 +920,7 @@ static struct ftrace_ops kprobe_ftrace_ops __read_mostly = { static int kprobe_ftrace_enabled; /* Must ensure p->addr is really on ftrace */ -static int __kprobes prepare_kprobe(struct kprobe *p) +static int prepare_kprobe(struct kprobe *p) { if (!kprobe_ftrace(p)) return arch_prepare_kprobe(p); @@ -940,7 +929,7 @@ static int __kprobes prepare_kprobe(struct kprobe *p) } /* Caller must lock kprobe_mutex */ -static void __kprobes arm_kprobe_ftrace(struct kprobe *p) +static void arm_kprobe_ftrace(struct kprobe *p) { int ret; @@ -955,7 +944,7 @@ static void __kprobes arm_kprobe_ftrace(struct kprobe *p) } /* Caller must lock kprobe_mutex */ -static void __kprobes disarm_kprobe_ftrace(struct kprobe *p) +static void disarm_kprobe_ftrace(struct kprobe *p) { int ret; @@ -975,7 +964,7 @@ static void __kprobes disarm_kprobe_ftrace(struct kprobe *p) #endif /* Arm a kprobe with text_mutex */ -static void __kprobes arm_kprobe(struct kprobe *kp) +static void arm_kprobe(struct kprobe *kp) { if (unlikely(kprobe_ftrace(kp))) { arm_kprobe_ftrace(kp); @@ -992,7 +981,7 @@ static void __kprobes arm_kprobe(struct kprobe *kp) } /* Disarm a kprobe with text_mutex */ -static void __kprobes disarm_kprobe(struct kprobe *kp, bool reopt) +static void disarm_kprobe(struct kprobe *kp, bool reopt) { if (unlikely(kprobe_ftrace(kp))) { disarm_kprobe_ftrace(kp); @@ -1008,7 +997,7 @@ static void __kprobes disarm_kprobe(struct kprobe *kp, bool reopt) * Aggregate handlers for multiple kprobes support - these handlers * take care of invoking the individual kprobe handlers on p->list */ -static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs) +static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs) { struct kprobe *kp; @@ -1022,9 +1011,10 @@ static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs) } return 0; } +NOKPROBE_SYMBOL(aggr_pre_handler); -static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs, - unsigned long flags) +static void aggr_post_handler(struct kprobe *p, struct pt_regs *regs, + unsigned long flags) { struct kprobe *kp; @@ -1036,9 +1026,10 @@ static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs, } } } +NOKPROBE_SYMBOL(aggr_post_handler); -static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, - int trapnr) +static int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, + int trapnr) { struct kprobe *cur = __this_cpu_read(kprobe_instance); @@ -1052,8 +1043,9 @@ static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, } return 0; } +NOKPROBE_SYMBOL(aggr_fault_handler); -static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs) +static int aggr_break_handler(struct kprobe *p, struct pt_regs *regs) { struct kprobe *cur = __this_cpu_read(kprobe_instance); int ret = 0; @@ -1065,9 +1057,10 @@ static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs) reset_kprobe_instance(); return ret; } +NOKPROBE_SYMBOL(aggr_break_handler); /* Walks the list and increments nmissed count for multiprobe case */ -void __kprobes kprobes_inc_nmissed_count(struct kprobe *p) +void kprobes_inc_nmissed_count(struct kprobe *p) { struct kprobe *kp; if (!kprobe_aggrprobe(p)) { @@ -1078,9 +1071,10 @@ void __kprobes kprobes_inc_nmissed_count(struct kprobe *p) } return; } +NOKPROBE_SYMBOL(kprobes_inc_nmissed_count); -void __kprobes recycle_rp_inst(struct kretprobe_instance *ri, - struct hlist_head *head) +void recycle_rp_inst(struct kretprobe_instance *ri, + struct hlist_head *head) { struct kretprobe *rp = ri->rp; @@ -1095,8 +1089,9 @@ void __kprobes recycle_rp_inst(struct kretprobe_instance *ri, /* Unregistering */ hlist_add_head(&ri->hlist, head); } +NOKPROBE_SYMBOL(recycle_rp_inst); -void __kprobes kretprobe_hash_lock(struct task_struct *tsk, +void kretprobe_hash_lock(struct task_struct *tsk, struct hlist_head **head, unsigned long *flags) __acquires(hlist_lock) { @@ -1107,17 +1102,19 @@ __acquires(hlist_lock) hlist_lock = kretprobe_table_lock_ptr(hash); raw_spin_lock_irqsave(hlist_lock, *flags); } +NOKPROBE_SYMBOL(kretprobe_hash_lock); -static void __kprobes kretprobe_table_lock(unsigned long hash, - unsigned long *flags) +static void kretprobe_table_lock(unsigned long hash, + unsigned long *flags) __acquires(hlist_lock) { raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); raw_spin_lock_irqsave(hlist_lock, *flags); } +NOKPROBE_SYMBOL(kretprobe_table_lock); -void __kprobes kretprobe_hash_unlock(struct task_struct *tsk, - unsigned long *flags) +void kretprobe_hash_unlock(struct task_struct *tsk, + unsigned long *flags) __releases(hlist_lock) { unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); @@ -1126,14 +1123,16 @@ __releases(hlist_lock) hlist_lock = kretprobe_table_lock_ptr(hash); raw_spin_unlock_irqrestore(hlist_lock, *flags); } +NOKPROBE_SYMBOL(kretprobe_hash_unlock); -static void __kprobes kretprobe_table_unlock(unsigned long hash, - unsigned long *flags) +static void kretprobe_table_unlock(unsigned long hash, + unsigned long *flags) __releases(hlist_lock) { raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); raw_spin_unlock_irqrestore(hlist_lock, *flags); } +NOKPROBE_SYMBOL(kretprobe_table_unlock); /* * This function is called from finish_task_switch when task tk becomes dead, @@ -1141,7 +1140,7 @@ __releases(hlist_lock) * with this task. These left over instances represent probed functions * that have been called but will never return. */ -void __kprobes kprobe_flush_task(struct task_struct *tk) +void kprobe_flush_task(struct task_struct *tk) { struct kretprobe_instance *ri; struct hlist_head *head, empty_rp; @@ -1166,6 +1165,7 @@ void __kprobes kprobe_flush_task(struct task_struct *tk) kfree(ri); } } +NOKPROBE_SYMBOL(kprobe_flush_task); static inline void free_rp_inst(struct kretprobe *rp) { @@ -1178,7 +1178,7 @@ static inline void free_rp_inst(struct kretprobe *rp) } } -static void __kprobes cleanup_rp_inst(struct kretprobe *rp) +static void cleanup_rp_inst(struct kretprobe *rp) { unsigned long flags, hash; struct kretprobe_instance *ri; @@ -1197,12 +1197,13 @@ static void __kprobes cleanup_rp_inst(struct kretprobe *rp) } free_rp_inst(rp); } +NOKPROBE_SYMBOL(cleanup_rp_inst); /* * Add the new probe to ap->list. Fail if this is the * second jprobe at the address - two jprobes can't coexist */ -static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p) +static int add_new_kprobe(struct kprobe *ap, struct kprobe *p) { BUG_ON(kprobe_gone(ap) || kprobe_gone(p)); @@ -1226,7 +1227,7 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p) * Fill in the required fields of the "manager kprobe". Replace the * earlier kprobe in the hlist with the manager kprobe */ -static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p) +static void init_aggr_kprobe(struct kprobe *ap, struct kprobe *p) { /* Copy p's insn slot to ap */ copy_kprobe(p, ap); @@ -1252,8 +1253,7 @@ static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p) * This is the second or subsequent kprobe at the address - handle * the intricacies */ -static int __kprobes register_aggr_kprobe(struct kprobe *orig_p, - struct kprobe *p) +static int register_aggr_kprobe(struct kprobe *orig_p, struct kprobe *p) { int ret = 0; struct kprobe *ap = orig_p; @@ -1324,25 +1324,29 @@ out: return ret; } -static int __kprobes in_kprobes_functions(unsigned long addr) +bool __weak arch_within_kprobe_blacklist(unsigned long addr) { - struct kprobe_blackpoint *kb; + /* The __kprobes marked functions and entry code must not be probed */ + return addr >= (unsigned long)__kprobes_text_start && + addr < (unsigned long)__kprobes_text_end; +} - if (addr >= (unsigned long)__kprobes_text_start && - addr < (unsigned long)__kprobes_text_end) - return -EINVAL; +static bool within_kprobe_blacklist(unsigned long addr) +{ + struct kprobe_blacklist_entry *ent; + + if (arch_within_kprobe_blacklist(addr)) + return true; /* * If there exists a kprobe_blacklist, verify and * fail any probe registration in the prohibited area */ - for (kb = kprobe_blacklist; kb->name != NULL; kb++) { - if (kb->start_addr) { - if (addr >= kb->start_addr && - addr < (kb->start_addr + kb->range)) - return -EINVAL; - } + list_for_each_entry(ent, &kprobe_blacklist, list) { + if (addr >= ent->start_addr && addr < ent->end_addr) + return true; } - return 0; + + return false; } /* @@ -1351,7 +1355,7 @@ static int __kprobes in_kprobes_functions(unsigned long addr) * This returns encoded errors if it fails to look up symbol or invalid * combination of parameters. */ -static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p) +static kprobe_opcode_t *kprobe_addr(struct kprobe *p) { kprobe_opcode_t *addr = p->addr; @@ -1374,7 +1378,7 @@ invalid: } /* Check passed kprobe is valid and return kprobe in kprobe_table. */ -static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p) +static struct kprobe *__get_valid_kprobe(struct kprobe *p) { struct kprobe *ap, *list_p; @@ -1406,8 +1410,8 @@ static inline int check_kprobe_rereg(struct kprobe *p) return ret; } -static __kprobes int check_kprobe_address_safe(struct kprobe *p, - struct module **probed_mod) +static int check_kprobe_address_safe(struct kprobe *p, + struct module **probed_mod) { int ret = 0; unsigned long ftrace_addr; @@ -1433,7 +1437,7 @@ static __kprobes int check_kprobe_address_safe(struct kprobe *p, /* Ensure it is not in reserved area nor out of text */ if (!kernel_text_address((unsigned long) p->addr) || - in_kprobes_functions((unsigned long) p->addr) || + within_kprobe_blacklist((unsigned long) p->addr) || jump_label_text_reserved(p->addr, p->addr)) { ret = -EINVAL; goto out; @@ -1469,7 +1473,7 @@ out: return ret; } -int __kprobes register_kprobe(struct kprobe *p) +int register_kprobe(struct kprobe *p) { int ret; struct kprobe *old_p; @@ -1531,7 +1535,7 @@ out: EXPORT_SYMBOL_GPL(register_kprobe); /* Check if all probes on the aggrprobe are disabled */ -static int __kprobes aggr_kprobe_disabled(struct kprobe *ap) +static int aggr_kprobe_disabled(struct kprobe *ap) { struct kprobe *kp; @@ -1547,7 +1551,7 @@ static int __kprobes aggr_kprobe_disabled(struct kprobe *ap) } /* Disable one kprobe: Make sure called under kprobe_mutex is locked */ -static struct kprobe *__kprobes __disable_kprobe(struct kprobe *p) +static struct kprobe *__disable_kprobe(struct kprobe *p) { struct kprobe *orig_p; @@ -1574,7 +1578,7 @@ static struct kprobe *__kprobes __disable_kprobe(struct kprobe *p) /* * Unregister a kprobe without a scheduler synchronization. */ -static int __kprobes __unregister_kprobe_top(struct kprobe *p) +static int __unregister_kprobe_top(struct kprobe *p) { struct kprobe *ap, *list_p; @@ -1631,7 +1635,7 @@ disarmed: return 0; } -static void __kprobes __unregister_kprobe_bottom(struct kprobe *p) +static void __unregister_kprobe_bottom(struct kprobe *p) { struct kprobe *ap; @@ -1647,7 +1651,7 @@ static void __kprobes __unregister_kprobe_bottom(struct kprobe *p) /* Otherwise, do nothing. */ } -int __kprobes register_kprobes(struct kprobe **kps, int num) +int register_kprobes(struct kprobe **kps, int num) { int i, ret = 0; @@ -1665,13 +1669,13 @@ int __kprobes register_kprobes(struct kprobe **kps, int num) } EXPORT_SYMBOL_GPL(register_kprobes); -void __kprobes unregister_kprobe(struct kprobe *p) +void unregister_kprobe(struct kprobe *p) { unregister_kprobes(&p, 1); } EXPORT_SYMBOL_GPL(unregister_kprobe); -void __kprobes unregister_kprobes(struct kprobe **kps, int num) +void unregister_kprobes(struct kprobe **kps, int num) { int i; @@ -1700,7 +1704,7 @@ unsigned long __weak arch_deref_entry_point(void *entry) return (unsigned long)entry; } -int __kprobes register_jprobes(struct jprobe **jps, int num) +int register_jprobes(struct jprobe **jps, int num) { struct jprobe *jp; int ret = 0, i; @@ -1731,19 +1735,19 @@ int __kprobes register_jprobes(struct jprobe **jps, int num) } EXPORT_SYMBOL_GPL(register_jprobes); -int __kprobes register_jprobe(struct jprobe *jp) +int register_jprobe(struct jprobe *jp) { return register_jprobes(&jp, 1); } EXPORT_SYMBOL_GPL(register_jprobe); -void __kprobes unregister_jprobe(struct jprobe *jp) +void unregister_jprobe(struct jprobe *jp) { unregister_jprobes(&jp, 1); } EXPORT_SYMBOL_GPL(unregister_jprobe); -void __kprobes unregister_jprobes(struct jprobe **jps, int num) +void unregister_jprobes(struct jprobe **jps, int num) { int i; @@ -1768,8 +1772,7 @@ EXPORT_SYMBOL_GPL(unregister_jprobes); * This kprobe pre_handler is registered with every kretprobe. When probe * hits it will set up the return probe. */ -static int __kprobes pre_handler_kretprobe(struct kprobe *p, - struct pt_regs *regs) +static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) { struct kretprobe *rp = container_of(p, struct kretprobe, kp); unsigned long hash, flags = 0; @@ -1807,8 +1810,9 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p, } return 0; } +NOKPROBE_SYMBOL(pre_handler_kretprobe); -int __kprobes register_kretprobe(struct kretprobe *rp) +int register_kretprobe(struct kretprobe *rp) { int ret = 0; struct kretprobe_instance *inst; @@ -1861,7 +1865,7 @@ int __kprobes register_kretprobe(struct kretprobe *rp) } EXPORT_SYMBOL_GPL(register_kretprobe); -int __kprobes register_kretprobes(struct kretprobe **rps, int num) +int register_kretprobes(struct kretprobe **rps, int num) { int ret = 0, i; @@ -1879,13 +1883,13 @@ int __kprobes register_kretprobes(struct kretprobe **rps, int num) } EXPORT_SYMBOL_GPL(register_kretprobes); -void __kprobes unregister_kretprobe(struct kretprobe *rp) +void unregister_kretprobe(struct kretprobe *rp) { unregister_kretprobes(&rp, 1); } EXPORT_SYMBOL_GPL(unregister_kretprobe); -void __kprobes unregister_kretprobes(struct kretprobe **rps, int num) +void unregister_kretprobes(struct kretprobe **rps, int num) { int i; @@ -1908,38 +1912,38 @@ void __kprobes unregister_kretprobes(struct kretprobe **rps, int num) EXPORT_SYMBOL_GPL(unregister_kretprobes); #else /* CONFIG_KRETPROBES */ -int __kprobes register_kretprobe(struct kretprobe *rp) +int register_kretprobe(struct kretprobe *rp) { return -ENOSYS; } EXPORT_SYMBOL_GPL(register_kretprobe); -int __kprobes register_kretprobes(struct kretprobe **rps, int num) +int register_kretprobes(struct kretprobe **rps, int num) { return -ENOSYS; } EXPORT_SYMBOL_GPL(register_kretprobes); -void __kprobes unregister_kretprobe(struct kretprobe *rp) +void unregister_kretprobe(struct kretprobe *rp) { } EXPORT_SYMBOL_GPL(unregister_kretprobe); -void __kprobes unregister_kretprobes(struct kretprobe **rps, int num) +void unregister_kretprobes(struct kretprobe **rps, int num) { } EXPORT_SYMBOL_GPL(unregister_kretprobes); -static int __kprobes pre_handler_kretprobe(struct kprobe *p, - struct pt_regs *regs) +static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) { return 0; } +NOKPROBE_SYMBOL(pre_handler_kretprobe); #endif /* CONFIG_KRETPROBES */ /* Set the kprobe gone and remove its instruction buffer. */ -static void __kprobes kill_kprobe(struct kprobe *p) +static void kill_kprobe(struct kprobe *p) { struct kprobe *kp; @@ -1963,7 +1967,7 @@ static void __kprobes kill_kprobe(struct kprobe *p) } /* Disable one kprobe */ -int __kprobes disable_kprobe(struct kprobe *kp) +int disable_kprobe(struct kprobe *kp) { int ret = 0; @@ -1979,7 +1983,7 @@ int __kprobes disable_kprobe(struct kprobe *kp) EXPORT_SYMBOL_GPL(disable_kprobe); /* Enable one kprobe */ -int __kprobes enable_kprobe(struct kprobe *kp) +int enable_kprobe(struct kprobe *kp) { int ret = 0; struct kprobe *p; @@ -2012,16 +2016,53 @@ out: } EXPORT_SYMBOL_GPL(enable_kprobe); -void __kprobes dump_kprobe(struct kprobe *kp) +void dump_kprobe(struct kprobe *kp) { printk(KERN_WARNING "Dumping kprobe:\n"); printk(KERN_WARNING "Name: %s\nAddress: %p\nOffset: %x\n", kp->symbol_name, kp->addr, kp->offset); } +NOKPROBE_SYMBOL(dump_kprobe); + +/* + * Lookup and populate the kprobe_blacklist. + * + * Unlike the kretprobe blacklist, we'll need to determine + * the range of addresses that belong to the said functions, + * since a kprobe need not necessarily be at the beginning + * of a function. + */ +static int __init populate_kprobe_blacklist(unsigned long *start, + unsigned long *end) +{ + unsigned long *iter; + struct kprobe_blacklist_entry *ent; + unsigned long entry, offset = 0, size = 0; + + for (iter = start; iter < end; iter++) { + entry = arch_deref_entry_point((void *)*iter); + + if (!kernel_text_address(entry) || + !kallsyms_lookup_size_offset(entry, &size, &offset)) { + pr_err("Failed to find blacklist at %p\n", + (void *)entry); + continue; + } + + ent = kmalloc(sizeof(*ent), GFP_KERNEL); + if (!ent) + return -ENOMEM; + ent->start_addr = entry; + ent->end_addr = entry + size; + INIT_LIST_HEAD(&ent->list); + list_add_tail(&ent->list, &kprobe_blacklist); + } + return 0; +} /* Module notifier call back, checking kprobes on the module */ -static int __kprobes kprobes_module_callback(struct notifier_block *nb, - unsigned long val, void *data) +static int kprobes_module_callback(struct notifier_block *nb, + unsigned long val, void *data) { struct module *mod = data; struct hlist_head *head; @@ -2062,14 +2103,13 @@ static struct notifier_block kprobe_module_nb = { .priority = 0 }; +/* Markers of _kprobe_blacklist section */ +extern unsigned long __start_kprobe_blacklist[]; +extern unsigned long __stop_kprobe_blacklist[]; + static int __init init_kprobes(void) { int i, err = 0; - unsigned long offset = 0, size = 0; - char *modname, namebuf[KSYM_NAME_LEN]; - const char *symbol_name; - void *addr; - struct kprobe_blackpoint *kb; /* FIXME allocate the probe table, currently defined statically */ /* initialize all list heads */ @@ -2079,26 +2119,11 @@ static int __init init_kprobes(void) raw_spin_lock_init(&(kretprobe_table_locks[i].lock)); } - /* - * Lookup and populate the kprobe_blacklist. - * - * Unlike the kretprobe blacklist, we'll need to determine - * the range of addresses that belong to the said functions, - * since a kprobe need not necessarily be at the beginning - * of a function. - */ - for (kb = kprobe_blacklist; kb->name != NULL; kb++) { - kprobe_lookup_name(kb->name, addr); - if (!addr) - continue; - - kb->start_addr = (unsigned long)addr; - symbol_name = kallsyms_lookup(kb->start_addr, - &size, &offset, &modname, namebuf); - if (!symbol_name) - kb->range = 0; - else - kb->range = size; + err = populate_kprobe_blacklist(__start_kprobe_blacklist, + __stop_kprobe_blacklist); + if (err) { + pr_err("kprobes: failed to populate blacklist: %d\n", err); + pr_err("Please take care of using kprobes.\n"); } if (kretprobe_blacklist_size) { @@ -2138,7 +2163,7 @@ static int __init init_kprobes(void) } #ifdef CONFIG_DEBUG_FS -static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p, +static void report_probe(struct seq_file *pi, struct kprobe *p, const char *sym, int offset, char *modname, struct kprobe *pp) { char *kprobe_type; @@ -2167,12 +2192,12 @@ static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p, (kprobe_ftrace(pp) ? "[FTRACE]" : "")); } -static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos) +static void *kprobe_seq_start(struct seq_file *f, loff_t *pos) { return (*pos < KPROBE_TABLE_SIZE) ? pos : NULL; } -static void __kprobes *kprobe_seq_next(struct seq_file *f, void *v, loff_t *pos) +static void *kprobe_seq_next(struct seq_file *f, void *v, loff_t *pos) { (*pos)++; if (*pos >= KPROBE_TABLE_SIZE) @@ -2180,12 +2205,12 @@ static void __kprobes *kprobe_seq_next(struct seq_file *f, void *v, loff_t *pos) return pos; } -static void __kprobes kprobe_seq_stop(struct seq_file *f, void *v) +static void kprobe_seq_stop(struct seq_file *f, void *v) { /* Nothing to do */ } -static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v) +static int show_kprobe_addr(struct seq_file *pi, void *v) { struct hlist_head *head; struct kprobe *p, *kp; @@ -2216,7 +2241,7 @@ static const struct seq_operations kprobes_seq_ops = { .show = show_kprobe_addr }; -static int __kprobes kprobes_open(struct inode *inode, struct file *filp) +static int kprobes_open(struct inode *inode, struct file *filp) { return seq_open(filp, &kprobes_seq_ops); } @@ -2228,7 +2253,47 @@ static const struct file_operations debugfs_kprobes_operations = { .release = seq_release, }; -static void __kprobes arm_all_kprobes(void) +/* kprobes/blacklist -- shows which functions can not be probed */ +static void *kprobe_blacklist_seq_start(struct seq_file *m, loff_t *pos) +{ + return seq_list_start(&kprobe_blacklist, *pos); +} + +static void *kprobe_blacklist_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + return seq_list_next(v, &kprobe_blacklist, pos); +} + +static int kprobe_blacklist_seq_show(struct seq_file *m, void *v) +{ + struct kprobe_blacklist_entry *ent = + list_entry(v, struct kprobe_blacklist_entry, list); + + seq_printf(m, "0x%p-0x%p\t%ps\n", (void *)ent->start_addr, + (void *)ent->end_addr, (void *)ent->start_addr); + return 0; +} + +static const struct seq_operations kprobe_blacklist_seq_ops = { + .start = kprobe_blacklist_seq_start, + .next = kprobe_blacklist_seq_next, + .stop = kprobe_seq_stop, /* Reuse void function */ + .show = kprobe_blacklist_seq_show, +}; + +static int kprobe_blacklist_open(struct inode *inode, struct file *filp) +{ + return seq_open(filp, &kprobe_blacklist_seq_ops); +} + +static const struct file_operations debugfs_kprobe_blacklist_ops = { + .open = kprobe_blacklist_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static void arm_all_kprobes(void) { struct hlist_head *head; struct kprobe *p; @@ -2256,7 +2321,7 @@ already_enabled: return; } -static void __kprobes disarm_all_kprobes(void) +static void disarm_all_kprobes(void) { struct hlist_head *head; struct kprobe *p; @@ -2340,7 +2405,7 @@ static const struct file_operations fops_kp = { .llseek = default_llseek, }; -static int __kprobes debugfs_kprobe_init(void) +static int __init debugfs_kprobe_init(void) { struct dentry *dir, *file; unsigned int value = 1; @@ -2351,19 +2416,24 @@ static int __kprobes debugfs_kprobe_init(void) file = debugfs_create_file("list", 0444, dir, NULL, &debugfs_kprobes_operations); - if (!file) { - debugfs_remove(dir); - return -ENOMEM; - } + if (!file) + goto error; file = debugfs_create_file("enabled", 0600, dir, &value, &fops_kp); - if (!file) { - debugfs_remove(dir); - return -ENOMEM; - } + if (!file) + goto error; + + file = debugfs_create_file("blacklist", 0444, dir, NULL, + &debugfs_kprobe_blacklist_ops); + if (!file) + goto error; return 0; + +error: + debugfs_remove(dir); + return -ENOMEM; } late_initcall(debugfs_kprobe_init); diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index d945a949760..6683ccef9ff 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -18,6 +18,9 @@ #include <linux/stat.h> #include <linux/sched.h> #include <linux/capability.h> +#include <linux/compiler.h> + +#include <linux/rcupdate.h> /* rcu_expedited */ #define KERNEL_ATTR_RO(_name) \ static struct kobj_attribute _name##_attr = __ATTR_RO(_name) @@ -34,6 +37,7 @@ static ssize_t uevent_seqnum_show(struct kobject *kobj, } KERNEL_ATTR_RO(uevent_seqnum); +#ifdef CONFIG_UEVENT_HELPER /* uevent helper program, used during early boot */ static ssize_t uevent_helper_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @@ -53,7 +57,7 @@ static ssize_t uevent_helper_store(struct kobject *kobj, return count; } KERNEL_ATTR_RW(uevent_helper); - +#endif #ifdef CONFIG_PROFILING static ssize_t profiling_show(struct kobject *kobj, @@ -160,8 +164,8 @@ KERNEL_ATTR_RW(rcu_expedited); /* * Make /sys/kernel/notes give the raw contents of our kernel .notes section. */ -extern const void __start_notes __attribute__((weak)); -extern const void __stop_notes __attribute__((weak)); +extern const void __start_notes __weak; +extern const void __stop_notes __weak; #define notes_size (&__stop_notes - &__start_notes) static ssize_t notes_read(struct file *filp, struct kobject *kobj, @@ -186,7 +190,9 @@ EXPORT_SYMBOL_GPL(kernel_kobj); static struct attribute * kernel_attrs[] = { &fscaps_attr.attr, &uevent_seqnum_attr.attr, +#ifdef CONFIG_UEVENT_HELPER &uevent_helper_attr.attr, +#endif #ifdef CONFIG_PROFILING &profiling_attr.attr, #endif diff --git a/kernel/kthread.c b/kernel/kthread.c index b5ae3ee860a..c2390f41307 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -217,7 +217,7 @@ int tsk_fork_get_node(struct task_struct *tsk) if (tsk == kthreadd_task) return tsk->pref_node_fork; #endif - return numa_node_id(); + return NUMA_NO_NODE; } static void create_kthread(struct kthread_create_info *create) @@ -262,7 +262,7 @@ static void create_kthread(struct kthread_create_info *create) * kthread_stop() has been called). The return value should be zero * or a negative error number; it will be passed to kthread_stop(). * - * Returns a task_struct or ERR_PTR(-ENOMEM). + * Returns a task_struct or ERR_PTR(-ENOMEM) or ERR_PTR(-EINTR). */ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), void *data, int node, @@ -298,7 +298,7 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), * that thread. */ if (xchg(&create->done, NULL)) - return ERR_PTR(-ENOMEM); + return ERR_PTR(-EINTR); /* * kthreadd (or new kernel thread) will call complete() * shortly. @@ -369,7 +369,7 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), { struct task_struct *p; - p = kthread_create_on_node(threadfn, data, cpu_to_node(cpu), namefmt, + p = kthread_create_on_node(threadfn, data, cpu_to_mem(cpu), namefmt, cpu); if (IS_ERR(p)) return p; diff --git a/kernel/latencytop.c b/kernel/latencytop.c index a462b317f9a..a02812743a7 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -88,7 +88,8 @@ static void clear_global_latency_tracing(void) } static void __sched -account_global_scheduler_latency(struct task_struct *tsk, struct latency_record *lat) +account_global_scheduler_latency(struct task_struct *tsk, + struct latency_record *lat) { int firstnonnull = MAXLR + 1; int i; @@ -255,7 +256,7 @@ static int lstats_show(struct seq_file *m, void *v) break; seq_printf(m, " %ps", (void *)bt); } - seq_printf(m, "\n"); + seq_puts(m, "\n"); } } return 0; diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index baab8e5e7f6..8541bfdfd23 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -1,5 +1,5 @@ -obj-y += mutex.o semaphore.o rwsem.o lglock.o +obj-y += mutex.o semaphore.o rwsem.o mcs_spinlock.o ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_lockdep.o = -pg @@ -14,6 +14,7 @@ ifeq ($(CONFIG_PROC_FS),y) obj-$(CONFIG_LOCKDEP) += lockdep_proc.o endif obj-$(CONFIG_SMP) += spinlock.o +obj-$(CONFIG_SMP) += lglock.o obj-$(CONFIG_PROVE_LOCKING) += spinlock.o obj-$(CONFIG_RT_MUTEXES) += rtmutex.o obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o @@ -23,3 +24,5 @@ obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o obj-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o +obj-$(CONFIG_QUEUE_RWLOCK) += qrwlock.o +obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index eb8a54783fa..d24e4339b46 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1936,12 +1936,12 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next) for (;;) { int distance = curr->lockdep_depth - depth + 1; - hlock = curr->held_locks + depth-1; + hlock = curr->held_locks + depth - 1; /* * Only non-recursive-read entries get new dependencies * added: */ - if (hlock->read != 2) { + if (hlock->read != 2 && hlock->check) { if (!check_prev_add(curr, hlock, next, distance, trylock_loop)) return 0; @@ -2098,7 +2098,7 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock, * (If lookup_chain_cache() returns with 1 it acquires * graph_lock for us) */ - if (!hlock->trylock && (hlock->check == 2) && + if (!hlock->trylock && hlock->check && lookup_chain_cache(curr, hlock, chain_key)) { /* * Check whether last held lock: @@ -2517,7 +2517,7 @@ mark_held_locks(struct task_struct *curr, enum mark_type mark) BUG_ON(usage_bit >= LOCK_USAGE_STATES); - if (hlock_class(hlock)->key == __lockdep_no_validate__.subkeys) + if (!hlock->check) continue; if (!mark_lock(curr, hlock, usage_bit)) @@ -2557,7 +2557,7 @@ static void __trace_hardirqs_on_caller(unsigned long ip) debug_atomic_inc(hardirqs_on_events); } -void trace_hardirqs_on_caller(unsigned long ip) +__visible void trace_hardirqs_on_caller(unsigned long ip) { time_hardirqs_on(CALLER_ADDR0, ip); @@ -2610,7 +2610,7 @@ EXPORT_SYMBOL(trace_hardirqs_on); /* * Hardirqs were disabled: */ -void trace_hardirqs_off_caller(unsigned long ip) +__visible void trace_hardirqs_off_caller(unsigned long ip) { struct task_struct *curr = current; @@ -3055,9 +3055,6 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, int class_idx; u64 chain_key; - if (!prove_locking) - check = 1; - if (unlikely(!debug_locks)) return 0; @@ -3069,8 +3066,8 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return 0; - if (lock->key == &__lockdep_no_validate__) - check = 1; + if (!prove_locking || lock->key == &__lockdep_no_validate__) + check = 0; if (subclass < NR_LOCKDEP_CACHING_CLASSES) class = lock->class_cache[subclass]; @@ -3138,7 +3135,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, hlock->holdtime_stamp = lockstat_clock(); #endif - if (check == 2 && !mark_irqflags(curr, hlock)) + if (check && !mark_irqflags(curr, hlock)) return 0; /* mark it as used: */ @@ -4191,7 +4188,7 @@ void debug_show_held_locks(struct task_struct *task) } EXPORT_SYMBOL_GPL(debug_show_held_locks); -void lockdep_sys_exit(void) +asmlinkage __visible void lockdep_sys_exit(void) { struct task_struct *curr = current; diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index 4f560cfedc8..51c4b24b632 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h @@ -54,9 +54,9 @@ enum { * table (if it's not there yet), and we check it for lock order * conflicts and deadlocks. */ -#define MAX_LOCKDEP_ENTRIES 16384UL +#define MAX_LOCKDEP_ENTRIES 32768UL -#define MAX_LOCKDEP_CHAINS_BITS 15 +#define MAX_LOCKDEP_CHAINS_BITS 16 #define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) #define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5) @@ -65,7 +65,7 @@ enum { * Stack-trace: tightly packed array of stack backtrace * addresses. Protected by the hash_lock. */ -#define MAX_STACK_TRACE_ENTRIES 262144UL +#define MAX_STACK_TRACE_ENTRIES 524288UL extern struct list_head all_lock_classes; extern struct lock_chain lock_chains[]; diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c new file mode 100644 index 00000000000..0955b885d0d --- /dev/null +++ b/kernel/locking/locktorture.c @@ -0,0 +1,454 @@ +/* + * Module-based torture test facility for locking + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright (C) IBM Corporation, 2014 + * + * Author: Paul E. McKenney <paulmck@us.ibm.com> + * Based on kernel/rcu/torture.c. + */ +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/kthread.h> +#include <linux/err.h> +#include <linux/spinlock.h> +#include <linux/smp.h> +#include <linux/interrupt.h> +#include <linux/sched.h> +#include <linux/atomic.h> +#include <linux/bitops.h> +#include <linux/completion.h> +#include <linux/moduleparam.h> +#include <linux/percpu.h> +#include <linux/notifier.h> +#include <linux/reboot.h> +#include <linux/freezer.h> +#include <linux/cpu.h> +#include <linux/delay.h> +#include <linux/stat.h> +#include <linux/slab.h> +#include <linux/trace_clock.h> +#include <asm/byteorder.h> +#include <linux/torture.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com>"); + +torture_param(int, nwriters_stress, -1, + "Number of write-locking stress-test threads"); +torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)"); +torture_param(int, onoff_interval, 0, + "Time between CPU hotplugs (s), 0=disable"); +torture_param(int, shuffle_interval, 3, + "Number of jiffies between shuffles, 0=disable"); +torture_param(int, shutdown_secs, 0, "Shutdown time (j), <= zero to disable."); +torture_param(int, stat_interval, 60, + "Number of seconds between stats printk()s"); +torture_param(int, stutter, 5, "Number of jiffies to run/halt test, 0=disable"); +torture_param(bool, verbose, true, + "Enable verbose debugging printk()s"); + +static char *torture_type = "spin_lock"; +module_param(torture_type, charp, 0444); +MODULE_PARM_DESC(torture_type, + "Type of lock to torture (spin_lock, spin_lock_irq, ...)"); + +static atomic_t n_lock_torture_errors; + +static struct task_struct *stats_task; +static struct task_struct **writer_tasks; + +static int nrealwriters_stress; +static bool lock_is_write_held; + +struct lock_writer_stress_stats { + long n_write_lock_fail; + long n_write_lock_acquired; +}; +static struct lock_writer_stress_stats *lwsa; + +#if defined(MODULE) +#define LOCKTORTURE_RUNNABLE_INIT 1 +#else +#define LOCKTORTURE_RUNNABLE_INIT 0 +#endif +int locktorture_runnable = LOCKTORTURE_RUNNABLE_INIT; +module_param(locktorture_runnable, int, 0444); +MODULE_PARM_DESC(locktorture_runnable, "Start locktorture at module init"); + +/* Forward reference. */ +static void lock_torture_cleanup(void); + +/* + * Operations vector for selecting different types of tests. + */ +struct lock_torture_ops { + void (*init)(void); + int (*writelock)(void); + void (*write_delay)(struct torture_random_state *trsp); + void (*writeunlock)(void); + unsigned long flags; + const char *name; +}; + +static struct lock_torture_ops *cur_ops; + +/* + * Definitions for lock torture testing. + */ + +static int torture_lock_busted_write_lock(void) +{ + return 0; /* BUGGY, do not use in real life!!! */ +} + +static void torture_lock_busted_write_delay(struct torture_random_state *trsp) +{ + const unsigned long longdelay_us = 100; + + /* We want a long delay occasionally to force massive contention. */ + if (!(torture_random(trsp) % + (nrealwriters_stress * 2000 * longdelay_us))) + mdelay(longdelay_us); +#ifdef CONFIG_PREEMPT + if (!(torture_random(trsp) % (nrealwriters_stress * 20000))) + preempt_schedule(); /* Allow test to be preempted. */ +#endif +} + +static void torture_lock_busted_write_unlock(void) +{ + /* BUGGY, do not use in real life!!! */ +} + +static struct lock_torture_ops lock_busted_ops = { + .writelock = torture_lock_busted_write_lock, + .write_delay = torture_lock_busted_write_delay, + .writeunlock = torture_lock_busted_write_unlock, + .name = "lock_busted" +}; + +static DEFINE_SPINLOCK(torture_spinlock); + +static int torture_spin_lock_write_lock(void) __acquires(torture_spinlock) +{ + spin_lock(&torture_spinlock); + return 0; +} + +static void torture_spin_lock_write_delay(struct torture_random_state *trsp) +{ + const unsigned long shortdelay_us = 2; + const unsigned long longdelay_us = 100; + + /* We want a short delay mostly to emulate likely code, and + * we want a long delay occasionally to force massive contention. + */ + if (!(torture_random(trsp) % + (nrealwriters_stress * 2000 * longdelay_us))) + mdelay(longdelay_us); + if (!(torture_random(trsp) % + (nrealwriters_stress * 2 * shortdelay_us))) + udelay(shortdelay_us); +#ifdef CONFIG_PREEMPT + if (!(torture_random(trsp) % (nrealwriters_stress * 20000))) + preempt_schedule(); /* Allow test to be preempted. */ +#endif +} + +static void torture_spin_lock_write_unlock(void) __releases(torture_spinlock) +{ + spin_unlock(&torture_spinlock); +} + +static struct lock_torture_ops spin_lock_ops = { + .writelock = torture_spin_lock_write_lock, + .write_delay = torture_spin_lock_write_delay, + .writeunlock = torture_spin_lock_write_unlock, + .name = "spin_lock" +}; + +static int torture_spin_lock_write_lock_irq(void) +__acquires(torture_spinlock_irq) +{ + unsigned long flags; + + spin_lock_irqsave(&torture_spinlock, flags); + cur_ops->flags = flags; + return 0; +} + +static void torture_lock_spin_write_unlock_irq(void) +__releases(torture_spinlock) +{ + spin_unlock_irqrestore(&torture_spinlock, cur_ops->flags); +} + +static struct lock_torture_ops spin_lock_irq_ops = { + .writelock = torture_spin_lock_write_lock_irq, + .write_delay = torture_spin_lock_write_delay, + .writeunlock = torture_lock_spin_write_unlock_irq, + .name = "spin_lock_irq" +}; + +/* + * Lock torture writer kthread. Repeatedly acquires and releases + * the lock, checking for duplicate acquisitions. + */ +static int lock_torture_writer(void *arg) +{ + struct lock_writer_stress_stats *lwsp = arg; + static DEFINE_TORTURE_RANDOM(rand); + + VERBOSE_TOROUT_STRING("lock_torture_writer task started"); + set_user_nice(current, MAX_NICE); + + do { + if ((torture_random(&rand) & 0xfffff) == 0) + schedule_timeout_uninterruptible(1); + cur_ops->writelock(); + if (WARN_ON_ONCE(lock_is_write_held)) + lwsp->n_write_lock_fail++; + lock_is_write_held = 1; + lwsp->n_write_lock_acquired++; + cur_ops->write_delay(&rand); + lock_is_write_held = 0; + cur_ops->writeunlock(); + stutter_wait("lock_torture_writer"); + } while (!torture_must_stop()); + torture_kthread_stopping("lock_torture_writer"); + return 0; +} + +/* + * Create an lock-torture-statistics message in the specified buffer. + */ +static void lock_torture_printk(char *page) +{ + bool fail = 0; + int i; + long max = 0; + long min = lwsa[0].n_write_lock_acquired; + long long sum = 0; + + for (i = 0; i < nrealwriters_stress; i++) { + if (lwsa[i].n_write_lock_fail) + fail = true; + sum += lwsa[i].n_write_lock_acquired; + if (max < lwsa[i].n_write_lock_fail) + max = lwsa[i].n_write_lock_fail; + if (min > lwsa[i].n_write_lock_fail) + min = lwsa[i].n_write_lock_fail; + } + page += sprintf(page, "%s%s ", torture_type, TORTURE_FLAG); + page += sprintf(page, + "Writes: Total: %lld Max/Min: %ld/%ld %s Fail: %d %s\n", + sum, max, min, max / 2 > min ? "???" : "", + fail, fail ? "!!!" : ""); + if (fail) + atomic_inc(&n_lock_torture_errors); +} + +/* + * Print torture statistics. Caller must ensure that there is only one + * call to this function at a given time!!! This is normally accomplished + * by relying on the module system to only have one copy of the module + * loaded, and then by giving the lock_torture_stats kthread full control + * (or the init/cleanup functions when lock_torture_stats thread is not + * running). + */ +static void lock_torture_stats_print(void) +{ + int size = nrealwriters_stress * 200 + 8192; + char *buf; + + buf = kmalloc(size, GFP_KERNEL); + if (!buf) { + pr_err("lock_torture_stats_print: Out of memory, need: %d", + size); + return; + } + lock_torture_printk(buf); + pr_alert("%s", buf); + kfree(buf); +} + +/* + * Periodically prints torture statistics, if periodic statistics printing + * was specified via the stat_interval module parameter. + * + * No need to worry about fullstop here, since this one doesn't reference + * volatile state or register callbacks. + */ +static int lock_torture_stats(void *arg) +{ + VERBOSE_TOROUT_STRING("lock_torture_stats task started"); + do { + schedule_timeout_interruptible(stat_interval * HZ); + lock_torture_stats_print(); + torture_shutdown_absorb("lock_torture_stats"); + } while (!torture_must_stop()); + torture_kthread_stopping("lock_torture_stats"); + return 0; +} + +static inline void +lock_torture_print_module_parms(struct lock_torture_ops *cur_ops, + const char *tag) +{ + pr_alert("%s" TORTURE_FLAG + "--- %s: nwriters_stress=%d stat_interval=%d verbose=%d shuffle_interval=%d stutter=%d shutdown_secs=%d onoff_interval=%d onoff_holdoff=%d\n", + torture_type, tag, nrealwriters_stress, stat_interval, verbose, + shuffle_interval, stutter, shutdown_secs, + onoff_interval, onoff_holdoff); +} + +static void lock_torture_cleanup(void) +{ + int i; + + if (torture_cleanup()) + return; + + if (writer_tasks) { + for (i = 0; i < nrealwriters_stress; i++) + torture_stop_kthread(lock_torture_writer, + writer_tasks[i]); + kfree(writer_tasks); + writer_tasks = NULL; + } + + torture_stop_kthread(lock_torture_stats, stats_task); + lock_torture_stats_print(); /* -After- the stats thread is stopped! */ + + if (atomic_read(&n_lock_torture_errors)) + lock_torture_print_module_parms(cur_ops, + "End of test: FAILURE"); + else if (torture_onoff_failures()) + lock_torture_print_module_parms(cur_ops, + "End of test: LOCK_HOTPLUG"); + else + lock_torture_print_module_parms(cur_ops, + "End of test: SUCCESS"); +} + +static int __init lock_torture_init(void) +{ + int i; + int firsterr = 0; + static struct lock_torture_ops *torture_ops[] = { + &lock_busted_ops, &spin_lock_ops, &spin_lock_irq_ops, + }; + + if (!torture_init_begin(torture_type, verbose, &locktorture_runnable)) + return -EBUSY; + + /* Process args and tell the world that the torturer is on the job. */ + for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { + cur_ops = torture_ops[i]; + if (strcmp(torture_type, cur_ops->name) == 0) + break; + } + if (i == ARRAY_SIZE(torture_ops)) { + pr_alert("lock-torture: invalid torture type: \"%s\"\n", + torture_type); + pr_alert("lock-torture types:"); + for (i = 0; i < ARRAY_SIZE(torture_ops); i++) + pr_alert(" %s", torture_ops[i]->name); + pr_alert("\n"); + torture_init_end(); + return -EINVAL; + } + if (cur_ops->init) + cur_ops->init(); /* no "goto unwind" prior to this point!!! */ + + if (nwriters_stress >= 0) + nrealwriters_stress = nwriters_stress; + else + nrealwriters_stress = 2 * num_online_cpus(); + lock_torture_print_module_parms(cur_ops, "Start of test"); + + /* Initialize the statistics so that each run gets its own numbers. */ + + lock_is_write_held = 0; + lwsa = kmalloc(sizeof(*lwsa) * nrealwriters_stress, GFP_KERNEL); + if (lwsa == NULL) { + VERBOSE_TOROUT_STRING("lwsa: Out of memory"); + firsterr = -ENOMEM; + goto unwind; + } + for (i = 0; i < nrealwriters_stress; i++) { + lwsa[i].n_write_lock_fail = 0; + lwsa[i].n_write_lock_acquired = 0; + } + + /* Start up the kthreads. */ + + if (onoff_interval > 0) { + firsterr = torture_onoff_init(onoff_holdoff * HZ, + onoff_interval * HZ); + if (firsterr) + goto unwind; + } + if (shuffle_interval > 0) { + firsterr = torture_shuffle_init(shuffle_interval); + if (firsterr) + goto unwind; + } + if (shutdown_secs > 0) { + firsterr = torture_shutdown_init(shutdown_secs, + lock_torture_cleanup); + if (firsterr) + goto unwind; + } + if (stutter > 0) { + firsterr = torture_stutter_init(stutter); + if (firsterr) + goto unwind; + } + + writer_tasks = kzalloc(nrealwriters_stress * sizeof(writer_tasks[0]), + GFP_KERNEL); + if (writer_tasks == NULL) { + VERBOSE_TOROUT_ERRSTRING("writer_tasks: Out of memory"); + firsterr = -ENOMEM; + goto unwind; + } + for (i = 0; i < nrealwriters_stress; i++) { + firsterr = torture_create_kthread(lock_torture_writer, &lwsa[i], + writer_tasks[i]); + if (firsterr) + goto unwind; + } + if (stat_interval > 0) { + firsterr = torture_create_kthread(lock_torture_stats, NULL, + stats_task); + if (firsterr) + goto unwind; + } + torture_init_end(); + return 0; + +unwind: + torture_init_end(); + lock_torture_cleanup(); + return firsterr; +} + +module_init(lock_torture_init); +module_exit(lock_torture_cleanup); diff --git a/kernel/locking/mcs_spinlock.c b/kernel/locking/mcs_spinlock.c new file mode 100644 index 00000000000..be9ee1559fc --- /dev/null +++ b/kernel/locking/mcs_spinlock.c @@ -0,0 +1,210 @@ + +#include <linux/percpu.h> +#include <linux/mutex.h> +#include <linux/sched.h> +#include "mcs_spinlock.h" + +#ifdef CONFIG_SMP + +/* + * An MCS like lock especially tailored for optimistic spinning for sleeping + * lock implementations (mutex, rwsem, etc). + * + * Using a single mcs node per CPU is safe because sleeping locks should not be + * called from interrupt context and we have preemption disabled while + * spinning. + */ +static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_node, osq_node); + +/* + * We use the value 0 to represent "no CPU", thus the encoded value + * will be the CPU number incremented by 1. + */ +static inline int encode_cpu(int cpu_nr) +{ + return cpu_nr + 1; +} + +static inline struct optimistic_spin_node *decode_cpu(int encoded_cpu_val) +{ + int cpu_nr = encoded_cpu_val - 1; + + return per_cpu_ptr(&osq_node, cpu_nr); +} + +/* + * Get a stable @node->next pointer, either for unlock() or unqueue() purposes. + * Can return NULL in case we were the last queued and we updated @lock instead. + */ +static inline struct optimistic_spin_node * +osq_wait_next(struct optimistic_spin_queue *lock, + struct optimistic_spin_node *node, + struct optimistic_spin_node *prev) +{ + struct optimistic_spin_node *next = NULL; + int curr = encode_cpu(smp_processor_id()); + int old; + + /* + * If there is a prev node in queue, then the 'old' value will be + * the prev node's CPU #, else it's set to OSQ_UNLOCKED_VAL since if + * we're currently last in queue, then the queue will then become empty. + */ + old = prev ? prev->cpu : OSQ_UNLOCKED_VAL; + + for (;;) { + if (atomic_read(&lock->tail) == curr && + atomic_cmpxchg(&lock->tail, curr, old) == curr) { + /* + * We were the last queued, we moved @lock back. @prev + * will now observe @lock and will complete its + * unlock()/unqueue(). + */ + break; + } + + /* + * We must xchg() the @node->next value, because if we were to + * leave it in, a concurrent unlock()/unqueue() from + * @node->next might complete Step-A and think its @prev is + * still valid. + * + * If the concurrent unlock()/unqueue() wins the race, we'll + * wait for either @lock to point to us, through its Step-B, or + * wait for a new @node->next from its Step-C. + */ + if (node->next) { + next = xchg(&node->next, NULL); + if (next) + break; + } + + arch_mutex_cpu_relax(); + } + + return next; +} + +bool osq_lock(struct optimistic_spin_queue *lock) +{ + struct optimistic_spin_node *node = this_cpu_ptr(&osq_node); + struct optimistic_spin_node *prev, *next; + int curr = encode_cpu(smp_processor_id()); + int old; + + node->locked = 0; + node->next = NULL; + node->cpu = curr; + + old = atomic_xchg(&lock->tail, curr); + if (old == OSQ_UNLOCKED_VAL) + return true; + + prev = decode_cpu(old); + node->prev = prev; + ACCESS_ONCE(prev->next) = node; + + /* + * Normally @prev is untouchable after the above store; because at that + * moment unlock can proceed and wipe the node element from stack. + * + * However, since our nodes are static per-cpu storage, we're + * guaranteed their existence -- this allows us to apply + * cmpxchg in an attempt to undo our queueing. + */ + + while (!smp_load_acquire(&node->locked)) { + /* + * If we need to reschedule bail... so we can block. + */ + if (need_resched()) + goto unqueue; + + arch_mutex_cpu_relax(); + } + return true; + +unqueue: + /* + * Step - A -- stabilize @prev + * + * Undo our @prev->next assignment; this will make @prev's + * unlock()/unqueue() wait for a next pointer since @lock points to us + * (or later). + */ + + for (;;) { + if (prev->next == node && + cmpxchg(&prev->next, node, NULL) == node) + break; + + /* + * We can only fail the cmpxchg() racing against an unlock(), + * in which case we should observe @node->locked becomming + * true. + */ + if (smp_load_acquire(&node->locked)) + return true; + + arch_mutex_cpu_relax(); + + /* + * Or we race against a concurrent unqueue()'s step-B, in which + * case its step-C will write us a new @node->prev pointer. + */ + prev = ACCESS_ONCE(node->prev); + } + + /* + * Step - B -- stabilize @next + * + * Similar to unlock(), wait for @node->next or move @lock from @node + * back to @prev. + */ + + next = osq_wait_next(lock, node, prev); + if (!next) + return false; + + /* + * Step - C -- unlink + * + * @prev is stable because its still waiting for a new @prev->next + * pointer, @next is stable because our @node->next pointer is NULL and + * it will wait in Step-A. + */ + + ACCESS_ONCE(next->prev) = prev; + ACCESS_ONCE(prev->next) = next; + + return false; +} + +void osq_unlock(struct optimistic_spin_queue *lock) +{ + struct optimistic_spin_node *node, *next; + int curr = encode_cpu(smp_processor_id()); + + /* + * Fast path for the uncontended case. + */ + if (likely(atomic_cmpxchg(&lock->tail, curr, OSQ_UNLOCKED_VAL) == curr)) + return; + + /* + * Second most likely case. + */ + node = this_cpu_ptr(&osq_node); + next = xchg(&node->next, NULL); + if (next) { + ACCESS_ONCE(next->locked) = 1; + return; + } + + next = osq_wait_next(lock, node, NULL); + if (next) + ACCESS_ONCE(next->locked) = 1; +} + +#endif + diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h new file mode 100644 index 00000000000..74356dc0ce2 --- /dev/null +++ b/kernel/locking/mcs_spinlock.h @@ -0,0 +1,130 @@ +/* + * MCS lock defines + * + * This file contains the main data structure and API definitions of MCS lock. + * + * The MCS lock (proposed by Mellor-Crummey and Scott) is a simple spin-lock + * with the desirable properties of being fair, and with each cpu trying + * to acquire the lock spinning on a local variable. + * It avoids expensive cache bouncings that common test-and-set spin-lock + * implementations incur. + */ +#ifndef __LINUX_MCS_SPINLOCK_H +#define __LINUX_MCS_SPINLOCK_H + +#include <asm/mcs_spinlock.h> + +struct mcs_spinlock { + struct mcs_spinlock *next; + int locked; /* 1 if lock acquired */ +}; + +#ifndef arch_mcs_spin_lock_contended +/* + * Using smp_load_acquire() provides a memory barrier that ensures + * subsequent operations happen after the lock is acquired. + */ +#define arch_mcs_spin_lock_contended(l) \ +do { \ + while (!(smp_load_acquire(l))) \ + arch_mutex_cpu_relax(); \ +} while (0) +#endif + +#ifndef arch_mcs_spin_unlock_contended +/* + * smp_store_release() provides a memory barrier to ensure all + * operations in the critical section has been completed before + * unlocking. + */ +#define arch_mcs_spin_unlock_contended(l) \ + smp_store_release((l), 1) +#endif + +/* + * Note: the smp_load_acquire/smp_store_release pair is not + * sufficient to form a full memory barrier across + * cpus for many architectures (except x86) for mcs_unlock and mcs_lock. + * For applications that need a full barrier across multiple cpus + * with mcs_unlock and mcs_lock pair, smp_mb__after_unlock_lock() should be + * used after mcs_lock. + */ + +/* + * In order to acquire the lock, the caller should declare a local node and + * pass a reference of the node to this function in addition to the lock. + * If the lock has already been acquired, then this will proceed to spin + * on this node->locked until the previous lock holder sets the node->locked + * in mcs_spin_unlock(). + * + * We don't inline mcs_spin_lock() so that perf can correctly account for the + * time spent in this lock function. + */ +static inline +void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node) +{ + struct mcs_spinlock *prev; + + /* Init node */ + node->locked = 0; + node->next = NULL; + + prev = xchg(lock, node); + if (likely(prev == NULL)) { + /* + * Lock acquired, don't need to set node->locked to 1. Threads + * only spin on its own node->locked value for lock acquisition. + * However, since this thread can immediately acquire the lock + * and does not proceed to spin on its own node->locked, this + * value won't be used. If a debug mode is needed to + * audit lock status, then set node->locked value here. + */ + return; + } + ACCESS_ONCE(prev->next) = node; + + /* Wait until the lock holder passes the lock down. */ + arch_mcs_spin_lock_contended(&node->locked); +} + +/* + * Releases the lock. The caller should pass in the corresponding node that + * was used to acquire the lock. + */ +static inline +void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node) +{ + struct mcs_spinlock *next = ACCESS_ONCE(node->next); + + if (likely(!next)) { + /* + * Release the lock by setting it to NULL + */ + if (likely(cmpxchg(lock, node, NULL) == node)) + return; + /* Wait until the next pointer is set */ + while (!(next = ACCESS_ONCE(node->next))) + arch_mutex_cpu_relax(); + } + + /* Pass lock to next waiter. */ + arch_mcs_spin_unlock_contended(&next->locked); +} + +/* + * Cancellable version of the MCS lock above. + * + * Intended for adaptive spinning of sleeping locks: + * mutex_lock()/rwsem_down_{read,write}() etc. + */ + +struct optimistic_spin_node { + struct optimistic_spin_node *next, *prev; + int locked; /* 1 if lock acquired */ + int cpu; /* encoded CPU # value */ +}; + +extern bool osq_lock(struct optimistic_spin_queue *lock); +extern void osq_unlock(struct optimistic_spin_queue *lock); + +#endif /* __LINUX_MCS_SPINLOCK_H */ diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c index faf6f5b53e7..5cf6731b98e 100644 --- a/kernel/locking/mutex-debug.c +++ b/kernel/locking/mutex-debug.c @@ -71,18 +71,23 @@ void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, void debug_mutex_unlock(struct mutex *lock) { - if (unlikely(!debug_locks)) - return; + if (likely(debug_locks)) { + DEBUG_LOCKS_WARN_ON(lock->magic != lock); - DEBUG_LOCKS_WARN_ON(lock->magic != lock); + if (!lock->owner) + DEBUG_LOCKS_WARN_ON(!lock->owner); + else + DEBUG_LOCKS_WARN_ON(lock->owner != current); - if (!lock->owner) - DEBUG_LOCKS_WARN_ON(!lock->owner); - else - DEBUG_LOCKS_WARN_ON(lock->owner != current); + DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); + mutex_clear_owner(lock); + } - DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); - mutex_clear_owner(lock); + /* + * __mutex_slowpath_needs_to_unlock() is explicitly 0 for debug + * mutexes so that we can do it here after we've verified state. + */ + atomic_set(&lock->count, 1); } void debug_mutex_init(struct mutex *lock, const char *name, diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 4dd6e4c219d..acca2c1a3c5 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -25,6 +25,7 @@ #include <linux/spinlock.h> #include <linux/interrupt.h> #include <linux/debug_locks.h> +#include "mcs_spinlock.h" /* * In the DEBUG case we are using the "NULL fastpath" for mutexes, @@ -33,6 +34,13 @@ #ifdef CONFIG_DEBUG_MUTEXES # include "mutex-debug.h" # include <asm-generic/mutex-null.h> +/* + * Must be 0 for the debug case so we do not do the unlock outside of the + * wait_lock region. debug_mutex_unlock() will do the actual unlock in this + * case. + */ +# undef __mutex_slowpath_needs_to_unlock +# define __mutex_slowpath_needs_to_unlock() 0 #else # include "mutex.h" # include <asm/mutex.h> @@ -52,7 +60,7 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) INIT_LIST_HEAD(&lock->wait_list); mutex_clear_owner(lock); #ifdef CONFIG_MUTEX_SPIN_ON_OWNER - lock->spin_mlock = NULL; + osq_lock_init(&lock->osq); #endif debug_mutex_init(lock, name, key); @@ -67,8 +75,7 @@ EXPORT_SYMBOL(__mutex_init); * We also put the fastpath first in the kernel image, to make sure the * branch is predicted by the CPU as default-untaken. */ -static __used noinline void __sched -__mutex_lock_slowpath(atomic_t *lock_count); +__visible void __sched __mutex_lock_slowpath(atomic_t *lock_count); /** * mutex_lock - acquire the mutex @@ -111,54 +118,7 @@ EXPORT_SYMBOL(mutex_lock); * more or less simultaneously, the spinners need to acquire a MCS lock * first before spinning on the owner field. * - * We don't inline mspin_lock() so that perf can correctly account for the - * time spent in this lock function. */ -struct mspin_node { - struct mspin_node *next ; - int locked; /* 1 if lock acquired */ -}; -#define MLOCK(mutex) ((struct mspin_node **)&((mutex)->spin_mlock)) - -static noinline -void mspin_lock(struct mspin_node **lock, struct mspin_node *node) -{ - struct mspin_node *prev; - - /* Init node */ - node->locked = 0; - node->next = NULL; - - prev = xchg(lock, node); - if (likely(prev == NULL)) { - /* Lock acquired */ - node->locked = 1; - return; - } - ACCESS_ONCE(prev->next) = node; - smp_wmb(); - /* Wait until the lock holder passes the lock down */ - while (!ACCESS_ONCE(node->locked)) - arch_mutex_cpu_relax(); -} - -static void mspin_unlock(struct mspin_node **lock, struct mspin_node *node) -{ - struct mspin_node *next = ACCESS_ONCE(node->next); - - if (likely(!next)) { - /* - * Release the lock by setting it to NULL - */ - if (cmpxchg(lock, node, NULL) == node) - return; - /* Wait until the next pointer is set */ - while (!(next = ACCESS_ONCE(node->next))) - arch_mutex_cpu_relax(); - } - ACCESS_ONCE(next->locked) = 1; - smp_wmb(); -} /* * Mutex spinning code migrated from kernel/sched/core.c @@ -212,6 +172,9 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock) struct task_struct *owner; int retval = 1; + if (need_resched()) + return 0; + rcu_read_lock(); owner = ACCESS_ONCE(lock->owner); if (owner) @@ -225,7 +188,8 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock) } #endif -static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count); +__visible __used noinline +void __sched __mutex_unlock_slowpath(atomic_t *lock_count); /** * mutex_unlock - release the mutex @@ -446,9 +410,11 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, if (!mutex_can_spin_on_owner(lock)) goto slowpath; + if (!osq_lock(&lock->osq)) + goto slowpath; + for (;;) { struct task_struct *owner; - struct mspin_node node; if (use_ww_ctx && ww_ctx->acquired > 0) { struct ww_mutex *ww; @@ -463,19 +429,16 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, * performed the optimistic spinning cannot be done. */ if (ACCESS_ONCE(ww->ctx)) - goto slowpath; + break; } /* * If there's an owner, wait for it to either * release the lock or go to sleep. */ - mspin_lock(MLOCK(lock), &node); owner = ACCESS_ONCE(lock->owner); - if (owner && !mutex_spin_on_owner(lock, owner)) { - mspin_unlock(MLOCK(lock), &node); - goto slowpath; - } + if (owner && !mutex_spin_on_owner(lock, owner)) + break; if ((atomic_read(&lock->count) == 1) && (atomic_cmpxchg(&lock->count, 1, 0) == 1)) { @@ -488,11 +451,10 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, } mutex_set_owner(lock); - mspin_unlock(MLOCK(lock), &node); + osq_unlock(&lock->osq); preempt_enable(); return 0; } - mspin_unlock(MLOCK(lock), &node); /* * When there's no owner, we might have preempted between the @@ -501,7 +463,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, * the owner complete. */ if (!owner && (need_resched() || rt_task(task))) - goto slowpath; + break; /* * The cpu_relax() call is a compiler barrier which forces @@ -511,7 +473,15 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, */ arch_mutex_cpu_relax(); } + osq_unlock(&lock->osq); slowpath: + /* + * If we fell out of the spin path because of need_resched(), + * reschedule now, before we try-lock the mutex. This avoids getting + * scheduled out right after we obtained the mutex. + */ + if (need_resched()) + schedule_preempt_disabled(); #endif spin_lock_mutex(&lock->wait_lock, flags); @@ -717,10 +687,6 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested) struct mutex *lock = container_of(lock_count, struct mutex, count); unsigned long flags; - spin_lock_mutex(&lock->wait_lock, flags); - mutex_release(&lock->dep_map, nested, _RET_IP_); - debug_mutex_unlock(lock); - /* * some architectures leave the lock unlocked in the fastpath failure * case, others need to leave it locked. In the later case we have to @@ -729,6 +695,10 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested) if (__mutex_slowpath_needs_to_unlock()) atomic_set(&lock->count, 1); + spin_lock_mutex(&lock->wait_lock, flags); + mutex_release(&lock->dep_map, nested, _RET_IP_); + debug_mutex_unlock(lock); + if (!list_empty(&lock->wait_list)) { /* get the first entry from the wait-list: */ struct mutex_waiter *waiter = @@ -746,7 +716,7 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested) /* * Release the lock, slowpath: */ -static __used noinline void +__visible void __mutex_unlock_slowpath(atomic_t *lock_count) { __mutex_unlock_common_slowpath(lock_count, 1); @@ -803,7 +773,7 @@ int __sched mutex_lock_killable(struct mutex *lock) } EXPORT_SYMBOL(mutex_lock_killable); -static __used noinline void __sched +__visible void __sched __mutex_lock_slowpath(atomic_t *lock_count) { struct mutex *lock = container_of(lock_count, struct mutex, count); diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c new file mode 100644 index 00000000000..fb5b8ac411a --- /dev/null +++ b/kernel/locking/qrwlock.c @@ -0,0 +1,133 @@ +/* + * Queue read/write lock + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * (C) Copyright 2013-2014 Hewlett-Packard Development Company, L.P. + * + * Authors: Waiman Long <waiman.long@hp.com> + */ +#include <linux/smp.h> +#include <linux/bug.h> +#include <linux/cpumask.h> +#include <linux/percpu.h> +#include <linux/hardirq.h> +#include <linux/mutex.h> +#include <asm/qrwlock.h> + +/** + * rspin_until_writer_unlock - inc reader count & spin until writer is gone + * @lock : Pointer to queue rwlock structure + * @writer: Current queue rwlock writer status byte + * + * In interrupt context or at the head of the queue, the reader will just + * increment the reader count & wait until the writer releases the lock. + */ +static __always_inline void +rspin_until_writer_unlock(struct qrwlock *lock, u32 cnts) +{ + while ((cnts & _QW_WMASK) == _QW_LOCKED) { + arch_mutex_cpu_relax(); + cnts = smp_load_acquire((u32 *)&lock->cnts); + } +} + +/** + * queue_read_lock_slowpath - acquire read lock of a queue rwlock + * @lock: Pointer to queue rwlock structure + */ +void queue_read_lock_slowpath(struct qrwlock *lock) +{ + u32 cnts; + + /* + * Readers come here when they cannot get the lock without waiting + */ + if (unlikely(in_interrupt())) { + /* + * Readers in interrupt context will spin until the lock is + * available without waiting in the queue. + */ + cnts = smp_load_acquire((u32 *)&lock->cnts); + rspin_until_writer_unlock(lock, cnts); + return; + } + atomic_sub(_QR_BIAS, &lock->cnts); + + /* + * Put the reader into the wait queue + */ + arch_spin_lock(&lock->lock); + + /* + * At the head of the wait queue now, wait until the writer state + * goes to 0 and then try to increment the reader count and get + * the lock. It is possible that an incoming writer may steal the + * lock in the interim, so it is necessary to check the writer byte + * to make sure that the write lock isn't taken. + */ + while (atomic_read(&lock->cnts) & _QW_WMASK) + arch_mutex_cpu_relax(); + + cnts = atomic_add_return(_QR_BIAS, &lock->cnts) - _QR_BIAS; + rspin_until_writer_unlock(lock, cnts); + + /* + * Signal the next one in queue to become queue head + */ + arch_spin_unlock(&lock->lock); +} +EXPORT_SYMBOL(queue_read_lock_slowpath); + +/** + * queue_write_lock_slowpath - acquire write lock of a queue rwlock + * @lock : Pointer to queue rwlock structure + */ +void queue_write_lock_slowpath(struct qrwlock *lock) +{ + u32 cnts; + + /* Put the writer into the wait queue */ + arch_spin_lock(&lock->lock); + + /* Try to acquire the lock directly if no reader is present */ + if (!atomic_read(&lock->cnts) && + (atomic_cmpxchg(&lock->cnts, 0, _QW_LOCKED) == 0)) + goto unlock; + + /* + * Set the waiting flag to notify readers that a writer is pending, + * or wait for a previous writer to go away. + */ + for (;;) { + cnts = atomic_read(&lock->cnts); + if (!(cnts & _QW_WMASK) && + (atomic_cmpxchg(&lock->cnts, cnts, + cnts | _QW_WAITING) == cnts)) + break; + + arch_mutex_cpu_relax(); + } + + /* When no more readers, set the locked flag */ + for (;;) { + cnts = atomic_read(&lock->cnts); + if ((cnts == _QW_WAITING) && + (atomic_cmpxchg(&lock->cnts, _QW_WAITING, + _QW_LOCKED) == _QW_WAITING)) + break; + + arch_mutex_cpu_relax(); + } +unlock: + arch_spin_unlock(&lock->lock); +} +EXPORT_SYMBOL(queue_write_lock_slowpath); diff --git a/kernel/locking/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h index 14193d596d7..ab29b6a2266 100644 --- a/kernel/locking/rtmutex-debug.h +++ b/kernel/locking/rtmutex-debug.h @@ -31,3 +31,8 @@ static inline int debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter, { return (waiter != NULL); } + +static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w) +{ + debug_rt_mutex_print_deadlock(w); +} diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 2e960a2bab8..fc605941b9b 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -83,6 +83,47 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) owner = *p; } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner); } + +/* + * Safe fastpath aware unlock: + * 1) Clear the waiters bit + * 2) Drop lock->wait_lock + * 3) Try to unlock the lock with cmpxchg + */ +static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock) + __releases(lock->wait_lock) +{ + struct task_struct *owner = rt_mutex_owner(lock); + + clear_rt_mutex_waiters(lock); + raw_spin_unlock(&lock->wait_lock); + /* + * If a new waiter comes in between the unlock and the cmpxchg + * we have two situations: + * + * unlock(wait_lock); + * lock(wait_lock); + * cmpxchg(p, owner, 0) == owner + * mark_rt_mutex_waiters(lock); + * acquire(lock); + * or: + * + * unlock(wait_lock); + * lock(wait_lock); + * mark_rt_mutex_waiters(lock); + * + * cmpxchg(p, owner, 0) != owner + * enqueue_waiter(); + * unlock(wait_lock); + * lock(wait_lock); + * wake waiter(); + * unlock(wait_lock); + * lock(wait_lock); + * acquire(lock); + */ + return rt_mutex_cmpxchg(lock, owner, NULL); +} + #else # define rt_mutex_cmpxchg(l,c,n) (0) static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) @@ -90,6 +131,17 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) lock->owner = (struct task_struct *) ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS); } + +/* + * Simple slow path only version: lock->owner is protected by lock->wait_lock. + */ +static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock) + __releases(lock->wait_lock) +{ + lock->owner = NULL; + raw_spin_unlock(&lock->wait_lock); + return true; +} #endif static inline int @@ -213,6 +265,18 @@ struct task_struct *rt_mutex_get_top_task(struct task_struct *task) } /* + * Called by sched_setscheduler() to check whether the priority change + * is overruled by a possible priority boosting. + */ +int rt_mutex_check_prio(struct task_struct *task, int newprio) +{ + if (!task_has_pi_waiters(task)) + return 0; + + return task_top_pi_waiter(task)->task->prio <= newprio; +} + +/* * Adjust the priority of a task, after its pi_waiters got modified. * * This can be both boosting and unboosting. task->pi_lock must be held. @@ -248,27 +312,36 @@ static void rt_mutex_adjust_prio(struct task_struct *task) */ int max_lock_depth = 1024; +static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p) +{ + return p->pi_blocked_on ? p->pi_blocked_on->lock : NULL; +} + /* * Adjust the priority chain. Also used for deadlock detection. * Decreases task's usage by one - may thus free the task. * - * @task: the task owning the mutex (owner) for which a chain walk is probably - * needed + * @task: the task owning the mutex (owner) for which a chain walk is + * probably needed * @deadlock_detect: do we have to carry out deadlock detection? - * @orig_lock: the mutex (can be NULL if we are walking the chain to recheck - * things for a task that has just got its priority adjusted, and - * is waiting on a mutex) + * @orig_lock: the mutex (can be NULL if we are walking the chain to recheck + * things for a task that has just got its priority adjusted, and + * is waiting on a mutex) + * @next_lock: the mutex on which the owner of @orig_lock was blocked before + * we dropped its pi_lock. Is never dereferenced, only used for + * comparison to detect lock chain changes. * @orig_waiter: rt_mutex_waiter struct for the task that has just donated - * its priority to the mutex owner (can be NULL in the case - * depicted above or if the top waiter is gone away and we are - * actually deboosting the owner) - * @top_task: the current top waiter + * its priority to the mutex owner (can be NULL in the case + * depicted above or if the top waiter is gone away and we are + * actually deboosting the owner) + * @top_task: the current top waiter * * Returns 0 or -EDEADLK. */ static int rt_mutex_adjust_prio_chain(struct task_struct *task, int deadlock_detect, struct rt_mutex *orig_lock, + struct rt_mutex *next_lock, struct rt_mutex_waiter *orig_waiter, struct task_struct *top_task) { @@ -302,7 +375,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, } put_task_struct(task); - return deadlock_detect ? -EDEADLK : 0; + return -EDEADLK; } retry: /* @@ -327,13 +400,32 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, goto out_unlock_pi; /* + * We dropped all locks after taking a refcount on @task, so + * the task might have moved on in the lock chain or even left + * the chain completely and blocks now on an unrelated lock or + * on @orig_lock. + * + * We stored the lock on which @task was blocked in @next_lock, + * so we can detect the chain change. + */ + if (next_lock != waiter->lock) + goto out_unlock_pi; + + /* * Drop out, when the task has no waiters. Note, * top_waiter can be NULL, when we are in the deboosting * mode! */ - if (top_waiter && (!task_has_pi_waiters(task) || - top_waiter != task_top_pi_waiter(task))) - goto out_unlock_pi; + if (top_waiter) { + if (!task_has_pi_waiters(task)) + goto out_unlock_pi; + /* + * If deadlock detection is off, we stop here if we + * are not the top pi waiter of the task. + */ + if (!detect_deadlock && top_waiter != task_top_pi_waiter(task)) + goto out_unlock_pi; + } /* * When deadlock detection is off then we check, if further @@ -349,11 +441,16 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, goto retry; } - /* Deadlock detection */ + /* + * Deadlock detection. If the lock is the same as the original + * lock which caused us to walk the lock chain or if the + * current lock is owned by the task which initiated the chain + * walk, we detected a deadlock. + */ if (lock == orig_lock || rt_mutex_owner(lock) == top_task) { debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock); raw_spin_unlock(&lock->wait_lock); - ret = deadlock_detect ? -EDEADLK : 0; + ret = -EDEADLK; goto out_unlock_pi; } @@ -398,11 +495,26 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, __rt_mutex_adjust_prio(task); } + /* + * Check whether the task which owns the current lock is pi + * blocked itself. If yes we store a pointer to the lock for + * the lock chain change detection above. After we dropped + * task->pi_lock next_lock cannot be dereferenced anymore. + */ + next_lock = task_blocked_on_lock(task); + raw_spin_unlock_irqrestore(&task->pi_lock, flags); top_waiter = rt_mutex_top_waiter(lock); raw_spin_unlock(&lock->wait_lock); + /* + * We reached the end of the lock chain. Stop right here. No + * point to go back just to figure that out. + */ + if (!next_lock) + goto out_put_task; + if (!detect_deadlock && waiter != top_waiter) goto out_put_task; @@ -512,8 +624,21 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, { struct task_struct *owner = rt_mutex_owner(lock); struct rt_mutex_waiter *top_waiter = waiter; - unsigned long flags; + struct rt_mutex *next_lock; int chain_walk = 0, res; + unsigned long flags; + + /* + * Early deadlock detection. We really don't want the task to + * enqueue on itself just to untangle the mess later. It's not + * only an optimization. We drop the locks, so another waiter + * can come in before the chain walk detects the deadlock. So + * the other will detect the deadlock and return -EDEADLOCK, + * which is wrong, as the other waiter is not in a deadlock + * situation. + */ + if (owner == task) + return -EDEADLK; raw_spin_lock_irqsave(&task->pi_lock, flags); __rt_mutex_adjust_prio(task); @@ -533,20 +658,28 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, if (!owner) return 0; + raw_spin_lock_irqsave(&owner->pi_lock, flags); if (waiter == rt_mutex_top_waiter(lock)) { - raw_spin_lock_irqsave(&owner->pi_lock, flags); rt_mutex_dequeue_pi(owner, top_waiter); rt_mutex_enqueue_pi(owner, waiter); __rt_mutex_adjust_prio(owner); if (owner->pi_blocked_on) chain_walk = 1; - raw_spin_unlock_irqrestore(&owner->pi_lock, flags); - } - else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) + } else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) { chain_walk = 1; + } + + /* Store the lock on which owner is blocked or NULL */ + next_lock = task_blocked_on_lock(owner); - if (!chain_walk) + raw_spin_unlock_irqrestore(&owner->pi_lock, flags); + /* + * Even if full deadlock detection is on, if the owner is not + * blocked itself, we can avoid finding this out in the chain + * walk. + */ + if (!chain_walk || !next_lock) return 0; /* @@ -558,8 +691,8 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, raw_spin_unlock(&lock->wait_lock); - res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter, - task); + res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, + next_lock, waiter, task); raw_spin_lock(&lock->wait_lock); @@ -569,7 +702,8 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, /* * Wake up the next waiter on the lock. * - * Remove the top waiter from the current tasks waiter list and wake it up. + * Remove the top waiter from the current tasks pi waiter list and + * wake it up. * * Called with lock->wait_lock held. */ @@ -590,10 +724,23 @@ static void wakeup_next_waiter(struct rt_mutex *lock) */ rt_mutex_dequeue_pi(current, waiter); - rt_mutex_set_owner(lock, NULL); + /* + * As we are waking up the top waiter, and the waiter stays + * queued on the lock until it gets the lock, this lock + * obviously has waiters. Just set the bit here and this has + * the added benefit of forcing all new tasks into the + * slow path making sure no task of lower priority than + * the top waiter can steal this lock. + */ + lock->owner = (void *) RT_MUTEX_HAS_WAITERS; raw_spin_unlock_irqrestore(¤t->pi_lock, flags); + /* + * It's safe to dereference waiter as it cannot go away as + * long as we hold lock->wait_lock. The waiter task needs to + * acquire it in order to dequeue the waiter. + */ wake_up_process(waiter->task); } @@ -608,8 +755,8 @@ static void remove_waiter(struct rt_mutex *lock, { int first = (waiter == rt_mutex_top_waiter(lock)); struct task_struct *owner = rt_mutex_owner(lock); + struct rt_mutex *next_lock = NULL; unsigned long flags; - int chain_walk = 0; raw_spin_lock_irqsave(¤t->pi_lock, flags); rt_mutex_dequeue(lock, waiter); @@ -633,13 +780,13 @@ static void remove_waiter(struct rt_mutex *lock, } __rt_mutex_adjust_prio(owner); - if (owner->pi_blocked_on) - chain_walk = 1; + /* Store the lock on which owner is blocked or NULL */ + next_lock = task_blocked_on_lock(owner); raw_spin_unlock_irqrestore(&owner->pi_lock, flags); } - if (!chain_walk) + if (!next_lock) return; /* gets dropped in rt_mutex_adjust_prio_chain()! */ @@ -647,7 +794,7 @@ static void remove_waiter(struct rt_mutex *lock, raw_spin_unlock(&lock->wait_lock); - rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current); + rt_mutex_adjust_prio_chain(owner, 0, lock, next_lock, NULL, current); raw_spin_lock(&lock->wait_lock); } @@ -660,6 +807,7 @@ static void remove_waiter(struct rt_mutex *lock, void rt_mutex_adjust_pi(struct task_struct *task) { struct rt_mutex_waiter *waiter; + struct rt_mutex *next_lock; unsigned long flags; raw_spin_lock_irqsave(&task->pi_lock, flags); @@ -670,12 +818,13 @@ void rt_mutex_adjust_pi(struct task_struct *task) raw_spin_unlock_irqrestore(&task->pi_lock, flags); return; } - + next_lock = waiter->lock; raw_spin_unlock_irqrestore(&task->pi_lock, flags); /* gets dropped in rt_mutex_adjust_prio_chain()! */ get_task_struct(task); - rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task); + + rt_mutex_adjust_prio_chain(task, 0, NULL, next_lock, NULL, task); } /** @@ -727,6 +876,26 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state, return ret; } +static void rt_mutex_handle_deadlock(int res, int detect_deadlock, + struct rt_mutex_waiter *w) +{ + /* + * If the result is not -EDEADLOCK or the caller requested + * deadlock detection, nothing to do here. + */ + if (res != -EDEADLOCK || detect_deadlock) + return; + + /* + * Yell lowdly and stop the task right here. + */ + rt_mutex_print_deadlock(w); + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + } +} + /* * Slow path lock function: */ @@ -766,8 +935,10 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, set_current_state(TASK_RUNNING); - if (unlikely(ret)) + if (unlikely(ret)) { remove_waiter(lock, &waiter); + rt_mutex_handle_deadlock(ret, detect_deadlock, &waiter); + } /* * try_to_take_rt_mutex() sets the waiter bit @@ -823,12 +994,49 @@ rt_mutex_slowunlock(struct rt_mutex *lock) rt_mutex_deadlock_account_unlock(current); - if (!rt_mutex_has_waiters(lock)) { - lock->owner = NULL; - raw_spin_unlock(&lock->wait_lock); - return; + /* + * We must be careful here if the fast path is enabled. If we + * have no waiters queued we cannot set owner to NULL here + * because of: + * + * foo->lock->owner = NULL; + * rtmutex_lock(foo->lock); <- fast path + * free = atomic_dec_and_test(foo->refcnt); + * rtmutex_unlock(foo->lock); <- fast path + * if (free) + * kfree(foo); + * raw_spin_unlock(foo->lock->wait_lock); + * + * So for the fastpath enabled kernel: + * + * Nothing can set the waiters bit as long as we hold + * lock->wait_lock. So we do the following sequence: + * + * owner = rt_mutex_owner(lock); + * clear_rt_mutex_waiters(lock); + * raw_spin_unlock(&lock->wait_lock); + * if (cmpxchg(&lock->owner, owner, 0) == owner) + * return; + * goto retry; + * + * The fastpath disabled variant is simple as all access to + * lock->owner is serialized by lock->wait_lock: + * + * lock->owner = NULL; + * raw_spin_unlock(&lock->wait_lock); + */ + while (!rt_mutex_has_waiters(lock)) { + /* Drops lock->wait_lock ! */ + if (unlock_rt_mutex_safe(lock) == true) + return; + /* Relock the rtmutex and try again */ + raw_spin_lock(&lock->wait_lock); } + /* + * The wakeup next waiter path does not suffer from the above + * race. See the comments there. + */ wakeup_next_waiter(lock); raw_spin_unlock(&lock->wait_lock); @@ -1076,7 +1284,8 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, return 1; } - ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock); + /* We enforce deadlock detection for futexes */ + ret = task_blocks_on_rt_mutex(lock, waiter, task, 1); if (ret && !rt_mutex_owner(lock)) { /* diff --git a/kernel/locking/rtmutex.h b/kernel/locking/rtmutex.h index a1a1dd06421..f6a1f3c133b 100644 --- a/kernel/locking/rtmutex.h +++ b/kernel/locking/rtmutex.h @@ -24,3 +24,8 @@ #define debug_rt_mutex_print_deadlock(w) do { } while (0) #define debug_rt_mutex_detect_deadlock(w,d) (d) #define debug_rt_mutex_reset_waiter(w) do { } while (0) + +static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w) +{ + WARN(1, "rtmutex deadlock detected\n"); +} diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c index 9be8a914497..2c93571162c 100644 --- a/kernel/locking/rwsem-spinlock.c +++ b/kernel/locking/rwsem-spinlock.c @@ -26,7 +26,7 @@ int rwsem_is_locked(struct rw_semaphore *sem) unsigned long flags; if (raw_spin_trylock_irqsave(&sem->wait_lock, flags)) { - ret = (sem->activity != 0); + ret = (sem->count != 0); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); } return ret; @@ -46,7 +46,7 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name, debug_check_no_locks_freed((void *)sem, sizeof(*sem)); lockdep_init_map(&sem->dep_map, name, key, 0); #endif - sem->activity = 0; + sem->count = 0; raw_spin_lock_init(&sem->wait_lock); INIT_LIST_HEAD(&sem->wait_list); } @@ -95,7 +95,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wakewrite) waiter = list_entry(next, struct rwsem_waiter, list); } while (waiter->type != RWSEM_WAITING_FOR_WRITE); - sem->activity += woken; + sem->count += woken; out: return sem; @@ -126,9 +126,9 @@ void __sched __down_read(struct rw_semaphore *sem) raw_spin_lock_irqsave(&sem->wait_lock, flags); - if (sem->activity >= 0 && list_empty(&sem->wait_list)) { + if (sem->count >= 0 && list_empty(&sem->wait_list)) { /* granted */ - sem->activity++; + sem->count++; raw_spin_unlock_irqrestore(&sem->wait_lock, flags); goto out; } @@ -170,9 +170,9 @@ int __down_read_trylock(struct rw_semaphore *sem) raw_spin_lock_irqsave(&sem->wait_lock, flags); - if (sem->activity >= 0 && list_empty(&sem->wait_list)) { + if (sem->count >= 0 && list_empty(&sem->wait_list)) { /* granted */ - sem->activity++; + sem->count++; ret = 1; } @@ -206,7 +206,7 @@ void __sched __down_write_nested(struct rw_semaphore *sem, int subclass) * itself into sleep and waiting for system woke it or someone * else in the head of the wait list up. */ - if (sem->activity == 0) + if (sem->count == 0) break; set_task_state(tsk, TASK_UNINTERRUPTIBLE); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); @@ -214,7 +214,7 @@ void __sched __down_write_nested(struct rw_semaphore *sem, int subclass) raw_spin_lock_irqsave(&sem->wait_lock, flags); } /* got the lock */ - sem->activity = -1; + sem->count = -1; list_del(&waiter.list); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); @@ -235,9 +235,9 @@ int __down_write_trylock(struct rw_semaphore *sem) raw_spin_lock_irqsave(&sem->wait_lock, flags); - if (sem->activity == 0) { + if (sem->count == 0) { /* got the lock */ - sem->activity = -1; + sem->count = -1; ret = 1; } @@ -255,7 +255,7 @@ void __up_read(struct rw_semaphore *sem) raw_spin_lock_irqsave(&sem->wait_lock, flags); - if (--sem->activity == 0 && !list_empty(&sem->wait_list)) + if (--sem->count == 0 && !list_empty(&sem->wait_list)) sem = __rwsem_wake_one_writer(sem); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); @@ -270,7 +270,7 @@ void __up_write(struct rw_semaphore *sem) raw_spin_lock_irqsave(&sem->wait_lock, flags); - sem->activity = 0; + sem->count = 0; if (!list_empty(&sem->wait_list)) sem = __rwsem_do_wake(sem, 1); @@ -287,7 +287,7 @@ void __downgrade_write(struct rw_semaphore *sem) raw_spin_lock_irqsave(&sem->wait_lock, flags); - sem->activity = 1; + sem->count = 1; if (!list_empty(&sem->wait_list)) sem = __rwsem_do_wake(sem, 0); diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 19c5fa95e0b..a2391ac135c 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -5,11 +5,66 @@ * * Writer lock-stealing by Alex Shi <alex.shi@intel.com> * and Michel Lespinasse <walken@google.com> + * + * Optimistic spinning by Tim Chen <tim.c.chen@intel.com> + * and Davidlohr Bueso <davidlohr@hp.com>. Based on mutexes. */ #include <linux/rwsem.h> #include <linux/sched.h> #include <linux/init.h> #include <linux/export.h> +#include <linux/sched/rt.h> + +#include "mcs_spinlock.h" + +/* + * Guide to the rw_semaphore's count field for common values. + * (32-bit case illustrated, similar for 64-bit) + * + * 0x0000000X (1) X readers active or attempting lock, no writer waiting + * X = #active_readers + #readers attempting to lock + * (X*ACTIVE_BIAS) + * + * 0x00000000 rwsem is unlocked, and no one is waiting for the lock or + * attempting to read lock or write lock. + * + * 0xffff000X (1) X readers active or attempting lock, with waiters for lock + * X = #active readers + # readers attempting lock + * (X*ACTIVE_BIAS + WAITING_BIAS) + * (2) 1 writer attempting lock, no waiters for lock + * X-1 = #active readers + #readers attempting lock + * ((X-1)*ACTIVE_BIAS + ACTIVE_WRITE_BIAS) + * (3) 1 writer active, no waiters for lock + * X-1 = #active readers + #readers attempting lock + * ((X-1)*ACTIVE_BIAS + ACTIVE_WRITE_BIAS) + * + * 0xffff0001 (1) 1 reader active or attempting lock, waiters for lock + * (WAITING_BIAS + ACTIVE_BIAS) + * (2) 1 writer active or attempting lock, no waiters for lock + * (ACTIVE_WRITE_BIAS) + * + * 0xffff0000 (1) There are writers or readers queued but none active + * or in the process of attempting lock. + * (WAITING_BIAS) + * Note: writer can attempt to steal lock for this count by adding + * ACTIVE_WRITE_BIAS in cmpxchg and checking the old count + * + * 0xfffe0001 (1) 1 writer active, or attempting lock. Waiters on queue. + * (ACTIVE_WRITE_BIAS + WAITING_BIAS) + * + * Note: Readers attempt to lock by adding ACTIVE_BIAS in down_read and checking + * the count becomes more than 0 for successful lock acquisition, + * i.e. the case where there are only readers or nobody has lock. + * (1st and 2nd case above). + * + * Writers attempt to lock by adding ACTIVE_WRITE_BIAS in down_write and + * checking the count becomes ACTIVE_WRITE_BIAS for successful lock + * acquisition (i.e. nobody else has lock or attempts lock). If + * unsuccessful, in rwsem_down_write_failed, we'll check to see if there + * are only waiters but none active (5th case above), and attempt to + * steal the lock. + * + */ /* * Initialize an rwsem: @@ -27,6 +82,10 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name, sem->count = RWSEM_UNLOCKED_VALUE; raw_spin_lock_init(&sem->wait_lock); INIT_LIST_HEAD(&sem->wait_list); +#ifdef CONFIG_RWSEM_SPIN_ON_OWNER + sem->owner = NULL; + osq_lock_init(&sem->osq); +#endif } EXPORT_SYMBOL(__init_rwsem); @@ -141,8 +200,9 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) } /* - * wait for the read lock to be granted + * Wait for the read lock to be granted */ +__visible struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) { long count, adjustment = -RWSEM_ACTIVE_READ_BIAS; @@ -187,63 +247,221 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) return sem; } +static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) +{ + if (!(count & RWSEM_ACTIVE_MASK)) { + /* try acquiring the write lock */ + if (sem->count == RWSEM_WAITING_BIAS && + cmpxchg(&sem->count, RWSEM_WAITING_BIAS, + RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) { + if (!list_is_singular(&sem->wait_list)) + rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); + return true; + } + } + return false; +} + +#ifdef CONFIG_RWSEM_SPIN_ON_OWNER /* - * wait until we successfully acquire the write lock + * Try to acquire write lock before the writer has been put on wait queue. */ +static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) +{ + long old, count = ACCESS_ONCE(sem->count); + + while (true) { + if (!(count == 0 || count == RWSEM_WAITING_BIAS)) + return false; + + old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_WRITE_BIAS); + if (old == count) + return true; + + count = old; + } +} + +static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) +{ + struct task_struct *owner; + bool on_cpu = false; + + if (need_resched()) + return false; + + rcu_read_lock(); + owner = ACCESS_ONCE(sem->owner); + if (owner) + on_cpu = owner->on_cpu; + rcu_read_unlock(); + + /* + * If sem->owner is not set, yet we have just recently entered the + * slowpath, then there is a possibility reader(s) may have the lock. + * To be safe, avoid spinning in these situations. + */ + return on_cpu; +} + +static inline bool owner_running(struct rw_semaphore *sem, + struct task_struct *owner) +{ + if (sem->owner != owner) + return false; + + /* + * Ensure we emit the owner->on_cpu, dereference _after_ checking + * sem->owner still matches owner, if that fails, owner might + * point to free()d memory, if it still matches, the rcu_read_lock() + * ensures the memory stays valid. + */ + barrier(); + + return owner->on_cpu; +} + +static noinline +bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner) +{ + rcu_read_lock(); + while (owner_running(sem, owner)) { + if (need_resched()) + break; + + arch_mutex_cpu_relax(); + } + rcu_read_unlock(); + + /* + * We break out the loop above on need_resched() or when the + * owner changed, which is a sign for heavy contention. Return + * success only when sem->owner is NULL. + */ + return sem->owner == NULL; +} + +static bool rwsem_optimistic_spin(struct rw_semaphore *sem) +{ + struct task_struct *owner; + bool taken = false; + + preempt_disable(); + + /* sem->wait_lock should not be held when doing optimistic spinning */ + if (!rwsem_can_spin_on_owner(sem)) + goto done; + + if (!osq_lock(&sem->osq)) + goto done; + + while (true) { + owner = ACCESS_ONCE(sem->owner); + if (owner && !rwsem_spin_on_owner(sem, owner)) + break; + + /* wait_lock will be acquired if write_lock is obtained */ + if (rwsem_try_write_lock_unqueued(sem)) { + taken = true; + break; + } + + /* + * When there's no owner, we might have preempted between the + * owner acquiring the lock and setting the owner field. If + * we're an RT task that will live-lock because we won't let + * the owner complete. + */ + if (!owner && (need_resched() || rt_task(current))) + break; + + /* + * The cpu_relax() call is a compiler barrier which forces + * everything in this loop to be re-loaded. We don't need + * memory barriers as we'll eventually observe the right + * values at the cost of a few extra spins. + */ + arch_mutex_cpu_relax(); + } + osq_unlock(&sem->osq); +done: + preempt_enable(); + return taken; +} + +#else +static bool rwsem_optimistic_spin(struct rw_semaphore *sem) +{ + return false; +} +#endif + +/* + * Wait until we successfully acquire the write lock + */ +__visible struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) { - long count, adjustment = -RWSEM_ACTIVE_WRITE_BIAS; + long count; + bool waiting = true; /* any queued threads before us */ struct rwsem_waiter waiter; - struct task_struct *tsk = current; - /* set up my own style of waitqueue */ - waiter.task = tsk; + /* undo write bias from down_write operation, stop active locking */ + count = rwsem_atomic_update(-RWSEM_ACTIVE_WRITE_BIAS, sem); + + /* do optimistic spinning and steal lock if possible */ + if (rwsem_optimistic_spin(sem)) + return sem; + + /* + * Optimistic spinning failed, proceed to the slowpath + * and block until we can acquire the sem. + */ + waiter.task = current; waiter.type = RWSEM_WAITING_FOR_WRITE; raw_spin_lock_irq(&sem->wait_lock); + + /* account for this before adding a new element to the list */ if (list_empty(&sem->wait_list)) - adjustment += RWSEM_WAITING_BIAS; + waiting = false; + list_add_tail(&waiter.list, &sem->wait_list); /* we're now waiting on the lock, but no longer actively locking */ - count = rwsem_atomic_update(adjustment, sem); + if (waiting) { + count = ACCESS_ONCE(sem->count); + + /* + * If there were already threads queued before us and there are + * no active writers, the lock must be read owned; so we try to + * wake any read locks that were queued ahead of us. + */ + if (count > RWSEM_WAITING_BIAS) + sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS); - /* If there were already threads queued before us and there are no - * active writers, the lock must be read owned; so we try to wake - * any read locks that were queued ahead of us. */ - if (count > RWSEM_WAITING_BIAS && - adjustment == -RWSEM_ACTIVE_WRITE_BIAS) - sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS); + } else + count = rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); /* wait until we successfully acquire the lock */ - set_task_state(tsk, TASK_UNINTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE); while (true) { - if (!(count & RWSEM_ACTIVE_MASK)) { - /* Try acquiring the write lock. */ - count = RWSEM_ACTIVE_WRITE_BIAS; - if (!list_is_singular(&sem->wait_list)) - count += RWSEM_WAITING_BIAS; - - if (sem->count == RWSEM_WAITING_BIAS && - cmpxchg(&sem->count, RWSEM_WAITING_BIAS, count) == - RWSEM_WAITING_BIAS) - break; - } - + if (rwsem_try_write_lock(count, sem)) + break; raw_spin_unlock_irq(&sem->wait_lock); /* Block until there are no active lockers. */ do { schedule(); - set_task_state(tsk, TASK_UNINTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE); } while ((count = sem->count) & RWSEM_ACTIVE_MASK); raw_spin_lock_irq(&sem->wait_lock); } + __set_current_state(TASK_RUNNING); list_del(&waiter.list); raw_spin_unlock_irq(&sem->wait_lock); - tsk->state = TASK_RUNNING; return sem; } @@ -252,6 +470,7 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) * handle waking up a waiter on the semaphore * - up_read/up_write has decremented the active part of count if we come here */ +__visible struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) { unsigned long flags; @@ -272,6 +491,7 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) * - caller incremented waiting part of count and discovered it still negative * - just wake up any readers at the front of the queue */ +__visible struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) { unsigned long flags; diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index cfff1435bdf..e2d3bc7f03b 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -12,6 +12,27 @@ #include <linux/atomic.h> +#ifdef CONFIG_RWSEM_SPIN_ON_OWNER +static inline void rwsem_set_owner(struct rw_semaphore *sem) +{ + sem->owner = current; +} + +static inline void rwsem_clear_owner(struct rw_semaphore *sem) +{ + sem->owner = NULL; +} + +#else +static inline void rwsem_set_owner(struct rw_semaphore *sem) +{ +} + +static inline void rwsem_clear_owner(struct rw_semaphore *sem) +{ +} +#endif + /* * lock for reading */ @@ -48,6 +69,7 @@ void __sched down_write(struct rw_semaphore *sem) rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); LOCK_CONTENDED(sem, __down_write_trylock, __down_write); + rwsem_set_owner(sem); } EXPORT_SYMBOL(down_write); @@ -59,8 +81,11 @@ int down_write_trylock(struct rw_semaphore *sem) { int ret = __down_write_trylock(sem); - if (ret == 1) + if (ret == 1) { rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_); + rwsem_set_owner(sem); + } + return ret; } @@ -85,6 +110,7 @@ void up_write(struct rw_semaphore *sem) { rwsem_release(&sem->dep_map, 1, _RET_IP_); + rwsem_clear_owner(sem); __up_write(sem); } @@ -99,6 +125,7 @@ void downgrade_write(struct rw_semaphore *sem) * lockdep: a downgraded write will live on as a write * dependency. */ + rwsem_clear_owner(sem); __downgrade_write(sem); } @@ -122,6 +149,7 @@ void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest) rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_); LOCK_CONTENDED(sem, __down_write_trylock, __down_write); + rwsem_set_owner(sem); } EXPORT_SYMBOL(_down_write_nest_lock); @@ -141,6 +169,7 @@ void down_write_nested(struct rw_semaphore *sem, int subclass) rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); LOCK_CONTENDED(sem, __down_write_trylock, __down_write); + rwsem_set_owner(sem); } EXPORT_SYMBOL(down_write_nested); diff --git a/kernel/module.c b/kernel/module.c index d24fcf29cb6..81e727cf6df 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -640,7 +640,7 @@ static int module_unload_init(struct module *mod) INIT_LIST_HEAD(&mod->target_list); /* Hold reference count during initialization. */ - __this_cpu_write(mod->refptr->incs, 1); + raw_cpu_write(mod->refptr->incs, 1); return 0; } @@ -815,9 +815,6 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, return -EFAULT; name[MODULE_NAME_LEN-1] = '\0'; - if (!(flags & O_NONBLOCK)) - pr_warn("waiting module removal not supported: please upgrade\n"); - if (mutex_lock_interruptible(&module_mutex) != 0) return -EINTR; @@ -1013,9 +1010,11 @@ static size_t module_flags_taint(struct module *mod, char *buf) buf[l++] = 'F'; if (mod->taints & (1 << TAINT_CRAP)) buf[l++] = 'C'; + if (mod->taints & (1 << TAINT_UNSIGNED_MODULE)) + buf[l++] = 'E'; /* * TAINT_FORCED_RMMOD: could be added. - * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't + * TAINT_CPU_OUT_OF_SPEC, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't * apply to modules. */ return l; @@ -1948,6 +1947,10 @@ static int simplify_symbols(struct module *mod, const struct load_info *info) switch (sym[i].st_shndx) { case SHN_COMMON: + /* Ignore common symbols */ + if (!strncmp(name, "__gnu_lto", 9)) + break; + /* We compiled with -fno-common. These are not supposed to happen. */ pr_debug("Common symbol: %s\n", name); @@ -3017,21 +3020,6 @@ static int do_init_module(struct module *mod) */ current->flags &= ~PF_USED_ASYNC; - blocking_notifier_call_chain(&module_notify_list, - MODULE_STATE_COMING, mod); - - /* Set RO and NX regions for core */ - set_section_ro_nx(mod->module_core, - mod->core_text_size, - mod->core_ro_size, - mod->core_size); - - /* Set RO and NX regions for init */ - set_section_ro_nx(mod->module_init, - mod->init_text_size, - mod->init_ro_size, - mod->init_size); - do_mod_ctors(mod); /* Start the module */ if (mod->init != NULL) @@ -3162,9 +3150,26 @@ static int complete_formation(struct module *mod, struct load_info *info) /* This relies on module_mutex for list integrity. */ module_bug_finalize(info->hdr, info->sechdrs, mod); + /* Set RO and NX regions for core */ + set_section_ro_nx(mod->module_core, + mod->core_text_size, + mod->core_ro_size, + mod->core_size); + + /* Set RO and NX regions for init */ + set_section_ro_nx(mod->module_init, + mod->init_text_size, + mod->init_ro_size, + mod->init_size); + /* Mark state as coming so strong_try_module_get() ignores us, * but kallsyms etc. can see us. */ mod->state = MODULE_STATE_COMING; + mutex_unlock(&module_mutex); + + blocking_notifier_call_chain(&module_notify_list, + MODULE_STATE_COMING, mod); + return 0; out: mutex_unlock(&module_mutex); @@ -3187,6 +3192,7 @@ static int load_module(struct load_info *info, const char __user *uargs, { struct module *mod; long err; + char *after_dashes; err = module_sig_check(info); if (err) @@ -3214,7 +3220,7 @@ static int load_module(struct load_info *info, const char __user *uargs, pr_notice_once("%s: module verification failed: signature " "and/or required key missing - tainting " "kernel\n", mod->name); - add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_STILL_OK); + add_taint_module(mod, TAINT_UNSIGNED_MODULE, LOCKDEP_STILL_OK); } #endif @@ -3265,16 +3271,24 @@ static int load_module(struct load_info *info, const char __user *uargs, dynamic_debug_setup(info->debug, info->num_debug); + /* Ftrace init must be called in the MODULE_STATE_UNFORMED state */ + ftrace_module_init(mod); + /* Finally it's fully formed, ready to start executing. */ err = complete_formation(mod, info); if (err) goto ddebug_cleanup; /* Module is ready to execute: parsing args may do that. */ - err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, - -32768, 32767, unknown_module_param_cb); - if (err < 0) + after_dashes = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, + -32768, 32767, unknown_module_param_cb); + if (IS_ERR(after_dashes)) { + err = PTR_ERR(after_dashes); goto bug_cleanup; + } else if (after_dashes) { + pr_warn("%s: parameters '%s' after `--' ignored\n", + mod->name, after_dashes); + } /* Link in to syfs. */ err = mod_sysfs_setup(mod, info, mod->kp, mod->num_kp); @@ -3809,12 +3823,12 @@ void print_modules(void) list_for_each_entry_rcu(mod, &modules, list) { if (mod->state == MODULE_STATE_UNFORMED) continue; - printk(" %s%s", mod->name, module_flags(mod, buf)); + pr_cont(" %s%s", mod->name, module_flags(mod, buf)); } preempt_enable(); if (last_unloaded_module[0]) - printk(" [last unloaded: %s]", last_unloaded_module); - printk("\n"); + pr_cont(" [last unloaded: %s]", last_unloaded_module); + pr_cont("\n"); } #ifdef CONFIG_MODVERSIONS diff --git a/kernel/notifier.c b/kernel/notifier.c index 2d5cc4ccff7..4803da6eab6 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -71,9 +71,9 @@ static int notifier_chain_unregister(struct notifier_block **nl, * @returns: notifier_call_chain returns the value returned by the * last notifier function called. */ -static int __kprobes notifier_call_chain(struct notifier_block **nl, - unsigned long val, void *v, - int nr_to_call, int *nr_calls) +static int notifier_call_chain(struct notifier_block **nl, + unsigned long val, void *v, + int nr_to_call, int *nr_calls) { int ret = NOTIFY_DONE; struct notifier_block *nb, *next_nb; @@ -102,6 +102,7 @@ static int __kprobes notifier_call_chain(struct notifier_block **nl, } return ret; } +NOKPROBE_SYMBOL(notifier_call_chain); /* * Atomic notifier chain routines. Registration and unregistration @@ -172,9 +173,9 @@ EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister); * Otherwise the return value is the return value * of the last notifier function called. */ -int __kprobes __atomic_notifier_call_chain(struct atomic_notifier_head *nh, - unsigned long val, void *v, - int nr_to_call, int *nr_calls) +int __atomic_notifier_call_chain(struct atomic_notifier_head *nh, + unsigned long val, void *v, + int nr_to_call, int *nr_calls) { int ret; @@ -184,13 +185,15 @@ int __kprobes __atomic_notifier_call_chain(struct atomic_notifier_head *nh, return ret; } EXPORT_SYMBOL_GPL(__atomic_notifier_call_chain); +NOKPROBE_SYMBOL(__atomic_notifier_call_chain); -int __kprobes atomic_notifier_call_chain(struct atomic_notifier_head *nh, - unsigned long val, void *v) +int atomic_notifier_call_chain(struct atomic_notifier_head *nh, + unsigned long val, void *v) { return __atomic_notifier_call_chain(nh, val, v, -1, NULL); } EXPORT_SYMBOL_GPL(atomic_notifier_call_chain); +NOKPROBE_SYMBOL(atomic_notifier_call_chain); /* * Blocking notifier chain routines. All access to the chain is @@ -309,7 +312,7 @@ int __blocking_notifier_call_chain(struct blocking_notifier_head *nh, * racy then it does not matter what the result of the test * is, we re-check the list after having taken the lock anyway: */ - if (rcu_dereference_raw(nh->head)) { + if (rcu_access_pointer(nh->head)) { down_read(&nh->rwsem); ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls); @@ -527,7 +530,7 @@ EXPORT_SYMBOL_GPL(srcu_init_notifier_head); static ATOMIC_NOTIFIER_HEAD(die_chain); -int notrace __kprobes notify_die(enum die_val val, const char *str, +int notrace notify_die(enum die_val val, const char *str, struct pt_regs *regs, long err, int trap, int sig) { struct die_args args = { @@ -540,6 +543,7 @@ int notrace __kprobes notify_die(enum die_val val, const char *str, }; return atomic_notifier_call_chain(&die_chain, val, &args); } +NOKPROBE_SYMBOL(notify_die); int register_die_notifier(struct notifier_block *nb) { diff --git a/kernel/panic.c b/kernel/panic.c index 6d630037509..62e16cef9cc 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -32,6 +32,7 @@ static unsigned long tainted_mask; static int pause_on_oops; static int pause_on_oops_flag; static DEFINE_SPINLOCK(pause_on_oops_lock); +static bool crash_kexec_post_notifiers; int panic_timeout = CONFIG_PANIC_TIMEOUT; EXPORT_SYMBOL_GPL(panic_timeout); @@ -100,7 +101,7 @@ void panic(const char *fmt, ...) va_start(args, fmt); vsnprintf(buf, sizeof(buf), fmt, args); va_end(args); - printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf); + pr_emerg("Kernel panic - not syncing: %s\n", buf); #ifdef CONFIG_DEBUG_BUGVERBOSE /* * Avoid nested stack-dumping if a panic occurs during oops processing @@ -112,9 +113,11 @@ void panic(const char *fmt, ...) /* * If we have crashed and we have a crash kernel loaded let it handle * everything else. - * Do we want to call this before we try to display a message? + * If we want to run this after calling panic_notifiers, pass + * the "crash_kexec_post_notifiers" option to the kernel. */ - crash_kexec(NULL); + if (!crash_kexec_post_notifiers) + crash_kexec(NULL); /* * Note smp_send_stop is the usual smp shutdown function, which @@ -131,6 +134,15 @@ void panic(const char *fmt, ...) kmsg_dump(KMSG_DUMP_PANIC); + /* + * If you doubt kdump always works fine in any situation, + * "crash_kexec_post_notifiers" offers you a chance to run + * panic_notifiers and dumping kmsg before kdump. + * Note: since some panic_notifiers can make crashed kernel + * more unstable, it can increase risks of the kdump failure too. + */ + crash_kexec(NULL); + bust_spinlocks(0); if (!panic_blink) @@ -141,7 +153,7 @@ void panic(const char *fmt, ...) * Delay timeout seconds before rebooting the machine. * We can't use the "normal" timers since we just panicked. */ - printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout); + pr_emerg("Rebooting in %d seconds..", panic_timeout); for (i = 0; i < panic_timeout * 1000; i += PANIC_TIMER_STEP) { touch_nmi_watchdog(); @@ -165,7 +177,7 @@ void panic(const char *fmt, ...) extern int stop_a_enabled; /* Make sure the user can actually press Stop-A (L1-A) */ stop_a_enabled = 1; - printk(KERN_EMERG "Press Stop-A (L1-A) to return to the boot prom\n"); + pr_emerg("Press Stop-A (L1-A) to return to the boot prom\n"); } #endif #if defined(CONFIG_S390) @@ -176,6 +188,7 @@ void panic(const char *fmt, ...) disabled_wait(caller); } #endif + pr_emerg("---[ end Kernel panic - not syncing: %s\n", buf); local_irq_enable(); for (i = 0; ; i += PANIC_TIMER_STEP) { touch_softlockup_watchdog(); @@ -199,7 +212,7 @@ struct tnt { static const struct tnt tnts[] = { { TAINT_PROPRIETARY_MODULE, 'P', 'G' }, { TAINT_FORCED_MODULE, 'F', ' ' }, - { TAINT_UNSAFE_SMP, 'S', ' ' }, + { TAINT_CPU_OUT_OF_SPEC, 'S', ' ' }, { TAINT_FORCED_RMMOD, 'R', ' ' }, { TAINT_MACHINE_CHECK, 'M', ' ' }, { TAINT_BAD_PAGE, 'B', ' ' }, @@ -210,6 +223,7 @@ static const struct tnt tnts[] = { { TAINT_CRAP, 'C', ' ' }, { TAINT_FIRMWARE_WORKAROUND, 'I', ' ' }, { TAINT_OOT_MODULE, 'O', ' ' }, + { TAINT_UNSIGNED_MODULE, 'E', ' ' }, }; /** @@ -228,6 +242,7 @@ static const struct tnt tnts[] = { * 'C' - modules from drivers/staging are loaded. * 'I' - Working around severe firmware bug. * 'O' - Out-of-tree module has been loaded. + * 'E' - Unsigned module has been loaded. * * The string is overwritten by the next call to print_tainted(). */ @@ -274,8 +289,7 @@ unsigned long get_taint(void) void add_taint(unsigned flag, enum lockdep_ok lockdep_ok) { if (lockdep_ok == LOCKDEP_NOW_UNRELIABLE && __debug_locks_off()) - printk(KERN_WARNING - "Disabling lock debugging due to kernel taint\n"); + pr_warn("Disabling lock debugging due to kernel taint\n"); set_bit(flag, &tainted_mask); } @@ -380,8 +394,7 @@ late_initcall(init_oops_id); void print_oops_end_marker(void) { init_oops_id(); - printk(KERN_WARNING "---[ end trace %016llx ]---\n", - (unsigned long long)oops_id); + pr_warn("---[ end trace %016llx ]---\n", (unsigned long long)oops_id); } /* @@ -459,7 +472,7 @@ EXPORT_SYMBOL(warn_slowpath_null); * Called when gcc's -fstack-protector feature is used, and * gcc detects corruption of the on-stack canary value */ -void __stack_chk_fail(void) +__visible void __stack_chk_fail(void) { panic("stack-protector: Kernel stack is corrupted in: %p\n", __builtin_return_address(0)); @@ -471,6 +484,13 @@ EXPORT_SYMBOL(__stack_chk_fail); core_param(panic, panic_timeout, int, 0644); core_param(pause_on_oops, pause_on_oops, int, 0644); +static int __init setup_crash_kexec_post_notifiers(char *s) +{ + crash_kexec_post_notifiers = true; + return 0; +} +early_param("crash_kexec_post_notifiers", setup_crash_kexec_post_notifiers); + static int __init oops_setup(char *s) { if (!s) diff --git a/kernel/params.c b/kernel/params.c index b00142e7f3b..1e52ca233fd 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -177,13 +177,13 @@ static char *next_arg(char *args, char **param, char **val) } /* Args looks like "foo=bar,bar2 baz=fuz wiz". */ -int parse_args(const char *doing, - char *args, - const struct kernel_param *params, - unsigned num, - s16 min_level, - s16 max_level, - int (*unknown)(char *param, char *val, const char *doing)) +char *parse_args(const char *doing, + char *args, + const struct kernel_param *params, + unsigned num, + s16 min_level, + s16 max_level, + int (*unknown)(char *param, char *val, const char *doing)) { char *param, *val; @@ -198,6 +198,9 @@ int parse_args(const char *doing, int irq_was_disabled; args = next_arg(args, ¶m, &val); + /* Stop at -- */ + if (!val && strcmp(param, "--") == 0) + return args; irq_was_disabled = irqs_disabled(); ret = parse_one(param, val, doing, params, num, min_level, max_level, unknown); @@ -208,22 +211,22 @@ int parse_args(const char *doing, switch (ret) { case -ENOENT: pr_err("%s: Unknown parameter `%s'\n", doing, param); - return ret; + return ERR_PTR(ret); case -ENOSPC: pr_err("%s: `%s' too large for parameter `%s'\n", doing, val ?: "", param); - return ret; + return ERR_PTR(ret); case 0: break; default: pr_err("%s: `%s' invalid for parameter `%s'\n", doing, val ?: "", param); - return ret; + return ERR_PTR(ret); } } /* All parsed OK. */ - return 0; + return NULL; } /* Lazy bastard, eh? */ diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 06c62de9c71..db95d8eb761 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -318,7 +318,9 @@ static void *pidns_get(struct task_struct *task) struct pid_namespace *ns; rcu_read_lock(); - ns = get_pid_ns(task_active_pid_ns(task)); + ns = task_active_pid_ns(task); + if (ns) + get_pid_ns(ns); rcu_read_unlock(); return ns; diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 2fac9cc79b3..9a83d780fac 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -257,8 +257,7 @@ config ARCH_HAS_OPP bool config PM_OPP - bool "Operating Performance Point (OPP) Layer library" - depends on ARCH_HAS_OPP + bool ---help--- SOCs have a standard set of tuples consisting of frequency and voltage pairs that the device will support per voltage domain. This diff --git a/kernel/power/console.c b/kernel/power/console.c index eacb8bd8cab..aba9c545a0e 100644 --- a/kernel/power/console.c +++ b/kernel/power/console.c @@ -9,6 +9,7 @@ #include <linux/kbd_kern.h> #include <linux/vt.h> #include <linux/module.h> +#include <linux/slab.h> #include "power.h" #define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 37170d4dd9a..fcc2611d3f1 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -28,14 +28,16 @@ #include <linux/syscore_ops.h> #include <linux/ctype.h> #include <linux/genhd.h> +#include <trace/events/power.h> #include "power.h" static int nocompress; static int noresume; +static int nohibernate; static int resume_wait; -static int resume_delay; +static unsigned int resume_delay; static char resume_file[256] = CONFIG_PM_STD_PARTITION; dev_t swsusp_resume_device; sector_t swsusp_resume_block; @@ -61,6 +63,11 @@ bool freezer_test_done; static const struct platform_hibernation_ops *hibernation_ops; +bool hibernation_available(void) +{ + return (nohibernate == 0); +} + /** * hibernation_set_ops - Set the global hibernate operations. * @ops: Hibernation operations to use in subsequent hibernation transitions. @@ -228,19 +235,23 @@ static void platform_recover(int platform_mode) void swsusp_show_speed(struct timeval *start, struct timeval *stop, unsigned nr_pages, char *msg) { - s64 elapsed_centisecs64; - int centisecs; - int k; - int kps; + u64 elapsed_centisecs64; + unsigned int centisecs; + unsigned int k; + unsigned int kps; elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start); + /* + * If "(s64)elapsed_centisecs64 < 0", it will print long elapsed time, + * it is obvious enough for what went wrong. + */ do_div(elapsed_centisecs64, NSEC_PER_SEC / 100); centisecs = elapsed_centisecs64; if (centisecs == 0) centisecs = 1; /* avoid div-by-zero */ k = nr_pages * (PAGE_SIZE / 1024); kps = (k * 100) / centisecs; - printk(KERN_INFO "PM: %s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n", + printk(KERN_INFO "PM: %s %u kbytes in %u.%02u seconds (%u.%02u MB/s)\n", msg, k, centisecs / 100, centisecs % 100, kps / 1000, (kps % 1000) / 10); @@ -288,7 +299,9 @@ static int create_image(int platform_mode) in_suspend = 1; save_processor_state(); + trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, true); error = swsusp_arch_suspend(); + trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, false); if (error) printk(KERN_ERR "PM: Error %d creating hibernation image\n", error); @@ -595,7 +608,8 @@ static void power_down(void) case HIBERNATION_PLATFORM: hibernation_platform_enter(); case HIBERNATION_SHUTDOWN: - kernel_power_off(); + if (pm_power_off) + kernel_power_off(); break; #ifdef CONFIG_SUSPEND case HIBERNATION_SUSPEND: @@ -623,7 +637,8 @@ static void power_down(void) * corruption after resume. */ printk(KERN_CRIT "PM: Please power down manually\n"); - while(1); + while (1) + cpu_relax(); } /** @@ -633,6 +648,11 @@ int hibernate(void) { int error; + if (!hibernation_available()) { + pr_debug("PM: Hibernation not available.\n"); + return -EPERM; + } + lock_system_sleep(); /* The snapshot device should not be opened while we're running */ if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { @@ -725,7 +745,7 @@ static int software_resume(void) /* * If the user said "noresume".. bail out early. */ - if (noresume) + if (noresume || !hibernation_available()) return 0; /* @@ -891,6 +911,9 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr, int i; char *start = buf; + if (!hibernation_available()) + return sprintf(buf, "[disabled]\n"); + for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) { if (!hibernation_modes[i]) continue; @@ -925,6 +948,9 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, char *p; int mode = HIBERNATION_INVALID; + if (!hibernation_available()) + return -EPERM; + p = memchr(buf, '\n', n); len = p ? p - buf : n; @@ -973,16 +999,20 @@ static ssize_t resume_show(struct kobject *kobj, struct kobj_attribute *attr, static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { - unsigned int maj, min; dev_t res; - int ret = -EINVAL; + int len = n; + char *name; - if (sscanf(buf, "%u:%u", &maj, &min) != 2) - goto out; + if (len && buf[len-1] == '\n') + len--; + name = kstrndup(buf, len, GFP_KERNEL); + if (!name) + return -ENOMEM; - res = MKDEV(maj,min); - if (maj != MAJOR(res) || min != MINOR(res)) - goto out; + res = name_to_dev_t(name); + kfree(name); + if (!res) + return -EINVAL; lock_system_sleep(); swsusp_resume_device = res; @@ -990,9 +1020,7 @@ static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr, printk(KERN_INFO "PM: Starting manual resume from disk\n"); noresume = 0; software_resume(); - ret = n; - out: - return ret; + return n; } power_attr(resume); @@ -1090,6 +1118,10 @@ static int __init hibernate_setup(char *str) noresume = 1; else if (!strncmp(str, "nocompress", 10)) nocompress = 1; + else if (!strncmp(str, "no", 2)) { + noresume = 1; + nohibernate = 1; + } return 1; } @@ -1107,13 +1139,30 @@ static int __init resumewait_setup(char *str) static int __init resumedelay_setup(char *str) { - resume_delay = simple_strtoul(str, NULL, 0); + int rc = kstrtouint(str, 0, &resume_delay); + + if (rc) + return rc; + return 1; +} + +static int __init nohibernate_setup(char *str) +{ + noresume = 1; + nohibernate = 1; return 1; } +static int __init kaslr_nohibernate_setup(char *str) +{ + return nohibernate_setup(str); +} + __setup("noresume", noresume_setup); __setup("resume_offset=", resume_offset_setup); __setup("resume=", resume_setup); __setup("hibernate=", hibernate_setup); __setup("resumewait", resumewait_setup); __setup("resumedelay=", resumedelay_setup); +__setup("nohibernate", nohibernate_setup); +__setup("kaslr", kaslr_nohibernate_setup); diff --git a/kernel/power/main.c b/kernel/power/main.c index 1d1bf630e6e..8e90f330f13 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -279,34 +279,32 @@ static inline void pm_print_times_init(void) {} struct kobject *power_kobj; /** - * state - control system power state. + * state - control system sleep states. * - * show() returns what states are supported, which is hard-coded to - * 'standby' (Power-On Suspend), 'mem' (Suspend-to-RAM), and - * 'disk' (Suspend-to-Disk). + * show() returns available sleep state labels, which may be "mem", "standby", + * "freeze" and "disk" (hibernation). See Documentation/power/states.txt for a + * description of what they mean. * - * store() accepts one of those strings, translates it into the - * proper enumerated value, and initiates a suspend transition. + * store() accepts one of those strings, translates it into the proper + * enumerated value, and initiates a suspend transition. */ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { char *s = buf; #ifdef CONFIG_SUSPEND - int i; + suspend_state_t i; + + for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) + if (pm_states[i].state) + s += sprintf(s,"%s ", pm_states[i].label); - for (i = 0; i < PM_SUSPEND_MAX; i++) { - if (pm_states[i] && valid_state(i)) - s += sprintf(s,"%s ", pm_states[i]); - } #endif -#ifdef CONFIG_HIBERNATION - s += sprintf(s, "%s\n", "disk"); -#else + if (hibernation_available()) + s += sprintf(s, "disk "); if (s != buf) /* convert the last space to a newline */ *(s-1) = '\n'; -#endif return (s - buf); } @@ -314,7 +312,7 @@ static suspend_state_t decode_state(const char *buf, size_t n) { #ifdef CONFIG_SUSPEND suspend_state_t state = PM_SUSPEND_MIN; - const char * const *s; + struct pm_sleep_state *s; #endif char *p; int len; @@ -328,8 +326,9 @@ static suspend_state_t decode_state(const char *buf, size_t n) #ifdef CONFIG_SUSPEND for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) - if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) - return state; + if (s->state && len == strlen(s->label) + && !strncmp(buf, s->label, len)) + return s->state; #endif return PM_SUSPEND_ON; @@ -447,8 +446,8 @@ static ssize_t autosleep_show(struct kobject *kobj, #ifdef CONFIG_SUSPEND if (state < PM_SUSPEND_MAX) - return sprintf(buf, "%s\n", valid_state(state) ? - pm_states[state] : "error"); + return sprintf(buf, "%s\n", pm_states[state].state ? + pm_states[state].label : "error"); #endif #ifdef CONFIG_HIBERNATION return sprintf(buf, "disk\n"); diff --git a/kernel/power/power.h b/kernel/power/power.h index 7d4b7ffb3c1..c60f13b5270 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -2,6 +2,7 @@ #include <linux/suspend_ioctls.h> #include <linux/utsname.h> #include <linux/freezer.h> +#include <linux/compiler.h> struct swsusp_info { struct new_utsname uts; @@ -11,7 +12,7 @@ struct swsusp_info { unsigned long image_pages; unsigned long pages; unsigned long size; -} __attribute__((aligned(PAGE_SIZE))); +} __aligned(PAGE_SIZE); #ifdef CONFIG_HIBERNATION /* kernel/power/snapshot.c */ @@ -49,6 +50,8 @@ static inline char *check_image_kernel(struct swsusp_info *info) */ #define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT) +asmlinkage int swsusp_save(void); + /* kernel/power/hibernate.c */ extern bool freezer_test_done; @@ -175,17 +178,20 @@ extern void swsusp_show_speed(struct timeval *, struct timeval *, unsigned int, char *); #ifdef CONFIG_SUSPEND +struct pm_sleep_state { + const char *label; + suspend_state_t state; +}; + /* kernel/power/suspend.c */ -extern const char *const pm_states[]; +extern struct pm_sleep_state pm_states[]; -extern bool valid_state(suspend_state_t state); extern int suspend_devices_and_enter(suspend_state_t state); #else /* !CONFIG_SUSPEND */ static inline int suspend_devices_and_enter(suspend_state_t state) { return -ENOSYS; } -static inline bool valid_state(suspend_state_t state) { return false; } #endif /* !CONFIG_SUSPEND */ #ifdef CONFIG_PM_TEST_SUSPEND diff --git a/kernel/power/process.c b/kernel/power/process.c index 06ec8869dbf..4ee194eb524 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -17,6 +17,7 @@ #include <linux/delay.h> #include <linux/workqueue.h> #include <linux/kmod.h> +#include <trace/events/power.h> /* * Timeout for stopping processes @@ -175,6 +176,7 @@ void thaw_processes(void) struct task_struct *g, *p; struct task_struct *curr = current; + trace_suspend_resume(TPS("thaw_processes"), 0, true); if (pm_freezing) atomic_dec(&system_freezing_cnt); pm_freezing = false; @@ -184,6 +186,7 @@ void thaw_processes(void) printk("Restarting tasks ... "); + __usermodehelper_set_disable_depth(UMH_FREEZING); thaw_workqueues(); read_lock(&tasklist_lock); @@ -201,6 +204,7 @@ void thaw_processes(void) schedule(); printk("done.\n"); + trace_suspend_resume(TPS("thaw_processes"), 0, false); } void thaw_kernel_threads(void) diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 8dff9b48075..884b7705886 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -66,6 +66,7 @@ static struct pm_qos_constraints cpu_dma_constraints = { .list = PLIST_HEAD_INIT(cpu_dma_constraints.list), .target_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, .default_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, + .no_constraint_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, .type = PM_QOS_MIN, .notifiers = &cpu_dma_lat_notifier, }; @@ -79,6 +80,7 @@ static struct pm_qos_constraints network_lat_constraints = { .list = PLIST_HEAD_INIT(network_lat_constraints.list), .target_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, .default_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, + .no_constraint_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, .type = PM_QOS_MIN, .notifiers = &network_lat_notifier, }; @@ -93,6 +95,7 @@ static struct pm_qos_constraints network_tput_constraints = { .list = PLIST_HEAD_INIT(network_tput_constraints.list), .target_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, .default_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, + .no_constraint_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, .type = PM_QOS_MAX, .notifiers = &network_throughput_notifier, }; @@ -128,7 +131,7 @@ static const struct file_operations pm_qos_power_fops = { static inline int pm_qos_get_value(struct pm_qos_constraints *c) { if (plist_head_empty(&c->list)) - return c->default_value; + return c->no_constraint_value; switch (c->type) { case PM_QOS_MIN: @@ -170,6 +173,7 @@ int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node, { unsigned long flags; int prev_value, curr_value, new_value; + int ret; spin_lock_irqsave(&pm_qos_lock, flags); prev_value = pm_qos_get_value(c); @@ -205,13 +209,15 @@ int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node, trace_pm_qos_update_target(action, prev_value, curr_value); if (prev_value != curr_value) { - blocking_notifier_call_chain(c->notifiers, - (unsigned long)curr_value, - NULL); - return 1; + ret = 1; + if (c->notifiers) + blocking_notifier_call_chain(c->notifiers, + (unsigned long)curr_value, + NULL); } else { - return 0; + ret = 0; } + return ret; } /** diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index d9f61a14580..1ea328aafdc 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -27,6 +27,7 @@ #include <linux/highmem.h> #include <linux/list.h> #include <linux/slab.h> +#include <linux/compiler.h> #include <asm/uaccess.h> #include <asm/mmu_context.h> @@ -155,7 +156,7 @@ static inline void free_image_page(void *addr, int clear_nosave_free) struct linked_page { struct linked_page *next; char data[LINKED_PAGE_DATA_SIZE]; -} __attribute__((packed)); +} __packed; static inline void free_list_of_pages(struct linked_page *list, int clear_page_nosave) @@ -1268,7 +1269,7 @@ static void free_unnecessary_pages(void) * [number of saveable pages] - [number of pages that can be freed in theory] * * where the second term is the sum of (1) reclaimable slab pages, (2) active - * and (3) inactive anonymouns pages, (4) active and (5) inactive file pages, + * and (3) inactive anonymous pages, (4) active and (5) inactive file pages, * minus mapped file pages. */ static unsigned long minimum_image_size(unsigned long saveable) @@ -1585,7 +1586,7 @@ swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm, return -ENOMEM; } -asmlinkage int swsusp_save(void) +asmlinkage __visible int swsusp_save(void) { unsigned int nr_pages, nr_highmem; diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 62ee437b5c7..ed35a4790af 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -14,6 +14,7 @@ #include <linux/init.h> #include <linux/console.h> #include <linux/cpu.h> +#include <linux/cpuidle.h> #include <linux/syscalls.h> #include <linux/gfp.h> #include <linux/io.h> @@ -26,25 +27,34 @@ #include <linux/syscore_ops.h> #include <linux/ftrace.h> #include <trace/events/power.h> +#include <linux/compiler.h> #include "power.h" -const char *const pm_states[PM_SUSPEND_MAX] = { - [PM_SUSPEND_FREEZE] = "freeze", - [PM_SUSPEND_STANDBY] = "standby", - [PM_SUSPEND_MEM] = "mem", +struct pm_sleep_state pm_states[PM_SUSPEND_MAX] = { + [PM_SUSPEND_FREEZE] = { .label = "freeze", .state = PM_SUSPEND_FREEZE }, + [PM_SUSPEND_STANDBY] = { .label = "standby", }, + [PM_SUSPEND_MEM] = { .label = "mem", }, }; static const struct platform_suspend_ops *suspend_ops; +static const struct platform_freeze_ops *freeze_ops; static bool need_suspend_ops(suspend_state_t state) { - return !!(state > PM_SUSPEND_FREEZE); + return state > PM_SUSPEND_FREEZE; } static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head); static bool suspend_freeze_wake; +void freeze_set_ops(const struct platform_freeze_ops *ops) +{ + lock_system_sleep(); + freeze_ops = ops; + unlock_system_sleep(); +} + static void freeze_begin(void) { suspend_freeze_wake = false; @@ -52,7 +62,11 @@ static void freeze_begin(void) static void freeze_enter(void) { + cpuidle_use_deepest_state(true); + cpuidle_resume(); wait_event(suspend_freeze_wait_head, suspend_freeze_wake); + cpuidle_pause(); + cpuidle_use_deepest_state(false); } void freeze_wake(void) @@ -62,42 +76,62 @@ void freeze_wake(void) } EXPORT_SYMBOL_GPL(freeze_wake); +static bool valid_state(suspend_state_t state) +{ + /* + * PM_SUSPEND_STANDBY and PM_SUSPEND_MEM states need low level + * support and need to be valid to the low level + * implementation, no valid callback implies that none are valid. + */ + return suspend_ops && suspend_ops->valid && suspend_ops->valid(state); +} + +/* + * If this is set, the "mem" label always corresponds to the deepest sleep state + * available, the "standby" label corresponds to the second deepest sleep state + * available (if any), and the "freeze" label corresponds to the remaining + * available sleep state (if there is one). + */ +static bool relative_states; + +static int __init sleep_states_setup(char *str) +{ + relative_states = !strncmp(str, "1", 1); + if (relative_states) { + pm_states[PM_SUSPEND_MEM].state = PM_SUSPEND_FREEZE; + pm_states[PM_SUSPEND_FREEZE].state = 0; + } + return 1; +} + +__setup("relative_sleep_states=", sleep_states_setup); + /** * suspend_set_ops - Set the global suspend method table. * @ops: Suspend operations to use. */ void suspend_set_ops(const struct platform_suspend_ops *ops) { + suspend_state_t i; + int j = PM_SUSPEND_MAX - 1; + lock_system_sleep(); + suspend_ops = ops; + for (i = PM_SUSPEND_MEM; i >= PM_SUSPEND_STANDBY; i--) + if (valid_state(i)) + pm_states[j--].state = i; + else if (!relative_states) + pm_states[j--].state = 0; + + pm_states[j--].state = PM_SUSPEND_FREEZE; + while (j >= PM_SUSPEND_MIN) + pm_states[j--].state = 0; + unlock_system_sleep(); } EXPORT_SYMBOL_GPL(suspend_set_ops); -bool valid_state(suspend_state_t state) -{ - if (state == PM_SUSPEND_FREEZE) { -#ifdef CONFIG_PM_DEBUG - if (pm_test_level != TEST_NONE && - pm_test_level != TEST_FREEZER && - pm_test_level != TEST_DEVICES && - pm_test_level != TEST_PLATFORM) { - printk(KERN_WARNING "Unsupported pm_test mode for " - "freeze state, please choose " - "none/freezer/devices/platform.\n"); - return false; - } -#endif - return true; - } - /* - * PM_SUSPEND_STANDBY and PM_SUSPEND_MEMORY states need lowlevel - * support and need to be valid to the lowlevel - * implementation, no valid callback implies that none are valid. - */ - return suspend_ops && suspend_ops->valid && suspend_ops->valid(state); -} - /** * suspend_valid_only_mem - Generic memory-only valid callback. * @@ -143,7 +177,9 @@ static int suspend_prepare(suspend_state_t state) if (error) goto Finish; + trace_suspend_resume(TPS("freeze_processes"), 0, true); error = suspend_freeze_processes(); + trace_suspend_resume(TPS("freeze_processes"), 0, false); if (!error) return 0; @@ -156,13 +192,13 @@ static int suspend_prepare(suspend_state_t state) } /* default implementation */ -void __attribute__ ((weak)) arch_suspend_disable_irqs(void) +void __weak arch_suspend_disable_irqs(void) { local_irq_disable(); } /* default implementation */ -void __attribute__ ((weak)) arch_suspend_enable_irqs(void) +void __weak arch_suspend_enable_irqs(void) { local_irq_enable(); } @@ -206,7 +242,9 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) * all the devices are suspended. */ if (state == PM_SUSPEND_FREEZE) { + trace_suspend_resume(TPS("machine_suspend"), state, true); freeze_enter(); + trace_suspend_resume(TPS("machine_suspend"), state, false); goto Platform_wake; } @@ -222,7 +260,11 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) if (!error) { *wakeup = pm_wakeup_pending(); if (!(suspend_test(TEST_CORE) || *wakeup)) { + trace_suspend_resume(TPS("machine_suspend"), + state, true); error = suspend_ops->enter(state); + trace_suspend_resume(TPS("machine_suspend"), + state, false); events_check_enabled = false; } syscore_resume(); @@ -260,11 +302,14 @@ int suspend_devices_and_enter(suspend_state_t state) if (need_suspend_ops(state) && !suspend_ops) return -ENOSYS; - trace_machine_suspend(state); if (need_suspend_ops(state) && suspend_ops->begin) { error = suspend_ops->begin(state); if (error) goto Close; + } else if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->begin) { + error = freeze_ops->begin(); + if (error) + goto Close; } suspend_console(); suspend_test_start(); @@ -290,7 +335,9 @@ int suspend_devices_and_enter(suspend_state_t state) Close: if (need_suspend_ops(state) && suspend_ops->end) suspend_ops->end(); - trace_machine_suspend(PWR_EVENT_EXIT); + else if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->end) + freeze_ops->end(); + return error; Recover_platform: @@ -324,20 +371,31 @@ static int enter_state(suspend_state_t state) { int error; - if (!valid_state(state)) - return -ENODEV; - + trace_suspend_resume(TPS("suspend_enter"), state, true); + if (state == PM_SUSPEND_FREEZE) { +#ifdef CONFIG_PM_DEBUG + if (pm_test_level != TEST_NONE && pm_test_level <= TEST_CPUS) { + pr_warning("PM: Unsupported test mode for freeze state," + "please choose none/freezer/devices/platform.\n"); + return -EAGAIN; + } +#endif + } else if (!valid_state(state)) { + return -EINVAL; + } if (!mutex_trylock(&pm_mutex)) return -EBUSY; if (state == PM_SUSPEND_FREEZE) freeze_begin(); + trace_suspend_resume(TPS("sync_filesystems"), 0, true); printk(KERN_INFO "PM: Syncing filesystems ... "); sys_sync(); printk("done.\n"); + trace_suspend_resume(TPS("sync_filesystems"), 0, false); - pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); + pr_debug("PM: Preparing system for %s sleep\n", pm_states[state].label); error = suspend_prepare(state); if (error) goto Unlock; @@ -345,7 +403,8 @@ static int enter_state(suspend_state_t state) if (suspend_test(TEST_FREEZER)) goto Finish; - pr_debug("PM: Entering %s sleep\n", pm_states[state]); + trace_suspend_resume(TPS("suspend_enter"), state, false); + pr_debug("PM: Entering %s sleep\n", pm_states[state].label); pm_restrict_gfp_mask(); error = suspend_devices_and_enter(state); pm_restore_gfp_mask(); diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c index 9b2a1d58558..269b097e78e 100644 --- a/kernel/power/suspend_test.c +++ b/kernel/power/suspend_test.c @@ -92,13 +92,13 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state) } if (state == PM_SUSPEND_MEM) { - printk(info_test, pm_states[state]); + printk(info_test, pm_states[state].label); status = pm_suspend(state); if (status == -ENODEV) state = PM_SUSPEND_STANDBY; } if (state == PM_SUSPEND_STANDBY) { - printk(info_test, pm_states[state]); + printk(info_test, pm_states[state].label); status = pm_suspend(state); } if (status < 0) @@ -136,18 +136,16 @@ static char warn_bad_state[] __initdata = static int __init setup_test_suspend(char *value) { - unsigned i; + suspend_state_t i; /* "=mem" ==> "mem" */ value++; - for (i = 0; i < PM_SUSPEND_MAX; i++) { - if (!pm_states[i]) - continue; - if (strcmp(pm_states[i], value) != 0) - continue; - test_state = (__force suspend_state_t) i; - return 0; - } + for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) + if (!strcmp(pm_states[i].label, value)) { + test_state = pm_states[i].state; + return 0; + } + printk(warn_bad_state, value); return 0; } @@ -164,8 +162,8 @@ static int __init test_suspend(void) /* PM is initialized by now; is that state testable? */ if (test_state == PM_SUSPEND_ON) goto done; - if (!valid_state(test_state)) { - printk(warn_bad_state, pm_states[test_state]); + if (!pm_states[test_state].state) { + printk(warn_bad_state, pm_states[test_state].label); goto done; } diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 7c33ed20041..aaa3261dea5 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -101,7 +101,7 @@ struct swsusp_header { unsigned int flags; /* Flags to pass to the "boot" kernel */ char orig_sig[10]; char sig[10]; -} __attribute__((packed)); +} __packed; static struct swsusp_header *swsusp_header; @@ -567,7 +567,7 @@ static int lzo_compress_threadfn(void *data) /** * save_image_lzo - Save the suspend image data compressed with LZO. - * @handle: Swap mam handle to use for saving the image. + * @handle: Swap map handle to use for saving the image. * @snapshot: Image to read data from. * @nr_to_write: Number of pages to save. */ diff --git a/kernel/power/user.c b/kernel/power/user.c index 98d357584cd..526e8911460 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -49,6 +49,9 @@ static int snapshot_open(struct inode *inode, struct file *filp) struct snapshot_data *data; int error; + if (!hibernation_available()) + return -EPERM; + lock_system_sleep(); if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c index 8f50de394d2..019069c84ff 100644 --- a/kernel/power/wakelock.c +++ b/kernel/power/wakelock.c @@ -18,6 +18,8 @@ #include <linux/rbtree.h> #include <linux/slab.h> +#include "power.h" + static DEFINE_MUTEX(wakelocks_lock); struct wakelock { diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index b1d255f0413..13e839dbca0 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -54,20 +54,16 @@ #include "console_cmdline.h" #include "braille.h" -/* printk's without a loglevel use this.. */ -#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL - -/* We show everything that is MORE important than this.. */ -#define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */ -#define DEFAULT_CONSOLE_LOGLEVEL 7 /* anything MORE serious than KERN_DEBUG */ - int console_printk[4] = { - DEFAULT_CONSOLE_LOGLEVEL, /* console_loglevel */ + CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */ DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */ - MINIMUM_CONSOLE_LOGLEVEL, /* minimum_console_loglevel */ - DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ + CONSOLE_LOGLEVEL_MIN, /* minimum_console_loglevel */ + CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */ }; +/* Deferred messaged from sched code are marked by this special level */ +#define SCHED_MESSAGE_LOGLEVEL -2 + /* * Low level drivers may need that to know if they can schedule in * their unblank() callback or not. So let's export it. @@ -91,6 +87,29 @@ static struct lockdep_map console_lock_dep_map = { #endif /* + * Helper macros to handle lockdep when locking/unlocking console_sem. We use + * macros instead of functions so that _RET_IP_ contains useful information. + */ +#define down_console_sem() do { \ + down(&console_sem);\ + mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_);\ +} while (0) + +static int __down_trylock_console_sem(unsigned long ip) +{ + if (down_trylock(&console_sem)) + return 1; + mutex_acquire(&console_lock_dep_map, 0, 1, ip); + return 0; +} +#define down_trylock_console_sem() __down_trylock_console_sem(_RET_IP_) + +#define up_console_sem() do { \ + mutex_release(&console_lock_dep_map, 1, _RET_IP_);\ + up(&console_sem);\ +} while (0) + +/* * This is used for debugging the mess that is the VT code by * keeping track if we have the console semaphore held. It's * definitely not the perfect debug tool (we don't know if _WE_ @@ -206,8 +225,9 @@ struct printk_log { }; /* - * The logbuf_lock protects kmsg buffer, indices, counters. It is also - * used in interesting ways to provide interlocking in console_unlock(); + * The logbuf_lock protects kmsg buffer, indices, counters. This can be taken + * within the scheduler's rq lock. It must be released before calling + * console_unlock() or anything else that might wake up a process. */ static DEFINE_RAW_SPINLOCK(logbuf_lock); @@ -250,9 +270,6 @@ static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); static char *log_buf = __log_buf; static u32 log_buf_len = __LOG_BUF_LEN; -/* cpu currently holding logbuf_lock */ -static volatile unsigned int logbuf_cpu = UINT_MAX; - /* human readable text of the record */ static char *log_text(const struct printk_log *msg) { @@ -297,37 +314,109 @@ static u32 log_next(u32 idx) return idx + msg->len; } -/* insert record into the buffer, discard old ones, update heads */ -static void log_store(int facility, int level, - enum log_flags flags, u64 ts_nsec, - const char *dict, u16 dict_len, - const char *text, u16 text_len) +/* + * Check whether there is enough free space for the given message. + * + * The same values of first_idx and next_idx mean that the buffer + * is either empty or full. + * + * If the buffer is empty, we must respect the position of the indexes. + * They cannot be reset to the beginning of the buffer. + */ +static int logbuf_has_space(u32 msg_size, bool empty) { - struct printk_log *msg; - u32 size, pad_len; + u32 free; - /* number of '\0' padding bytes to next message */ - size = sizeof(struct printk_log) + text_len + dict_len; - pad_len = (-size) & (LOG_ALIGN - 1); - size += pad_len; + if (log_next_idx > log_first_idx || empty) + free = max(log_buf_len - log_next_idx, log_first_idx); + else + free = log_first_idx - log_next_idx; + + /* + * We need space also for an empty header that signalizes wrapping + * of the buffer. + */ + return free >= msg_size + sizeof(struct printk_log); +} +static int log_make_free_space(u32 msg_size) +{ while (log_first_seq < log_next_seq) { - u32 free; + if (logbuf_has_space(msg_size, false)) + return 0; + /* drop old messages until we have enough continuous space */ + log_first_idx = log_next(log_first_idx); + log_first_seq++; + } - if (log_next_idx > log_first_idx) - free = max(log_buf_len - log_next_idx, log_first_idx); - else - free = log_first_idx - log_next_idx; + /* sequence numbers are equal, so the log buffer is empty */ + if (logbuf_has_space(msg_size, true)) + return 0; - if (free > size + sizeof(struct printk_log)) - break; + return -ENOMEM; +} - /* drop old messages until we have enough contiuous space */ - log_first_idx = log_next(log_first_idx); - log_first_seq++; +/* compute the message size including the padding bytes */ +static u32 msg_used_size(u16 text_len, u16 dict_len, u32 *pad_len) +{ + u32 size; + + size = sizeof(struct printk_log) + text_len + dict_len; + *pad_len = (-size) & (LOG_ALIGN - 1); + size += *pad_len; + + return size; +} + +/* + * Define how much of the log buffer we could take at maximum. The value + * must be greater than two. Note that only half of the buffer is available + * when the index points to the middle. + */ +#define MAX_LOG_TAKE_PART 4 +static const char trunc_msg[] = "<truncated>"; + +static u32 truncate_msg(u16 *text_len, u16 *trunc_msg_len, + u16 *dict_len, u32 *pad_len) +{ + /* + * The message should not take the whole buffer. Otherwise, it might + * get removed too soon. + */ + u32 max_text_len = log_buf_len / MAX_LOG_TAKE_PART; + if (*text_len > max_text_len) + *text_len = max_text_len; + /* enable the warning message */ + *trunc_msg_len = strlen(trunc_msg); + /* disable the "dict" completely */ + *dict_len = 0; + /* compute the size again, count also the warning message */ + return msg_used_size(*text_len + *trunc_msg_len, 0, pad_len); +} + +/* insert record into the buffer, discard old ones, update heads */ +static int log_store(int facility, int level, + enum log_flags flags, u64 ts_nsec, + const char *dict, u16 dict_len, + const char *text, u16 text_len) +{ + struct printk_log *msg; + u32 size, pad_len; + u16 trunc_msg_len = 0; + + /* number of '\0' padding bytes to next message */ + size = msg_used_size(text_len, dict_len, &pad_len); + + if (log_make_free_space(size)) { + /* truncate the message if it is too long for empty buffer */ + size = truncate_msg(&text_len, &trunc_msg_len, + &dict_len, &pad_len); + /* survive when the log buffer is too small for trunc_msg */ + if (log_make_free_space(size)) + return 0; } - if (log_next_idx + size + sizeof(struct printk_log) >= log_buf_len) { + if (log_next_idx + size + sizeof(struct printk_log) > log_buf_len) { /* * This message + an additional empty header does not fit * at the end of the buffer. Add an empty header with len == 0 @@ -341,6 +430,10 @@ static void log_store(int facility, int level, msg = (struct printk_log *)(log_buf + log_next_idx); memcpy(log_text(msg), text, text_len); msg->text_len = text_len; + if (trunc_msg_len) { + memcpy(log_text(msg) + text_len, trunc_msg, trunc_msg_len); + msg->text_len += trunc_msg_len; + } memcpy(log_dict(msg), dict, dict_len); msg->dict_len = dict_len; msg->facility = facility; @@ -351,11 +444,13 @@ static void log_store(int facility, int level, else msg->ts_nsec = local_clock(); memset(log_dict(msg) + dict_len, 0, pad_len); - msg->len = sizeof(struct printk_log) + text_len + dict_len + pad_len; + msg->len = size; /* insert message */ log_next_idx += msg->len; log_next_seq++; + + return msg->text_len; } #ifdef CONFIG_SECURITY_DMESG_RESTRICT @@ -1076,7 +1171,6 @@ static int syslog_print_all(char __user *buf, int size, bool clear) next_seq = log_next_seq; len = 0; - prev = 0; while (len >= 0 && seq < next_seq) { struct printk_log *msg = log_from_idx(idx); int textlen; @@ -1304,7 +1398,10 @@ static void zap_locks(void) sema_init(&console_sem, 1); } -/* Check if we have any console registered that can be called early in boot. */ +/* + * Check if we have any console that is capable of printing while cpu is + * booting or shutting down. Requires console_sem. + */ static int have_callable_console(void) { struct console *con; @@ -1334,36 +1431,22 @@ static inline int can_use_console(unsigned int cpu) * messages from a 'printk'. Return true (and with the * console_lock held, and 'console_locked' set) if it * is successful, false otherwise. - * - * This gets called with the 'logbuf_lock' spinlock held and - * interrupts disabled. It should return with 'lockbuf_lock' - * released but interrupts still disabled. */ static int console_trylock_for_printk(unsigned int cpu) - __releases(&logbuf_lock) { - int retval = 0, wake = 0; - - if (console_trylock()) { - retval = 1; - - /* - * If we can't use the console, we need to release - * the console semaphore by hand to avoid flushing - * the buffer. We need to hold the console semaphore - * in order to do this test safely. - */ - if (!can_use_console(cpu)) { - console_locked = 0; - wake = 1; - retval = 0; - } + if (!console_trylock()) + return 0; + /* + * If we can't use the console, we need to release the console + * semaphore by hand to avoid flushing the buffer. We need to hold the + * console semaphore in order to do this test safely. + */ + if (!can_use_console(cpu)) { + console_locked = 0; + up_console_sem(); + return 0; } - logbuf_cpu = UINT_MAX; - raw_spin_unlock(&logbuf_lock); - if (wake) - up(&console_sem); - return retval; + return 1; } int printk_delay_msec __read_mostly; @@ -1491,11 +1574,19 @@ asmlinkage int vprintk_emit(int facility, int level, static int recursion_bug; static char textbuf[LOG_LINE_MAX]; char *text = textbuf; - size_t text_len; + size_t text_len = 0; enum log_flags lflags = 0; unsigned long flags; int this_cpu; int printed_len = 0; + bool in_sched = false; + /* cpu currently holding logbuf_lock in this function */ + static volatile unsigned int logbuf_cpu = UINT_MAX; + + if (level == SCHED_MESSAGE_LOGLEVEL) { + level = -1; + in_sched = true; + } boot_delay_msec(level); printk_delay(); @@ -1531,17 +1622,22 @@ asmlinkage int vprintk_emit(int facility, int level, "BUG: recent printk recursion!"; recursion_bug = 0; - printed_len += strlen(recursion_msg); + text_len = strlen(recursion_msg); /* emit KERN_CRIT message */ - log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0, - NULL, 0, recursion_msg, printed_len); + printed_len += log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0, + NULL, 0, recursion_msg, text_len); } /* * The printf needs to come first; we need the syslog * prefix which might be passed-in as a parameter. */ - text_len = vscnprintf(text, sizeof(textbuf), fmt, args); + if (in_sched) + text_len = scnprintf(text, sizeof(textbuf), + KERN_WARNING "[sched_delayed] "); + + text_len += vscnprintf(text + text_len, + sizeof(textbuf) - text_len, fmt, args); /* mark and strip a trailing newline */ if (text_len && text[text_len-1] == '\n') { @@ -1561,9 +1657,12 @@ asmlinkage int vprintk_emit(int facility, int level, level = kern_level - '0'; case 'd': /* KERN_DEFAULT */ lflags |= LOG_PREFIX; - case 'c': /* KERN_CONT */ - break; } + /* + * No need to check length here because vscnprintf + * put '\0' at the end of the string. Only valid and + * newly printed level is detected. + */ text_len -= end_of_header - text; text = (char *)end_of_header; } @@ -1584,9 +1683,12 @@ asmlinkage int vprintk_emit(int facility, int level, cont_flush(LOG_NEWLINE); /* buffer line if possible, otherwise store it right away */ - if (!cont_add(facility, level, text, text_len)) - log_store(facility, level, lflags | LOG_CONT, 0, - dict, dictlen, text, text_len); + if (cont_add(facility, level, text, text_len)) + printed_len += text_len; + else + printed_len += log_store(facility, level, + lflags | LOG_CONT, 0, + dict, dictlen, text, text_len); } else { bool stored = false; @@ -1605,27 +1707,30 @@ asmlinkage int vprintk_emit(int facility, int level, cont_flush(LOG_NEWLINE); } - if (!stored) - log_store(facility, level, lflags, 0, - dict, dictlen, text, text_len); + if (stored) + printed_len += text_len; + else + printed_len += log_store(facility, level, lflags, 0, + dict, dictlen, text, text_len); } - printed_len += text_len; - /* - * Try to acquire and then immediately release the console semaphore. - * The release will print out buffers and wake up /dev/kmsg and syslog() - * users. - * - * The console_trylock_for_printk() function will release 'logbuf_lock' - * regardless of whether it actually gets the console semaphore or not. - */ - if (console_trylock_for_printk(this_cpu)) - console_unlock(); + logbuf_cpu = UINT_MAX; + raw_spin_unlock(&logbuf_lock); + + /* If called from the scheduler, we can not call up(). */ + if (!in_sched) { + /* + * Try to acquire and then immediately release the console + * semaphore. The release will print out buffers and wake up + * /dev/kmsg and syslog() users. + */ + if (console_trylock_for_printk(this_cpu)) + console_unlock(); + } lockdep_on(); out_restore_irqs: local_irq_restore(flags); - return printed_len; } EXPORT_SYMBOL(vprintk_emit); @@ -1672,7 +1777,7 @@ EXPORT_SYMBOL(printk_emit); * * See the vsnprintf() documentation for format string extensions over C99. */ -asmlinkage int printk(const char *fmt, ...) +asmlinkage __visible int printk(const char *fmt, ...) { va_list args; int r; @@ -1735,7 +1840,7 @@ void early_vprintk(const char *fmt, va_list ap) } } -asmlinkage void early_printk(const char *fmt, ...) +asmlinkage __visible void early_printk(const char *fmt, ...) { va_list ap; @@ -1880,14 +1985,14 @@ void suspend_console(void) printk("Suspending console(s) (use no_console_suspend to debug)\n"); console_lock(); console_suspended = 1; - up(&console_sem); + up_console_sem(); } void resume_console(void) { if (!console_suspend_enabled) return; - down(&console_sem); + down_console_sem(); console_suspended = 0; console_unlock(); } @@ -1929,12 +2034,11 @@ void console_lock(void) { might_sleep(); - down(&console_sem); + down_console_sem(); if (console_suspended) return; console_locked = 1; console_may_schedule = 1; - mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_); } EXPORT_SYMBOL(console_lock); @@ -1948,15 +2052,14 @@ EXPORT_SYMBOL(console_lock); */ int console_trylock(void) { - if (down_trylock(&console_sem)) + if (down_trylock_console_sem()) return 0; if (console_suspended) { - up(&console_sem); + up_console_sem(); return 0; } console_locked = 1; console_may_schedule = 0; - mutex_acquire(&console_lock_dep_map, 0, 1, _RET_IP_); return 1; } EXPORT_SYMBOL(console_trylock); @@ -2018,7 +2121,7 @@ void console_unlock(void) bool retry; if (console_suspended) { - up(&console_sem); + up_console_sem(); return; } @@ -2039,10 +2142,15 @@ again: } if (console_seq < log_first_seq) { + len = sprintf(text, "** %u printk messages dropped ** ", + (unsigned)(log_first_seq - console_seq)); + /* messages are gone, move to first one */ console_seq = log_first_seq; console_idx = log_first_idx; console_prev = 0; + } else { + len = 0; } skip: if (console_seq == log_next_seq) @@ -2067,8 +2175,8 @@ skip: } level = msg->level; - len = msg_print_text(msg, console_prev, false, - text, sizeof(text)); + len += msg_print_text(msg, console_prev, false, + text + len, sizeof(text) - len); console_idx = log_next(console_idx); console_seq++; console_prev = msg->flags; @@ -2080,7 +2188,6 @@ skip: local_irq_restore(flags); } console_locked = 0; - mutex_release(&console_lock_dep_map, 1, _RET_IP_); /* Release the exclusive_console once it is used */ if (unlikely(exclusive_console)) @@ -2088,7 +2195,7 @@ skip: raw_spin_unlock(&logbuf_lock); - up(&console_sem); + up_console_sem(); /* * Someone could have filled up the buffer again, so re-check if there's @@ -2133,7 +2240,7 @@ void console_unblank(void) * oops_in_progress is set to 1.. */ if (oops_in_progress) { - if (down_trylock(&console_sem) != 0) + if (down_trylock_console_sem() != 0) return; } else console_lock(); @@ -2409,6 +2516,7 @@ int unregister_console(struct console *console) if (console_drivers != NULL && console->flags & CON_CONSDEV) console_drivers->flags |= CON_CONSDEV; + console->flags &= ~CON_ENABLED; console_unlock(); console_sysfs_notify(); return res; @@ -2433,21 +2541,19 @@ late_initcall(printk_late_init); /* * Delayed printk version, for scheduler-internal messages: */ -#define PRINTK_BUF_SIZE 512 - #define PRINTK_PENDING_WAKEUP 0x01 -#define PRINTK_PENDING_SCHED 0x02 +#define PRINTK_PENDING_OUTPUT 0x02 static DEFINE_PER_CPU(int, printk_pending); -static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf); static void wake_up_klogd_work_func(struct irq_work *irq_work) { int pending = __this_cpu_xchg(printk_pending, 0); - if (pending & PRINTK_PENDING_SCHED) { - char *buf = __get_cpu_var(printk_sched_buf); - pr_warn("[sched_delayed] %s", buf); + if (pending & PRINTK_PENDING_OUTPUT) { + /* If trylock fails, someone else is doing the printing */ + if (console_trylock()) + console_unlock(); } if (pending & PRINTK_PENDING_WAKEUP) @@ -2469,23 +2575,19 @@ void wake_up_klogd(void) preempt_enable(); } -int printk_sched(const char *fmt, ...) +int printk_deferred(const char *fmt, ...) { - unsigned long flags; va_list args; - char *buf; int r; - local_irq_save(flags); - buf = __get_cpu_var(printk_sched_buf); - + preempt_disable(); va_start(args, fmt); - r = vsnprintf(buf, PRINTK_BUF_SIZE, fmt, args); + r = vprintk_emit(0, SCHED_MESSAGE_LOGLEVEL, NULL, 0, fmt, args); va_end(args); - __this_cpu_or(printk_pending, PRINTK_PENDING_SCHED); + __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT); irq_work_queue(&__get_cpu_var(wake_up_klogd_work)); - local_irq_restore(flags); + preempt_enable(); return r; } @@ -2788,7 +2890,6 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, next_idx = idx; l = 0; - prev = 0; while (seq < dumper->next_seq) { struct printk_log *msg = log_from_idx(idx); diff --git a/kernel/profile.c b/kernel/profile.c index 6631e1ef55a..54bf5ba2642 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -52,9 +52,9 @@ static DEFINE_MUTEX(profile_flip_mutex); int profile_setup(char *str) { - static char schedstr[] = "schedule"; - static char sleepstr[] = "sleep"; - static char kvmstr[] = "kvm"; + static const char schedstr[] = "schedule"; + static const char sleepstr[] = "sleep"; + static const char kvmstr[] = "kvm"; int par; if (!strncmp(str, sleepstr, strlen(sleepstr))) { @@ -64,12 +64,10 @@ int profile_setup(char *str) str += strlen(sleepstr) + 1; if (get_option(&str, &par)) prof_shift = par; - printk(KERN_INFO - "kernel sleep profiling enabled (shift: %ld)\n", + pr_info("kernel sleep profiling enabled (shift: %ld)\n", prof_shift); #else - printk(KERN_WARNING - "kernel sleep profiling requires CONFIG_SCHEDSTATS\n"); + pr_warn("kernel sleep profiling requires CONFIG_SCHEDSTATS\n"); #endif /* CONFIG_SCHEDSTATS */ } else if (!strncmp(str, schedstr, strlen(schedstr))) { prof_on = SCHED_PROFILING; @@ -77,8 +75,7 @@ int profile_setup(char *str) str += strlen(schedstr) + 1; if (get_option(&str, &par)) prof_shift = par; - printk(KERN_INFO - "kernel schedule profiling enabled (shift: %ld)\n", + pr_info("kernel schedule profiling enabled (shift: %ld)\n", prof_shift); } else if (!strncmp(str, kvmstr, strlen(kvmstr))) { prof_on = KVM_PROFILING; @@ -86,13 +83,12 @@ int profile_setup(char *str) str += strlen(kvmstr) + 1; if (get_option(&str, &par)) prof_shift = par; - printk(KERN_INFO - "kernel KVM profiling enabled (shift: %ld)\n", + pr_info("kernel KVM profiling enabled (shift: %ld)\n", prof_shift); } else if (get_option(&str, &par)) { prof_shift = par; prof_on = CPU_PROFILING; - printk(KERN_INFO "kernel profiling enabled (shift: %ld)\n", + pr_info("kernel profiling enabled (shift: %ld)\n", prof_shift); } return 1; @@ -549,14 +545,14 @@ static int create_hash_tables(void) struct page *page; page = alloc_pages_exact_node(node, - GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, + GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE, 0); if (!page) goto out_cleanup; per_cpu(cpu_profile_hits, cpu)[1] = (struct profile_hit *)page_address(page); page = alloc_pages_exact_node(node, - GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, + GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE, 0); if (!page) goto out_cleanup; @@ -591,18 +587,28 @@ out_cleanup: int __ref create_proc_profile(void) /* false positive from hotcpu_notifier */ { struct proc_dir_entry *entry; + int err = 0; if (!prof_on) return 0; - if (create_hash_tables()) - return -ENOMEM; + + cpu_notifier_register_begin(); + + if (create_hash_tables()) { + err = -ENOMEM; + goto out; + } + entry = proc_create("profile", S_IWUSR | S_IRUGO, NULL, &proc_profile_operations); if (!entry) - return 0; + goto out; proc_set_size(entry, (1 + prof_len) * sizeof(atomic_t)); - hotcpu_notifier(profile_cpu_callback, 0); - return 0; + __hotcpu_notifier(profile_cpu_callback, 0); + +out: + cpu_notifier_register_done(); + return err; } -module_init(create_proc_profile); +subsys_initcall(create_proc_profile); #endif /* CONFIG_PROC_FS */ diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 1f4bcb3cc21..adf98622cb3 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -1180,8 +1180,8 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request, return ret; } -asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, - compat_long_t addr, compat_long_t data) +COMPAT_SYSCALL_DEFINE4(ptrace, compat_long_t, request, compat_long_t, pid, + compat_long_t, addr, compat_long_t, data) { struct task_struct *child; long ret; diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index 01e9ec37a3e..807ccfbf69b 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -1,5 +1,5 @@ obj-y += update.o srcu.o -obj-$(CONFIG_RCU_TORTURE_TEST) += torture.o +obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_TREE_RCU) += tree.o obj-$(CONFIG_TREE_PREEMPT_RCU) += tree.o obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 79c3877e9c5..bfda2726ca4 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -12,8 +12,8 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. * * Copyright IBM Corporation, 2011 * @@ -23,6 +23,7 @@ #ifndef __LINUX_RCU_H #define __LINUX_RCU_H +#include <trace/events/rcu.h> #ifdef CONFIG_RCU_TRACE #define RCU_TRACE(stmt) stmt #else /* #ifdef CONFIG_RCU_TRACE */ @@ -116,8 +117,6 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head) } } -extern int rcu_expedited; - #ifdef CONFIG_RCU_STALL_COMMON extern int rcu_cpu_stall_suppress; diff --git a/kernel/rcu/torture.c b/kernel/rcu/rcutorture.c index 732f8ae3086..948a7693748 100644 --- a/kernel/rcu/torture.c +++ b/kernel/rcu/rcutorture.c @@ -12,13 +12,13 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. * * Copyright (C) IBM Corporation, 2005, 2006 * * Authors: Paul E. McKenney <paulmck@us.ibm.com> - * Josh Triplett <josh@freedesktop.org> + * Josh Triplett <josh@joshtriplett.org> * * See also: Documentation/RCU/torture.txt */ @@ -48,110 +48,60 @@ #include <linux/slab.h> #include <linux/trace_clock.h> #include <asm/byteorder.h> +#include <linux/torture.h> MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@freedesktop.org>"); +MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@joshtriplett.org>"); + + +torture_param(int, fqs_duration, 0, + "Duration of fqs bursts (us), 0 to disable"); +torture_param(int, fqs_holdoff, 0, "Holdoff time within fqs bursts (us)"); +torture_param(int, fqs_stutter, 3, "Wait time between fqs bursts (s)"); +torture_param(bool, gp_cond, false, "Use conditional/async GP wait primitives"); +torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); +torture_param(bool, gp_normal, false, + "Use normal (non-expedited) GP wait primitives"); +torture_param(bool, gp_sync, false, "Use synchronous GP wait primitives"); +torture_param(int, irqreader, 1, "Allow RCU readers from irq handlers"); +torture_param(int, n_barrier_cbs, 0, + "# of callbacks/kthreads for barrier testing"); +torture_param(int, nfakewriters, 4, "Number of RCU fake writer threads"); +torture_param(int, nreaders, -1, "Number of RCU reader threads"); +torture_param(int, object_debug, 0, + "Enable debug-object double call_rcu() testing"); +torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)"); +torture_param(int, onoff_interval, 0, + "Time between CPU hotplugs (s), 0=disable"); +torture_param(int, shuffle_interval, 3, "Number of seconds between shuffles"); +torture_param(int, shutdown_secs, 0, "Shutdown time (s), <= zero to disable."); +torture_param(int, stall_cpu, 0, "Stall duration (s), zero to disable."); +torture_param(int, stall_cpu_holdoff, 10, + "Time to wait before starting stall (s)."); +torture_param(int, stat_interval, 60, + "Number of seconds between stats printk()s"); +torture_param(int, stutter, 5, "Number of seconds to run/halt test"); +torture_param(int, test_boost, 1, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); +torture_param(int, test_boost_duration, 4, + "Duration of each boost test, seconds."); +torture_param(int, test_boost_interval, 7, + "Interval between boost tests, seconds."); +torture_param(bool, test_no_idle_hz, true, + "Test support for tickless idle CPUs"); +torture_param(bool, verbose, true, + "Enable verbose debugging printk()s"); -MODULE_ALIAS("rcutorture"); -#ifdef MODULE_PARAM_PREFIX -#undef MODULE_PARAM_PREFIX -#endif -#define MODULE_PARAM_PREFIX "rcutorture." - -static int fqs_duration; -module_param(fqs_duration, int, 0444); -MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us), 0 to disable"); -static int fqs_holdoff; -module_param(fqs_holdoff, int, 0444); -MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); -static int fqs_stutter = 3; -module_param(fqs_stutter, int, 0444); -MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); -static bool gp_exp; -module_param(gp_exp, bool, 0444); -MODULE_PARM_DESC(gp_exp, "Use expedited GP wait primitives"); -static bool gp_normal; -module_param(gp_normal, bool, 0444); -MODULE_PARM_DESC(gp_normal, "Use normal (non-expedited) GP wait primitives"); -static int irqreader = 1; -module_param(irqreader, int, 0444); -MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers"); -static int n_barrier_cbs; -module_param(n_barrier_cbs, int, 0444); -MODULE_PARM_DESC(n_barrier_cbs, "# of callbacks/kthreads for barrier testing"); -static int nfakewriters = 4; -module_param(nfakewriters, int, 0444); -MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads"); -static int nreaders = -1; -module_param(nreaders, int, 0444); -MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); -static int object_debug; -module_param(object_debug, int, 0444); -MODULE_PARM_DESC(object_debug, "Enable debug-object double call_rcu() testing"); -static int onoff_holdoff; -module_param(onoff_holdoff, int, 0444); -MODULE_PARM_DESC(onoff_holdoff, "Time after boot before CPU hotplugs (s)"); -static int onoff_interval; -module_param(onoff_interval, int, 0444); -MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable"); -static int shuffle_interval = 3; -module_param(shuffle_interval, int, 0444); -MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); -static int shutdown_secs; -module_param(shutdown_secs, int, 0444); -MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), <= zero to disable."); -static int stall_cpu; -module_param(stall_cpu, int, 0444); -MODULE_PARM_DESC(stall_cpu, "Stall duration (s), zero to disable."); -static int stall_cpu_holdoff = 10; -module_param(stall_cpu_holdoff, int, 0444); -MODULE_PARM_DESC(stall_cpu_holdoff, "Time to wait before starting stall (s)."); -static int stat_interval = 60; -module_param(stat_interval, int, 0644); -MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); -static int stutter = 5; -module_param(stutter, int, 0444); -MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test"); -static int test_boost = 1; -module_param(test_boost, int, 0444); -MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); -static int test_boost_duration = 4; -module_param(test_boost_duration, int, 0444); -MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds."); -static int test_boost_interval = 7; -module_param(test_boost_interval, int, 0444); -MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds."); -static bool test_no_idle_hz = true; -module_param(test_no_idle_hz, bool, 0444); -MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); static char *torture_type = "rcu"; module_param(torture_type, charp, 0444); MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, ...)"); -static bool verbose; -module_param(verbose, bool, 0444); -MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); - -#define TORTURE_FLAG "-torture:" -#define PRINTK_STRING(s) \ - do { pr_alert("%s" TORTURE_FLAG s "\n", torture_type); } while (0) -#define VERBOSE_PRINTK_STRING(s) \ - do { if (verbose) pr_alert("%s" TORTURE_FLAG s "\n", torture_type); } while (0) -#define VERBOSE_PRINTK_ERRSTRING(s) \ - do { if (verbose) pr_alert("%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0) static int nrealreaders; static struct task_struct *writer_task; static struct task_struct **fakewriter_tasks; static struct task_struct **reader_tasks; static struct task_struct *stats_task; -static struct task_struct *shuffler_task; -static struct task_struct *stutter_task; static struct task_struct *fqs_task; static struct task_struct *boost_tasks[NR_CPUS]; -static struct task_struct *shutdown_task; -#ifdef CONFIG_HOTPLUG_CPU -static struct task_struct *onoff_task; -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ static struct task_struct *stall_task; static struct task_struct **barrier_cbs_tasks; static struct task_struct *barrier_task; @@ -170,10 +120,10 @@ static struct rcu_torture __rcu *rcu_torture_current; static unsigned long rcu_torture_current_version; static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; static DEFINE_SPINLOCK(rcu_torture_lock); -static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = - { 0 }; -static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch) = - { 0 }; +static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], + rcu_torture_count) = { 0 }; +static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], + rcu_torture_batch) = { 0 }; static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1]; static atomic_t n_rcu_torture_alloc; static atomic_t n_rcu_torture_alloc_fail; @@ -186,22 +136,21 @@ static long n_rcu_torture_boost_rterror; static long n_rcu_torture_boost_failure; static long n_rcu_torture_boosts; static long n_rcu_torture_timers; -static long n_offline_attempts; -static long n_offline_successes; -static unsigned long sum_offline; -static int min_offline = -1; -static int max_offline; -static long n_online_attempts; -static long n_online_successes; -static unsigned long sum_online; -static int min_online = -1; -static int max_online; static long n_barrier_attempts; static long n_barrier_successes; static struct list_head rcu_torture_removed; -static cpumask_var_t shuffle_tmp_mask; -static int stutter_pause_test; +static int rcu_torture_writer_state; +#define RTWS_FIXED_DELAY 0 +#define RTWS_DELAY 1 +#define RTWS_REPLACE 2 +#define RTWS_DEF_FREE 3 +#define RTWS_EXP_SYNC 4 +#define RTWS_COND_GET 5 +#define RTWS_COND_SYNC 6 +#define RTWS_SYNC 7 +#define RTWS_STUTTER 8 +#define RTWS_STOPPING 9 #if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE) #define RCUTORTURE_RUNNABLE_INIT 1 @@ -232,7 +181,6 @@ static u64 notrace rcu_trace_clock_local(void) } #endif /* #else #ifdef CONFIG_RCU_TRACE */ -static unsigned long shutdown_time; /* jiffies to system shutdown. */ static unsigned long boost_starttime; /* jiffies of next boost test start. */ DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ /* and boost task create/destroy. */ @@ -242,51 +190,6 @@ static atomic_t barrier_cbs_invoked; /* Barrier callbacks invoked. */ static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */ static DECLARE_WAIT_QUEUE_HEAD(barrier_wq); -/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ - -#define FULLSTOP_DONTSTOP 0 /* Normal operation. */ -#define FULLSTOP_SHUTDOWN 1 /* System shutdown with rcutorture running. */ -#define FULLSTOP_RMMOD 2 /* Normal rmmod of rcutorture. */ -static int fullstop = FULLSTOP_RMMOD; -/* - * Protect fullstop transitions and spawning of kthreads. - */ -static DEFINE_MUTEX(fullstop_mutex); - -/* Forward reference. */ -static void rcu_torture_cleanup(void); - -/* - * Detect and respond to a system shutdown. - */ -static int -rcutorture_shutdown_notify(struct notifier_block *unused1, - unsigned long unused2, void *unused3) -{ - mutex_lock(&fullstop_mutex); - if (fullstop == FULLSTOP_DONTSTOP) - fullstop = FULLSTOP_SHUTDOWN; - else - pr_warn(/* but going down anyway, so... */ - "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); - mutex_unlock(&fullstop_mutex); - return NOTIFY_DONE; -} - -/* - * Absorb kthreads into a kernel function that won't return, so that - * they won't ever access module text or data again. - */ -static void rcutorture_shutdown_absorb(const char *title) -{ - if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { - pr_notice( - "rcutorture thread %s parking due to system shutdown\n", - title); - schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT); - } -} - /* * Allocate an element from the rcu_tortures pool. */ @@ -320,57 +223,22 @@ rcu_torture_free(struct rcu_torture *p) spin_unlock_bh(&rcu_torture_lock); } -struct rcu_random_state { - unsigned long rrs_state; - long rrs_count; -}; - -#define RCU_RANDOM_MULT 39916801 /* prime */ -#define RCU_RANDOM_ADD 479001701 /* prime */ -#define RCU_RANDOM_REFRESH 10000 - -#define DEFINE_RCU_RANDOM(name) struct rcu_random_state name = { 0, 0 } - -/* - * Crude but fast random-number generator. Uses a linear congruential - * generator, with occasional help from cpu_clock(). - */ -static unsigned long -rcu_random(struct rcu_random_state *rrsp) -{ - if (--rrsp->rrs_count < 0) { - rrsp->rrs_state += (unsigned long)local_clock(); - rrsp->rrs_count = RCU_RANDOM_REFRESH; - } - rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD; - return swahw32(rrsp->rrs_state); -} - -static void -rcu_stutter_wait(const char *title) -{ - while (stutter_pause_test || !rcutorture_runnable) { - if (rcutorture_runnable) - schedule_timeout_interruptible(1); - else - schedule_timeout_interruptible(round_jiffies_relative(HZ)); - rcutorture_shutdown_absorb(title); - } -} - /* * Operations vector for selecting different types of tests. */ struct rcu_torture_ops { + int ttype; void (*init)(void); int (*readlock)(void); - void (*read_delay)(struct rcu_random_state *rrsp); + void (*read_delay)(struct torture_random_state *rrsp); void (*readunlock)(int idx); int (*completed)(void); void (*deferred_free)(struct rcu_torture *p); void (*sync)(void); void (*exp_sync)(void); + unsigned long (*get_state)(void); + void (*cond_sync)(unsigned long oldstate); void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); void (*cb_barrier)(void); void (*fqs)(void); @@ -392,7 +260,7 @@ static int rcu_torture_read_lock(void) __acquires(RCU) return 0; } -static void rcu_read_delay(struct rcu_random_state *rrsp) +static void rcu_read_delay(struct torture_random_state *rrsp) { const unsigned long shortdelay_us = 200; const unsigned long longdelay_ms = 50; @@ -401,12 +269,13 @@ static void rcu_read_delay(struct rcu_random_state *rrsp) * period, and we want a long delay occasionally to trigger * force_quiescent_state. */ - if (!(rcu_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) + if (!(torture_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) mdelay(longdelay_ms); - if (!(rcu_random(rrsp) % (nrealreaders * 2 * shortdelay_us))) + if (!(torture_random(rrsp) % (nrealreaders * 2 * shortdelay_us))) udelay(shortdelay_us); #ifdef CONFIG_PREEMPT - if (!preempt_count() && !(rcu_random(rrsp) % (nrealreaders * 20000))) + if (!preempt_count() && + !(torture_random(rrsp) % (nrealreaders * 20000))) preempt_schedule(); /* No QS if preempt_disable() in effect */ #endif } @@ -421,27 +290,59 @@ static int rcu_torture_completed(void) return rcu_batches_completed(); } -static void -rcu_torture_cb(struct rcu_head *p) +/* + * Update callback in the pipe. This should be invoked after a grace period. + */ +static bool +rcu_torture_pipe_update_one(struct rcu_torture *rp) { int i; - struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu); - if (fullstop != FULLSTOP_DONTSTOP) { - /* Test is ending, just drop callbacks on the floor. */ - /* The next initialization will pick up the pieces. */ - return; - } i = rp->rtort_pipe_count; if (i > RCU_TORTURE_PIPE_LEN) i = RCU_TORTURE_PIPE_LEN; atomic_inc(&rcu_torture_wcount[i]); if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { rp->rtort_mbtest = 0; + return true; + } + return false; +} + +/* + * Update all callbacks in the pipe. Suitable for synchronous grace-period + * primitives. + */ +static void +rcu_torture_pipe_update(struct rcu_torture *old_rp) +{ + struct rcu_torture *rp; + struct rcu_torture *rp1; + + if (old_rp) + list_add(&old_rp->rtort_free, &rcu_torture_removed); + list_for_each_entry_safe(rp, rp1, &rcu_torture_removed, rtort_free) { + if (rcu_torture_pipe_update_one(rp)) { + list_del(&rp->rtort_free); + rcu_torture_free(rp); + } + } +} + +static void +rcu_torture_cb(struct rcu_head *p) +{ + struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu); + + if (torture_must_stop_irq()) { + /* Test is ending, just drop callbacks on the floor. */ + /* The next initialization will pick up the pieces. */ + return; + } + if (rcu_torture_pipe_update_one(rp)) rcu_torture_free(rp); - } else { + else cur_ops->deferred_free(rp); - } } static int rcu_no_completed(void) @@ -460,6 +361,7 @@ static void rcu_sync_torture_init(void) } static struct rcu_torture_ops rcu_ops = { + .ttype = RCU_FLAVOR, .init = rcu_sync_torture_init, .readlock = rcu_torture_read_lock, .read_delay = rcu_read_delay, @@ -468,6 +370,8 @@ static struct rcu_torture_ops rcu_ops = { .deferred_free = rcu_torture_deferred_free, .sync = synchronize_rcu, .exp_sync = synchronize_rcu_expedited, + .get_state = get_state_synchronize_rcu, + .cond_sync = cond_synchronize_rcu, .call = call_rcu, .cb_barrier = rcu_barrier, .fqs = rcu_force_quiescent_state, @@ -503,6 +407,7 @@ static void rcu_bh_torture_deferred_free(struct rcu_torture *p) } static struct rcu_torture_ops rcu_bh_ops = { + .ttype = RCU_BH_FLAVOR, .init = rcu_sync_torture_init, .readlock = rcu_bh_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ @@ -520,6 +425,49 @@ static struct rcu_torture_ops rcu_bh_ops = { }; /* + * Don't even think about trying any of these in real life!!! + * The names includes "busted", and they really means it! + * The only purpose of these functions is to provide a buggy RCU + * implementation to make sure that rcutorture correctly emits + * buggy-RCU error messages. + */ +static void rcu_busted_torture_deferred_free(struct rcu_torture *p) +{ + /* This is a deliberate bug for testing purposes only! */ + rcu_torture_cb(&p->rtort_rcu); +} + +static void synchronize_rcu_busted(void) +{ + /* This is a deliberate bug for testing purposes only! */ +} + +static void +call_rcu_busted(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +{ + /* This is a deliberate bug for testing purposes only! */ + func(head); +} + +static struct rcu_torture_ops rcu_busted_ops = { + .ttype = INVALID_RCU_FLAVOR, + .init = rcu_sync_torture_init, + .readlock = rcu_torture_read_lock, + .read_delay = rcu_read_delay, /* just reuse rcu's version. */ + .readunlock = rcu_torture_read_unlock, + .completed = rcu_no_completed, + .deferred_free = rcu_busted_torture_deferred_free, + .sync = synchronize_rcu_busted, + .exp_sync = synchronize_rcu_busted, + .call = call_rcu_busted, + .cb_barrier = NULL, + .fqs = NULL, + .stats = NULL, + .irq_capable = 1, + .name = "rcu_busted" +}; + +/* * Definitions for srcu torture testing. */ @@ -530,7 +478,7 @@ static int srcu_torture_read_lock(void) __acquires(&srcu_ctl) return srcu_read_lock(&srcu_ctl); } -static void srcu_read_delay(struct rcu_random_state *rrsp) +static void srcu_read_delay(struct torture_random_state *rrsp) { long delay; const long uspertick = 1000000 / HZ; @@ -538,7 +486,8 @@ static void srcu_read_delay(struct rcu_random_state *rrsp) /* We want there to be long-running readers, but not all the time. */ - delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay * uspertick); + delay = torture_random(rrsp) % + (nrealreaders * 2 * longdelay * uspertick); if (!delay) schedule_timeout_interruptible(longdelay); else @@ -584,9 +533,11 @@ static void srcu_torture_stats(char *page) page += sprintf(page, "%s%s per-CPU(idx=%d):", torture_type, TORTURE_FLAG, idx); for_each_possible_cpu(cpu) { - page += sprintf(page, " %d(%lu,%lu)", cpu, - per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx], - per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]); + long c0, c1; + + c0 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx]; + c1 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]; + page += sprintf(page, " %d(%ld,%ld)", cpu, c0, c1); } sprintf(page, "\n"); } @@ -597,6 +548,7 @@ static void srcu_torture_synchronize_expedited(void) } static struct rcu_torture_ops srcu_ops = { + .ttype = SRCU_FLAVOR, .init = rcu_sync_torture_init, .readlock = srcu_torture_read_lock, .read_delay = srcu_read_delay, @@ -632,6 +584,7 @@ static void rcu_sched_torture_deferred_free(struct rcu_torture *p) } static struct rcu_torture_ops sched_ops = { + .ttype = RCU_SCHED_FLAVOR, .init = rcu_sync_torture_init, .readlock = sched_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ @@ -677,12 +630,12 @@ static int rcu_torture_boost(void *arg) struct rcu_boost_inflight rbi = { .inflight = 0 }; struct sched_param sp; - VERBOSE_PRINTK_STRING("rcu_torture_boost started"); + VERBOSE_TOROUT_STRING("rcu_torture_boost started"); /* Set real-time priority. */ sp.sched_priority = 1; if (sched_setscheduler(current, SCHED_FIFO, &sp) < 0) { - VERBOSE_PRINTK_STRING("rcu_torture_boost RT prio failed!"); + VERBOSE_TOROUT_STRING("rcu_torture_boost RT prio failed!"); n_rcu_torture_boost_rterror++; } @@ -693,9 +646,8 @@ static int rcu_torture_boost(void *arg) oldstarttime = boost_starttime; while (ULONG_CMP_LT(jiffies, oldstarttime)) { schedule_timeout_interruptible(oldstarttime - jiffies); - rcu_stutter_wait("rcu_torture_boost"); - if (kthread_should_stop() || - fullstop != FULLSTOP_DONTSTOP) + stutter_wait("rcu_torture_boost"); + if (torture_must_stop()) goto checkwait; } @@ -710,15 +662,14 @@ static int rcu_torture_boost(void *arg) call_rcu(&rbi.rcu, rcu_torture_boost_cb); if (jiffies - call_rcu_time > test_boost_duration * HZ - HZ / 2) { - VERBOSE_PRINTK_STRING("rcu_torture_boost boosting failed"); + VERBOSE_TOROUT_STRING("rcu_torture_boost boosting failed"); n_rcu_torture_boost_failure++; } call_rcu_time = jiffies; } cond_resched(); - rcu_stutter_wait("rcu_torture_boost"); - if (kthread_should_stop() || - fullstop != FULLSTOP_DONTSTOP) + stutter_wait("rcu_torture_boost"); + if (torture_must_stop()) goto checkwait; } @@ -742,16 +693,17 @@ static int rcu_torture_boost(void *arg) } /* Go do the stutter. */ -checkwait: rcu_stutter_wait("rcu_torture_boost"); - } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); +checkwait: stutter_wait("rcu_torture_boost"); + } while (!torture_must_stop()); /* Clean up and exit. */ - VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping"); - rcutorture_shutdown_absorb("rcu_torture_boost"); - while (!kthread_should_stop() || rbi.inflight) + while (!kthread_should_stop() || rbi.inflight) { + torture_shutdown_absorb("rcu_torture_boost"); schedule_timeout_uninterruptible(1); + } smp_mb(); /* order accesses to ->inflight before stack-frame death. */ destroy_rcu_head_on_stack(&rbi.rcu); + torture_kthread_stopping("rcu_torture_boost"); return 0; } @@ -766,7 +718,7 @@ rcu_torture_fqs(void *arg) unsigned long fqs_resume_time; int fqs_burst_remaining; - VERBOSE_PRINTK_STRING("rcu_torture_fqs task started"); + VERBOSE_TOROUT_STRING("rcu_torture_fqs task started"); do { fqs_resume_time = jiffies + fqs_stutter * HZ; while (ULONG_CMP_LT(jiffies, fqs_resume_time) && @@ -780,12 +732,9 @@ rcu_torture_fqs(void *arg) udelay(fqs_holdoff); fqs_burst_remaining -= fqs_holdoff; } - rcu_stutter_wait("rcu_torture_fqs"); - } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); - VERBOSE_PRINTK_STRING("rcu_torture_fqs task stopping"); - rcutorture_shutdown_absorb("rcu_torture_fqs"); - while (!kthread_should_stop()) - schedule_timeout_uninterruptible(1); + stutter_wait("rcu_torture_fqs"); + } while (!torture_must_stop()); + torture_kthread_stopping("rcu_torture_fqs"); return 0; } @@ -797,23 +746,59 @@ rcu_torture_fqs(void *arg) static int rcu_torture_writer(void *arg) { - bool exp; + unsigned long gp_snap; + bool gp_cond1 = gp_cond, gp_exp1 = gp_exp, gp_normal1 = gp_normal; + bool gp_sync1 = gp_sync; int i; struct rcu_torture *rp; - struct rcu_torture *rp1; struct rcu_torture *old_rp; - static DEFINE_RCU_RANDOM(rand); - - VERBOSE_PRINTK_STRING("rcu_torture_writer task started"); - set_user_nice(current, 19); + static DEFINE_TORTURE_RANDOM(rand); + int synctype[] = { RTWS_DEF_FREE, RTWS_EXP_SYNC, + RTWS_COND_GET, RTWS_SYNC }; + int nsynctypes = 0; + + VERBOSE_TOROUT_STRING("rcu_torture_writer task started"); + + /* Initialize synctype[] array. If none set, take default. */ + if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync) + gp_cond1 = gp_exp1 = gp_normal1 = gp_sync1 = true; + if (gp_cond1 && cur_ops->get_state && cur_ops->cond_sync) + synctype[nsynctypes++] = RTWS_COND_GET; + else if (gp_cond && (!cur_ops->get_state || !cur_ops->cond_sync)) + pr_alert("rcu_torture_writer: gp_cond without primitives.\n"); + if (gp_exp1 && cur_ops->exp_sync) + synctype[nsynctypes++] = RTWS_EXP_SYNC; + else if (gp_exp && !cur_ops->exp_sync) + pr_alert("rcu_torture_writer: gp_exp without primitives.\n"); + if (gp_normal1 && cur_ops->deferred_free) + synctype[nsynctypes++] = RTWS_DEF_FREE; + else if (gp_normal && !cur_ops->deferred_free) + pr_alert("rcu_torture_writer: gp_normal without primitives.\n"); + if (gp_sync1 && cur_ops->sync) + synctype[nsynctypes++] = RTWS_SYNC; + else if (gp_sync && !cur_ops->sync) + pr_alert("rcu_torture_writer: gp_sync without primitives.\n"); + if (WARN_ONCE(nsynctypes == 0, + "rcu_torture_writer: No update-side primitives.\n")) { + /* + * No updates primitives, so don't try updating. + * The resulting test won't be testing much, hence the + * above WARN_ONCE(). + */ + rcu_torture_writer_state = RTWS_STOPPING; + torture_kthread_stopping("rcu_torture_writer"); + } do { + rcu_torture_writer_state = RTWS_FIXED_DELAY; schedule_timeout_uninterruptible(1); rp = rcu_torture_alloc(); if (rp == NULL) continue; rp->rtort_pipe_count = 0; - udelay(rcu_random(&rand) & 0x3ff); + rcu_torture_writer_state = RTWS_DELAY; + udelay(torture_random(&rand) & 0x3ff); + rcu_torture_writer_state = RTWS_REPLACE; old_rp = rcu_dereference_check(rcu_torture_current, current == writer_task); rp->rtort_mbtest = 1; @@ -825,39 +810,43 @@ rcu_torture_writer(void *arg) i = RCU_TORTURE_PIPE_LEN; atomic_inc(&rcu_torture_wcount[i]); old_rp->rtort_pipe_count++; - if (gp_normal == gp_exp) - exp = !!(rcu_random(&rand) & 0x80); - else - exp = gp_exp; - if (!exp) { + switch (synctype[torture_random(&rand) % nsynctypes]) { + case RTWS_DEF_FREE: + rcu_torture_writer_state = RTWS_DEF_FREE; cur_ops->deferred_free(old_rp); - } else { + break; + case RTWS_EXP_SYNC: + rcu_torture_writer_state = RTWS_EXP_SYNC; cur_ops->exp_sync(); - list_add(&old_rp->rtort_free, - &rcu_torture_removed); - list_for_each_entry_safe(rp, rp1, - &rcu_torture_removed, - rtort_free) { - i = rp->rtort_pipe_count; - if (i > RCU_TORTURE_PIPE_LEN) - i = RCU_TORTURE_PIPE_LEN; - atomic_inc(&rcu_torture_wcount[i]); - if (++rp->rtort_pipe_count >= - RCU_TORTURE_PIPE_LEN) { - rp->rtort_mbtest = 0; - list_del(&rp->rtort_free); - rcu_torture_free(rp); - } - } + rcu_torture_pipe_update(old_rp); + break; + case RTWS_COND_GET: + rcu_torture_writer_state = RTWS_COND_GET; + gp_snap = cur_ops->get_state(); + i = torture_random(&rand) % 16; + if (i != 0) + schedule_timeout_interruptible(i); + udelay(torture_random(&rand) % 1000); + rcu_torture_writer_state = RTWS_COND_SYNC; + cur_ops->cond_sync(gp_snap); + rcu_torture_pipe_update(old_rp); + break; + case RTWS_SYNC: + rcu_torture_writer_state = RTWS_SYNC; + cur_ops->sync(); + rcu_torture_pipe_update(old_rp); + break; + default: + WARN_ON_ONCE(1); + break; } } rcutorture_record_progress(++rcu_torture_current_version); - rcu_stutter_wait("rcu_torture_writer"); - } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); - VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); - rcutorture_shutdown_absorb("rcu_torture_writer"); - while (!kthread_should_stop()) - schedule_timeout_uninterruptible(1); + rcu_torture_writer_state = RTWS_STUTTER; + stutter_wait("rcu_torture_writer"); + } while (!torture_must_stop()); + rcu_torture_writer_state = RTWS_STOPPING; + torture_kthread_stopping("rcu_torture_writer"); return 0; } @@ -868,19 +857,19 @@ rcu_torture_writer(void *arg) static int rcu_torture_fakewriter(void *arg) { - DEFINE_RCU_RANDOM(rand); + DEFINE_TORTURE_RANDOM(rand); - VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task started"); - set_user_nice(current, 19); + VERBOSE_TOROUT_STRING("rcu_torture_fakewriter task started"); + set_user_nice(current, MAX_NICE); do { - schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); - udelay(rcu_random(&rand) & 0x3ff); + schedule_timeout_uninterruptible(1 + torture_random(&rand)%10); + udelay(torture_random(&rand) & 0x3ff); if (cur_ops->cb_barrier != NULL && - rcu_random(&rand) % (nfakewriters * 8) == 0) { + torture_random(&rand) % (nfakewriters * 8) == 0) { cur_ops->cb_barrier(); } else if (gp_normal == gp_exp) { - if (rcu_random(&rand) & 0x80) + if (torture_random(&rand) & 0x80) cur_ops->sync(); else cur_ops->exp_sync(); @@ -889,17 +878,14 @@ rcu_torture_fakewriter(void *arg) } else { cur_ops->exp_sync(); } - rcu_stutter_wait("rcu_torture_fakewriter"); - } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); + stutter_wait("rcu_torture_fakewriter"); + } while (!torture_must_stop()); - VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping"); - rcutorture_shutdown_absorb("rcu_torture_fakewriter"); - while (!kthread_should_stop()) - schedule_timeout_uninterruptible(1); + torture_kthread_stopping("rcu_torture_fakewriter"); return 0; } -void rcutorture_trace_dump(void) +static void rcutorture_trace_dump(void) { static atomic_t beenhere = ATOMIC_INIT(0); @@ -921,7 +907,7 @@ static void rcu_torture_timer(unsigned long unused) int idx; int completed; int completed_end; - static DEFINE_RCU_RANDOM(rand); + static DEFINE_TORTURE_RANDOM(rand); static DEFINE_SPINLOCK(rand_lock); struct rcu_torture *p; int pipe_count; @@ -980,14 +966,14 @@ rcu_torture_reader(void *arg) int completed; int completed_end; int idx; - DEFINE_RCU_RANDOM(rand); + DEFINE_TORTURE_RANDOM(rand); struct rcu_torture *p; int pipe_count; struct timer_list t; unsigned long long ts; - VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); - set_user_nice(current, 19); + VERBOSE_TOROUT_STRING("rcu_torture_reader task started"); + set_user_nice(current, MAX_NICE); if (irqreader && cur_ops->irq_capable) setup_timer_on_stack(&t, rcu_torture_timer, 0); @@ -1033,15 +1019,14 @@ rcu_torture_reader(void *arg) __this_cpu_inc(rcu_torture_batch[completed]); preempt_enable(); cur_ops->readunlock(idx); - schedule(); - rcu_stutter_wait("rcu_torture_reader"); - } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); - VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); - rcutorture_shutdown_absorb("rcu_torture_reader"); - if (irqreader && cur_ops->irq_capable) + cond_resched(); + stutter_wait("rcu_torture_reader"); + } while (!torture_must_stop()); + if (irqreader && cur_ops->irq_capable) { del_timer_sync(&t); - while (!kthread_should_stop()) - schedule_timeout_uninterruptible(1); + destroy_timer_on_stack(&t); + } + torture_kthread_stopping("rcu_torture_reader"); return 0; } @@ -1055,6 +1040,7 @@ rcu_torture_printk(char *page) int i; long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; + static unsigned long rtcv_snap = ULONG_MAX; for_each_possible_cpu(cpu) { for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { @@ -1083,13 +1069,7 @@ rcu_torture_printk(char *page) n_rcu_torture_boost_failure, n_rcu_torture_boosts, n_rcu_torture_timers); - page += sprintf(page, - "onoff: %ld/%ld:%ld/%ld %d,%d:%d,%d %lu:%lu (HZ=%d) ", - n_online_successes, n_online_attempts, - n_offline_successes, n_offline_attempts, - min_online, max_online, - min_offline, max_offline, - sum_online, sum_offline, HZ); + page = torture_onoff_stats(page); page += sprintf(page, "barrier: %ld/%ld:%ld", n_barrier_successes, n_barrier_attempts, @@ -1121,6 +1101,22 @@ rcu_torture_printk(char *page) page += sprintf(page, "\n"); if (cur_ops->stats) cur_ops->stats(page); + if (rtcv_snap == rcu_torture_current_version && + rcu_torture_current != NULL) { + int __maybe_unused flags; + unsigned long __maybe_unused gpnum; + unsigned long __maybe_unused completed; + + rcutorture_get_gp_data(cur_ops->ttype, + &flags, &gpnum, &completed); + page += sprintf(page, + "??? Writer stall state %d g%lu c%lu f%#x\n", + rcu_torture_writer_state, + gpnum, completed, flags); + show_rcu_gp_kthreads(); + rcutorture_trace_dump(); + } + rtcv_snap = rcu_torture_current_version; } /* @@ -1150,123 +1146,17 @@ rcu_torture_stats_print(void) /* * Periodically prints torture statistics, if periodic statistics printing * was specified via the stat_interval module parameter. - * - * No need to worry about fullstop here, since this one doesn't reference - * volatile state or register callbacks. */ static int rcu_torture_stats(void *arg) { - VERBOSE_PRINTK_STRING("rcu_torture_stats task started"); + VERBOSE_TOROUT_STRING("rcu_torture_stats task started"); do { schedule_timeout_interruptible(stat_interval * HZ); rcu_torture_stats_print(); - rcutorture_shutdown_absorb("rcu_torture_stats"); - } while (!kthread_should_stop()); - VERBOSE_PRINTK_STRING("rcu_torture_stats task stopping"); - return 0; -} - -static int rcu_idle_cpu; /* Force all torture tasks off this CPU */ - -/* Shuffle tasks such that we allow @rcu_idle_cpu to become idle. A special case - * is when @rcu_idle_cpu = -1, when we allow the tasks to run on all CPUs. - */ -static void rcu_torture_shuffle_tasks(void) -{ - int i; - - cpumask_setall(shuffle_tmp_mask); - get_online_cpus(); - - /* No point in shuffling if there is only one online CPU (ex: UP) */ - if (num_online_cpus() == 1) { - put_online_cpus(); - return; - } - - if (rcu_idle_cpu != -1) - cpumask_clear_cpu(rcu_idle_cpu, shuffle_tmp_mask); - - set_cpus_allowed_ptr(current, shuffle_tmp_mask); - - if (reader_tasks) { - for (i = 0; i < nrealreaders; i++) - if (reader_tasks[i]) - set_cpus_allowed_ptr(reader_tasks[i], - shuffle_tmp_mask); - } - if (fakewriter_tasks) { - for (i = 0; i < nfakewriters; i++) - if (fakewriter_tasks[i]) - set_cpus_allowed_ptr(fakewriter_tasks[i], - shuffle_tmp_mask); - } - if (writer_task) - set_cpus_allowed_ptr(writer_task, shuffle_tmp_mask); - if (stats_task) - set_cpus_allowed_ptr(stats_task, shuffle_tmp_mask); - if (stutter_task) - set_cpus_allowed_ptr(stutter_task, shuffle_tmp_mask); - if (fqs_task) - set_cpus_allowed_ptr(fqs_task, shuffle_tmp_mask); - if (shutdown_task) - set_cpus_allowed_ptr(shutdown_task, shuffle_tmp_mask); -#ifdef CONFIG_HOTPLUG_CPU - if (onoff_task) - set_cpus_allowed_ptr(onoff_task, shuffle_tmp_mask); -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ - if (stall_task) - set_cpus_allowed_ptr(stall_task, shuffle_tmp_mask); - if (barrier_cbs_tasks) - for (i = 0; i < n_barrier_cbs; i++) - if (barrier_cbs_tasks[i]) - set_cpus_allowed_ptr(barrier_cbs_tasks[i], - shuffle_tmp_mask); - if (barrier_task) - set_cpus_allowed_ptr(barrier_task, shuffle_tmp_mask); - - if (rcu_idle_cpu == -1) - rcu_idle_cpu = num_online_cpus() - 1; - else - rcu_idle_cpu--; - - put_online_cpus(); -} - -/* Shuffle tasks across CPUs, with the intent of allowing each CPU in the - * system to become idle at a time and cut off its timer ticks. This is meant - * to test the support for such tickless idle CPU in RCU. - */ -static int -rcu_torture_shuffle(void *arg) -{ - VERBOSE_PRINTK_STRING("rcu_torture_shuffle task started"); - do { - schedule_timeout_interruptible(shuffle_interval * HZ); - rcu_torture_shuffle_tasks(); - rcutorture_shutdown_absorb("rcu_torture_shuffle"); - } while (!kthread_should_stop()); - VERBOSE_PRINTK_STRING("rcu_torture_shuffle task stopping"); - return 0; -} - -/* Cause the rcutorture test to "stutter", starting and stopping all - * threads periodically. - */ -static int -rcu_torture_stutter(void *arg) -{ - VERBOSE_PRINTK_STRING("rcu_torture_stutter task started"); - do { - schedule_timeout_interruptible(stutter * HZ); - stutter_pause_test = 1; - if (!kthread_should_stop()) - schedule_timeout_interruptible(stutter * HZ); - stutter_pause_test = 0; - rcutorture_shutdown_absorb("rcu_torture_stutter"); - } while (!kthread_should_stop()); - VERBOSE_PRINTK_STRING("rcu_torture_stutter task stopping"); + torture_shutdown_absorb("rcu_torture_stats"); + } while (!torture_must_stop()); + torture_kthread_stopping("rcu_torture_stats"); return 0; } @@ -1293,10 +1183,6 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) onoff_interval, onoff_holdoff); } -static struct notifier_block rcutorture_shutdown_nb = { - .notifier_call = rcutorture_shutdown_notify, -}; - static void rcutorture_booster_cleanup(int cpu) { struct task_struct *t; @@ -1304,14 +1190,12 @@ static void rcutorture_booster_cleanup(int cpu) if (boost_tasks[cpu] == NULL) return; mutex_lock(&boost_mutex); - VERBOSE_PRINTK_STRING("Stopping rcu_torture_boost task"); t = boost_tasks[cpu]; boost_tasks[cpu] = NULL; mutex_unlock(&boost_mutex); /* This must be outside of the mutex, otherwise deadlock! */ - kthread_stop(t); - boost_tasks[cpu] = NULL; + torture_stop_kthread(rcu_torture_boost, t); } static int rcutorture_booster_init(int cpu) @@ -1323,13 +1207,13 @@ static int rcutorture_booster_init(int cpu) /* Don't allow time recalculation while creating a new task. */ mutex_lock(&boost_mutex); - VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task"); + VERBOSE_TOROUT_STRING("Creating rcu_torture_boost task"); boost_tasks[cpu] = kthread_create_on_node(rcu_torture_boost, NULL, cpu_to_node(cpu), "rcu_torture_boost"); if (IS_ERR(boost_tasks[cpu])) { retval = PTR_ERR(boost_tasks[cpu]); - VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed"); + VERBOSE_TOROUT_STRING("rcu_torture_boost task create failed"); n_rcu_torture_boost_ktrerror++; boost_tasks[cpu] = NULL; mutex_unlock(&boost_mutex); @@ -1342,175 +1226,6 @@ static int rcutorture_booster_init(int cpu) } /* - * Cause the rcutorture test to shutdown the system after the test has - * run for the time specified by the shutdown_secs module parameter. - */ -static int -rcu_torture_shutdown(void *arg) -{ - long delta; - unsigned long jiffies_snap; - - VERBOSE_PRINTK_STRING("rcu_torture_shutdown task started"); - jiffies_snap = ACCESS_ONCE(jiffies); - while (ULONG_CMP_LT(jiffies_snap, shutdown_time) && - !kthread_should_stop()) { - delta = shutdown_time - jiffies_snap; - if (verbose) - pr_alert("%s" TORTURE_FLAG - "rcu_torture_shutdown task: %lu jiffies remaining\n", - torture_type, delta); - schedule_timeout_interruptible(delta); - jiffies_snap = ACCESS_ONCE(jiffies); - } - if (kthread_should_stop()) { - VERBOSE_PRINTK_STRING("rcu_torture_shutdown task stopping"); - return 0; - } - - /* OK, shut down the system. */ - - VERBOSE_PRINTK_STRING("rcu_torture_shutdown task shutting down system"); - shutdown_task = NULL; /* Avoid self-kill deadlock. */ - rcu_torture_cleanup(); /* Get the success/failure message. */ - kernel_power_off(); /* Shut down the system. */ - return 0; -} - -#ifdef CONFIG_HOTPLUG_CPU - -/* - * Execute random CPU-hotplug operations at the interval specified - * by the onoff_interval. - */ -static int -rcu_torture_onoff(void *arg) -{ - int cpu; - unsigned long delta; - int maxcpu = -1; - DEFINE_RCU_RANDOM(rand); - int ret; - unsigned long starttime; - - VERBOSE_PRINTK_STRING("rcu_torture_onoff task started"); - for_each_online_cpu(cpu) - maxcpu = cpu; - WARN_ON(maxcpu < 0); - if (onoff_holdoff > 0) { - VERBOSE_PRINTK_STRING("rcu_torture_onoff begin holdoff"); - schedule_timeout_interruptible(onoff_holdoff * HZ); - VERBOSE_PRINTK_STRING("rcu_torture_onoff end holdoff"); - } - while (!kthread_should_stop()) { - cpu = (rcu_random(&rand) >> 4) % (maxcpu + 1); - if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "rcu_torture_onoff task: offlining %d\n", - torture_type, cpu); - starttime = jiffies; - n_offline_attempts++; - ret = cpu_down(cpu); - if (ret) { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "rcu_torture_onoff task: offline %d failed: errno %d\n", - torture_type, cpu, ret); - } else { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "rcu_torture_onoff task: offlined %d\n", - torture_type, cpu); - n_offline_successes++; - delta = jiffies - starttime; - sum_offline += delta; - if (min_offline < 0) { - min_offline = delta; - max_offline = delta; - } - if (min_offline > delta) - min_offline = delta; - if (max_offline < delta) - max_offline = delta; - } - } else if (cpu_is_hotpluggable(cpu)) { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "rcu_torture_onoff task: onlining %d\n", - torture_type, cpu); - starttime = jiffies; - n_online_attempts++; - ret = cpu_up(cpu); - if (ret) { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "rcu_torture_onoff task: online %d failed: errno %d\n", - torture_type, cpu, ret); - } else { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "rcu_torture_onoff task: onlined %d\n", - torture_type, cpu); - n_online_successes++; - delta = jiffies - starttime; - sum_online += delta; - if (min_online < 0) { - min_online = delta; - max_online = delta; - } - if (min_online > delta) - min_online = delta; - if (max_online < delta) - max_online = delta; - } - } - schedule_timeout_interruptible(onoff_interval * HZ); - } - VERBOSE_PRINTK_STRING("rcu_torture_onoff task stopping"); - return 0; -} - -static int -rcu_torture_onoff_init(void) -{ - int ret; - - if (onoff_interval <= 0) - return 0; - onoff_task = kthread_run(rcu_torture_onoff, NULL, "rcu_torture_onoff"); - if (IS_ERR(onoff_task)) { - ret = PTR_ERR(onoff_task); - onoff_task = NULL; - return ret; - } - return 0; -} - -static void rcu_torture_onoff_cleanup(void) -{ - if (onoff_task == NULL) - return; - VERBOSE_PRINTK_STRING("Stopping rcu_torture_onoff task"); - kthread_stop(onoff_task); - onoff_task = NULL; -} - -#else /* #ifdef CONFIG_HOTPLUG_CPU */ - -static int -rcu_torture_onoff_init(void) -{ - return 0; -} - -static void rcu_torture_onoff_cleanup(void) -{ -} - -#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */ - -/* * CPU-stall kthread. It waits as specified by stall_cpu_holdoff, then * induces a CPU stall for the time specified by stall_cpu. */ @@ -1518,11 +1233,11 @@ static int rcu_torture_stall(void *args) { unsigned long stop_at; - VERBOSE_PRINTK_STRING("rcu_torture_stall task started"); + VERBOSE_TOROUT_STRING("rcu_torture_stall task started"); if (stall_cpu_holdoff > 0) { - VERBOSE_PRINTK_STRING("rcu_torture_stall begin holdoff"); + VERBOSE_TOROUT_STRING("rcu_torture_stall begin holdoff"); schedule_timeout_interruptible(stall_cpu_holdoff * HZ); - VERBOSE_PRINTK_STRING("rcu_torture_stall end holdoff"); + VERBOSE_TOROUT_STRING("rcu_torture_stall end holdoff"); } if (!kthread_should_stop()) { stop_at = get_seconds() + stall_cpu; @@ -1536,7 +1251,7 @@ static int rcu_torture_stall(void *args) rcu_read_unlock(); pr_alert("rcu_torture_stall end.\n"); } - rcutorture_shutdown_absorb("rcu_torture_stall"); + torture_shutdown_absorb("rcu_torture_stall"); while (!kthread_should_stop()) schedule_timeout_interruptible(10 * HZ); return 0; @@ -1545,31 +1260,13 @@ static int rcu_torture_stall(void *args) /* Spawn CPU-stall kthread, if stall_cpu specified. */ static int __init rcu_torture_stall_init(void) { - int ret; - if (stall_cpu <= 0) return 0; - stall_task = kthread_run(rcu_torture_stall, NULL, "rcu_torture_stall"); - if (IS_ERR(stall_task)) { - ret = PTR_ERR(stall_task); - stall_task = NULL; - return ret; - } - return 0; -} - -/* Clean up after the CPU-stall kthread, if one was spawned. */ -static void rcu_torture_stall_cleanup(void) -{ - if (stall_task == NULL) - return; - VERBOSE_PRINTK_STRING("Stopping rcu_torture_stall_task."); - kthread_stop(stall_task); - stall_task = NULL; + return torture_create_kthread(rcu_torture_stall, NULL, stall_task); } /* Callback function for RCU barrier testing. */ -void rcu_torture_barrier_cbf(struct rcu_head *rcu) +static void rcu_torture_barrier_cbf(struct rcu_head *rcu) { atomic_inc(&barrier_cbs_invoked); } @@ -1583,28 +1280,24 @@ static int rcu_torture_barrier_cbs(void *arg) struct rcu_head rcu; init_rcu_head_on_stack(&rcu); - VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task started"); - set_user_nice(current, 19); + VERBOSE_TOROUT_STRING("rcu_torture_barrier_cbs task started"); + set_user_nice(current, MAX_NICE); do { wait_event(barrier_cbs_wq[myid], (newphase = ACCESS_ONCE(barrier_phase)) != lastphase || - kthread_should_stop() || - fullstop != FULLSTOP_DONTSTOP); + torture_must_stop()); lastphase = newphase; smp_mb(); /* ensure barrier_phase load before ->call(). */ - if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) + if (torture_must_stop()) break; cur_ops->call(&rcu, rcu_torture_barrier_cbf); if (atomic_dec_and_test(&barrier_cbs_count)) wake_up(&barrier_wq); - } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); - VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task stopping"); - rcutorture_shutdown_absorb("rcu_torture_barrier_cbs"); - while (!kthread_should_stop()) - schedule_timeout_interruptible(1); + } while (!torture_must_stop()); cur_ops->cb_barrier(); destroy_rcu_head_on_stack(&rcu); + torture_kthread_stopping("rcu_torture_barrier_cbs"); return 0; } @@ -1613,7 +1306,7 @@ static int rcu_torture_barrier(void *arg) { int i; - VERBOSE_PRINTK_STRING("rcu_torture_barrier task starting"); + VERBOSE_TOROUT_STRING("rcu_torture_barrier task starting"); do { atomic_set(&barrier_cbs_invoked, 0); atomic_set(&barrier_cbs_count, n_barrier_cbs); @@ -1623,9 +1316,8 @@ static int rcu_torture_barrier(void *arg) wake_up(&barrier_cbs_wq[i]); wait_event(barrier_wq, atomic_read(&barrier_cbs_count) == 0 || - kthread_should_stop() || - fullstop != FULLSTOP_DONTSTOP); - if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) + torture_must_stop()); + if (torture_must_stop()) break; n_barrier_attempts++; cur_ops->cb_barrier(); /* Implies smp_mb() for wait_event(). */ @@ -1635,11 +1327,8 @@ static int rcu_torture_barrier(void *arg) } n_barrier_successes++; schedule_timeout_interruptible(HZ / 10); - } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); - VERBOSE_PRINTK_STRING("rcu_torture_barrier task stopping"); - rcutorture_shutdown_absorb("rcu_torture_barrier"); - while (!kthread_should_stop()) - schedule_timeout_interruptible(1); + } while (!torture_must_stop()); + torture_kthread_stopping("rcu_torture_barrier"); return 0; } @@ -1672,24 +1361,13 @@ static int rcu_torture_barrier_init(void) return -ENOMEM; for (i = 0; i < n_barrier_cbs; i++) { init_waitqueue_head(&barrier_cbs_wq[i]); - barrier_cbs_tasks[i] = kthread_run(rcu_torture_barrier_cbs, - (void *)(long)i, - "rcu_torture_barrier_cbs"); - if (IS_ERR(barrier_cbs_tasks[i])) { - ret = PTR_ERR(barrier_cbs_tasks[i]); - VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier_cbs"); - barrier_cbs_tasks[i] = NULL; + ret = torture_create_kthread(rcu_torture_barrier_cbs, + (void *)(long)i, + barrier_cbs_tasks[i]); + if (ret) return ret; - } } - barrier_task = kthread_run(rcu_torture_barrier, NULL, - "rcu_torture_barrier"); - if (IS_ERR(barrier_task)) { - ret = PTR_ERR(barrier_task); - VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier"); - barrier_task = NULL; - } - return 0; + return torture_create_kthread(rcu_torture_barrier, NULL, barrier_task); } /* Clean up after RCU barrier testing. */ @@ -1697,19 +1375,11 @@ static void rcu_torture_barrier_cleanup(void) { int i; - if (barrier_task != NULL) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier task"); - kthread_stop(barrier_task); - barrier_task = NULL; - } + torture_stop_kthread(rcu_torture_barrier, barrier_task); if (barrier_cbs_tasks != NULL) { - for (i = 0; i < n_barrier_cbs; i++) { - if (barrier_cbs_tasks[i] != NULL) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier_cbs task"); - kthread_stop(barrier_cbs_tasks[i]); - barrier_cbs_tasks[i] = NULL; - } - } + for (i = 0; i < n_barrier_cbs; i++) + torture_stop_kthread(rcu_torture_barrier_cbs, + barrier_cbs_tasks[i]); kfree(barrier_cbs_tasks); barrier_cbs_tasks = NULL; } @@ -1747,90 +1417,42 @@ rcu_torture_cleanup(void) { int i; - mutex_lock(&fullstop_mutex); rcutorture_record_test_transition(); - if (fullstop == FULLSTOP_SHUTDOWN) { - pr_warn(/* but going down anyway, so... */ - "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); - mutex_unlock(&fullstop_mutex); - schedule_timeout_uninterruptible(10); + if (torture_cleanup()) { if (cur_ops->cb_barrier != NULL) cur_ops->cb_barrier(); return; } - fullstop = FULLSTOP_RMMOD; - mutex_unlock(&fullstop_mutex); - unregister_reboot_notifier(&rcutorture_shutdown_nb); - rcu_torture_barrier_cleanup(); - rcu_torture_stall_cleanup(); - if (stutter_task) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); - kthread_stop(stutter_task); - } - stutter_task = NULL; - if (shuffler_task) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task"); - kthread_stop(shuffler_task); - free_cpumask_var(shuffle_tmp_mask); - } - shuffler_task = NULL; - if (writer_task) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task"); - kthread_stop(writer_task); - } - writer_task = NULL; + rcu_torture_barrier_cleanup(); + torture_stop_kthread(rcu_torture_stall, stall_task); + torture_stop_kthread(rcu_torture_writer, writer_task); if (reader_tasks) { - for (i = 0; i < nrealreaders; i++) { - if (reader_tasks[i]) { - VERBOSE_PRINTK_STRING( - "Stopping rcu_torture_reader task"); - kthread_stop(reader_tasks[i]); - } - reader_tasks[i] = NULL; - } + for (i = 0; i < nrealreaders; i++) + torture_stop_kthread(rcu_torture_reader, + reader_tasks[i]); kfree(reader_tasks); - reader_tasks = NULL; } rcu_torture_current = NULL; if (fakewriter_tasks) { for (i = 0; i < nfakewriters; i++) { - if (fakewriter_tasks[i]) { - VERBOSE_PRINTK_STRING( - "Stopping rcu_torture_fakewriter task"); - kthread_stop(fakewriter_tasks[i]); - } - fakewriter_tasks[i] = NULL; + torture_stop_kthread(rcu_torture_fakewriter, + fakewriter_tasks[i]); } kfree(fakewriter_tasks); fakewriter_tasks = NULL; } - if (stats_task) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task"); - kthread_stop(stats_task); - } - stats_task = NULL; - - if (fqs_task) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_fqs task"); - kthread_stop(fqs_task); - } - fqs_task = NULL; + torture_stop_kthread(rcu_torture_stats, stats_task); + torture_stop_kthread(rcu_torture_fqs, fqs_task); if ((test_boost == 1 && cur_ops->can_boost) || test_boost == 2) { unregister_cpu_notifier(&rcutorture_cpu_nb); for_each_possible_cpu(i) rcutorture_booster_cleanup(i); } - if (shutdown_task != NULL) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_shutdown task"); - kthread_stop(shutdown_task); - } - shutdown_task = NULL; - rcu_torture_onoff_cleanup(); /* Wait for all RCU callbacks to fire. */ @@ -1841,8 +1463,7 @@ rcu_torture_cleanup(void) if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error) rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); - else if (n_online_successes != n_online_attempts || - n_offline_successes != n_offline_attempts) + else if (torture_onoff_failures()) rcu_torture_print_module_parms(cur_ops, "End of test: RCU_HOTPLUG"); else @@ -1911,12 +1532,12 @@ rcu_torture_init(void) int i; int cpu; int firsterr = 0; - int retval; static struct rcu_torture_ops *torture_ops[] = { - &rcu_ops, &rcu_bh_ops, &srcu_ops, &sched_ops, + &rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &sched_ops, }; - mutex_lock(&fullstop_mutex); + if (!torture_init_begin(torture_type, verbose, &rcutorture_runnable)) + return -EBUSY; /* Process args and tell the world that the torturer is on the job. */ for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { @@ -1931,7 +1552,7 @@ rcu_torture_init(void) for (i = 0; i < ARRAY_SIZE(torture_ops); i++) pr_alert(" %s", torture_ops[i]->name); pr_alert("\n"); - mutex_unlock(&fullstop_mutex); + torture_init_end(); return -EINVAL; } if (cur_ops->fqs == NULL && fqs_duration != 0) { @@ -1941,12 +1562,14 @@ rcu_torture_init(void) if (cur_ops->init) cur_ops->init(); /* no "goto unwind" prior to this point!!! */ - if (nreaders >= 0) + if (nreaders >= 0) { nrealreaders = nreaders; - else - nrealreaders = 2 * num_online_cpus(); + } else { + nrealreaders = num_online_cpus() - 1; + if (nrealreaders <= 0) + nrealreaders = 1; + } rcu_torture_print_module_parms(cur_ops, "Start of test"); - fullstop = FULLSTOP_DONTSTOP; /* Set up the freelist. */ @@ -1982,108 +1605,62 @@ rcu_torture_init(void) /* Start up the kthreads. */ - VERBOSE_PRINTK_STRING("Creating rcu_torture_writer task"); - writer_task = kthread_create(rcu_torture_writer, NULL, - "rcu_torture_writer"); - if (IS_ERR(writer_task)) { - firsterr = PTR_ERR(writer_task); - VERBOSE_PRINTK_ERRSTRING("Failed to create writer"); - writer_task = NULL; + firsterr = torture_create_kthread(rcu_torture_writer, NULL, + writer_task); + if (firsterr) goto unwind; - } - wake_up_process(writer_task); fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]), GFP_KERNEL); if (fakewriter_tasks == NULL) { - VERBOSE_PRINTK_ERRSTRING("out of memory"); + VERBOSE_TOROUT_ERRSTRING("out of memory"); firsterr = -ENOMEM; goto unwind; } for (i = 0; i < nfakewriters; i++) { - VERBOSE_PRINTK_STRING("Creating rcu_torture_fakewriter task"); - fakewriter_tasks[i] = kthread_run(rcu_torture_fakewriter, NULL, - "rcu_torture_fakewriter"); - if (IS_ERR(fakewriter_tasks[i])) { - firsterr = PTR_ERR(fakewriter_tasks[i]); - VERBOSE_PRINTK_ERRSTRING("Failed to create fakewriter"); - fakewriter_tasks[i] = NULL; + firsterr = torture_create_kthread(rcu_torture_fakewriter, + NULL, fakewriter_tasks[i]); + if (firsterr) goto unwind; - } } reader_tasks = kzalloc(nrealreaders * sizeof(reader_tasks[0]), GFP_KERNEL); if (reader_tasks == NULL) { - VERBOSE_PRINTK_ERRSTRING("out of memory"); + VERBOSE_TOROUT_ERRSTRING("out of memory"); firsterr = -ENOMEM; goto unwind; } for (i = 0; i < nrealreaders; i++) { - VERBOSE_PRINTK_STRING("Creating rcu_torture_reader task"); - reader_tasks[i] = kthread_run(rcu_torture_reader, NULL, - "rcu_torture_reader"); - if (IS_ERR(reader_tasks[i])) { - firsterr = PTR_ERR(reader_tasks[i]); - VERBOSE_PRINTK_ERRSTRING("Failed to create reader"); - reader_tasks[i] = NULL; + firsterr = torture_create_kthread(rcu_torture_reader, NULL, + reader_tasks[i]); + if (firsterr) goto unwind; - } } if (stat_interval > 0) { - VERBOSE_PRINTK_STRING("Creating rcu_torture_stats task"); - stats_task = kthread_run(rcu_torture_stats, NULL, - "rcu_torture_stats"); - if (IS_ERR(stats_task)) { - firsterr = PTR_ERR(stats_task); - VERBOSE_PRINTK_ERRSTRING("Failed to create stats"); - stats_task = NULL; + firsterr = torture_create_kthread(rcu_torture_stats, NULL, + stats_task); + if (firsterr) goto unwind; - } } if (test_no_idle_hz) { - rcu_idle_cpu = num_online_cpus() - 1; - - if (!alloc_cpumask_var(&shuffle_tmp_mask, GFP_KERNEL)) { - firsterr = -ENOMEM; - VERBOSE_PRINTK_ERRSTRING("Failed to alloc mask"); - goto unwind; - } - - /* Create the shuffler thread */ - shuffler_task = kthread_run(rcu_torture_shuffle, NULL, - "rcu_torture_shuffle"); - if (IS_ERR(shuffler_task)) { - free_cpumask_var(shuffle_tmp_mask); - firsterr = PTR_ERR(shuffler_task); - VERBOSE_PRINTK_ERRSTRING("Failed to create shuffler"); - shuffler_task = NULL; + firsterr = torture_shuffle_init(shuffle_interval * HZ); + if (firsterr) goto unwind; - } } if (stutter < 0) stutter = 0; if (stutter) { - /* Create the stutter thread */ - stutter_task = kthread_run(rcu_torture_stutter, NULL, - "rcu_torture_stutter"); - if (IS_ERR(stutter_task)) { - firsterr = PTR_ERR(stutter_task); - VERBOSE_PRINTK_ERRSTRING("Failed to create stutter"); - stutter_task = NULL; + firsterr = torture_stutter_init(stutter * HZ); + if (firsterr) goto unwind; - } } if (fqs_duration < 0) fqs_duration = 0; if (fqs_duration) { - /* Create the stutter thread */ - fqs_task = kthread_run(rcu_torture_fqs, NULL, - "rcu_torture_fqs"); - if (IS_ERR(fqs_task)) { - firsterr = PTR_ERR(fqs_task); - VERBOSE_PRINTK_ERRSTRING("Failed to create fqs"); - fqs_task = NULL; + /* Create the fqs thread */ + firsterr = torture_create_kthread(rcu_torture_fqs, NULL, + fqs_task); + if (firsterr) goto unwind; - } } if (test_boost_interval < 1) test_boost_interval = 1; @@ -2097,49 +1674,31 @@ rcu_torture_init(void) for_each_possible_cpu(i) { if (cpu_is_offline(i)) continue; /* Heuristic: CPU can go offline. */ - retval = rcutorture_booster_init(i); - if (retval < 0) { - firsterr = retval; + firsterr = rcutorture_booster_init(i); + if (firsterr) goto unwind; - } - } - } - if (shutdown_secs > 0) { - shutdown_time = jiffies + shutdown_secs * HZ; - shutdown_task = kthread_create(rcu_torture_shutdown, NULL, - "rcu_torture_shutdown"); - if (IS_ERR(shutdown_task)) { - firsterr = PTR_ERR(shutdown_task); - VERBOSE_PRINTK_ERRSTRING("Failed to create shutdown"); - shutdown_task = NULL; - goto unwind; } - wake_up_process(shutdown_task); } - i = rcu_torture_onoff_init(); - if (i != 0) { - firsterr = i; + firsterr = torture_shutdown_init(shutdown_secs, rcu_torture_cleanup); + if (firsterr) goto unwind; - } - register_reboot_notifier(&rcutorture_shutdown_nb); - i = rcu_torture_stall_init(); - if (i != 0) { - firsterr = i; + firsterr = torture_onoff_init(onoff_holdoff * HZ, onoff_interval * HZ); + if (firsterr) goto unwind; - } - retval = rcu_torture_barrier_init(); - if (retval != 0) { - firsterr = retval; + firsterr = rcu_torture_stall_init(); + if (firsterr) + goto unwind; + firsterr = rcu_torture_barrier_init(); + if (firsterr) goto unwind; - } if (object_debug) rcu_test_debug_objects(); rcutorture_record_test_transition(); - mutex_unlock(&fullstop_mutex); + torture_init_end(); return 0; unwind: - mutex_unlock(&fullstop_mutex); + torture_init_end(); rcu_torture_cleanup(); return firsterr; } diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index 3318d828438..c639556f3fa 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -12,8 +12,8 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. * * Copyright (C) IBM Corporation, 2006 * Copyright (C) Fujitsu, 2012 @@ -36,8 +36,6 @@ #include <linux/delay.h> #include <linux/srcu.h> -#include <trace/events/rcu.h> - #include "rcu.h" /* @@ -398,7 +396,7 @@ void call_srcu(struct srcu_struct *sp, struct rcu_head *head, rcu_batch_queue(&sp->batch_queue, head); if (!sp->running) { sp->running = true; - schedule_delayed_work(&sp->work, 0); + queue_delayed_work(system_power_efficient_wq, &sp->work, 0); } spin_unlock_irqrestore(&sp->queue_lock, flags); } @@ -674,7 +672,8 @@ static void srcu_reschedule(struct srcu_struct *sp) } if (pending) - schedule_delayed_work(&sp->work, SRCU_INTERVAL); + queue_delayed_work(system_power_efficient_wq, + &sp->work, SRCU_INTERVAL); } /* diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 1254f312d02..d9efcc13008 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -12,8 +12,8 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. * * Copyright IBM Corporation, 2008 * @@ -37,10 +37,6 @@ #include <linux/prefetch.h> #include <linux/ftrace_event.h> -#ifdef CONFIG_RCU_TRACE -#include <trace/events/rcu.h> -#endif /* #else #ifdef CONFIG_RCU_TRACE */ - #include "rcu.h" /* Forward declarations for tiny_plugin.h. */ diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h index 280d06cae35..858c5656912 100644 --- a/kernel/rcu/tiny_plugin.h +++ b/kernel/rcu/tiny_plugin.h @@ -14,8 +14,8 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. * * Copyright (c) 2010 Linaro * @@ -144,7 +144,7 @@ static void check_cpu_stall(struct rcu_ctrlblk *rcp) return; rcp->ticks_this_gp++; j = jiffies; - js = rcp->jiffies_stall; + js = ACCESS_ONCE(rcp->jiffies_stall); if (*rcp->curtail && ULONG_CMP_GE(j, js)) { pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n", rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting, @@ -152,17 +152,17 @@ static void check_cpu_stall(struct rcu_ctrlblk *rcp) dump_stack(); } if (*rcp->curtail && ULONG_CMP_GE(j, js)) - rcp->jiffies_stall = jiffies + + ACCESS_ONCE(rcp->jiffies_stall) = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; else if (ULONG_CMP_GE(j, js)) - rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); + ACCESS_ONCE(rcp->jiffies_stall) = jiffies + rcu_jiffies_till_stall_check(); } static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp) { rcp->ticks_this_gp = 0; rcp->gp_start = jiffies; - rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); + ACCESS_ONCE(rcp->jiffies_stall) = jiffies + rcu_jiffies_till_stall_check(); } static void check_cpu_stalls(void) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index b3d116cd072..625d0b0cd75 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -12,8 +12,8 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. * * Copyright IBM Corporation, 2008 * @@ -58,8 +58,6 @@ #include <linux/suspend.h> #include "tree.h" -#include <trace/events/rcu.h> - #include "rcu.h" MODULE_ALIAS("rcutree"); @@ -103,7 +101,7 @@ DEFINE_PER_CPU(struct rcu_data, sname##_data) RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh); -static struct rcu_state *rcu_state; +static struct rcu_state *rcu_state_p; LIST_HEAD(rcu_struct_flavors); /* Increase (but not decrease) the CONFIG_RCU_FANOUT_LEAF at boot time. */ @@ -208,6 +206,70 @@ void rcu_bh_qs(int cpu) rdp->passed_quiesce = 1; } +static DEFINE_PER_CPU(int, rcu_sched_qs_mask); + +static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { + .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, + .dynticks = ATOMIC_INIT(1), +#ifdef CONFIG_NO_HZ_FULL_SYSIDLE + .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE, + .dynticks_idle = ATOMIC_INIT(1), +#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ +}; + +/* + * Let the RCU core know that this CPU has gone through the scheduler, + * which is a quiescent state. This is called when the need for a + * quiescent state is urgent, so we burn an atomic operation and full + * memory barriers to let the RCU core know about it, regardless of what + * this CPU might (or might not) do in the near future. + * + * We inform the RCU core by emulating a zero-duration dyntick-idle + * period, which we in turn do by incrementing the ->dynticks counter + * by two. + */ +static void rcu_momentary_dyntick_idle(void) +{ + unsigned long flags; + struct rcu_data *rdp; + struct rcu_dynticks *rdtp; + int resched_mask; + struct rcu_state *rsp; + + local_irq_save(flags); + + /* + * Yes, we can lose flag-setting operations. This is OK, because + * the flag will be set again after some delay. + */ + resched_mask = raw_cpu_read(rcu_sched_qs_mask); + raw_cpu_write(rcu_sched_qs_mask, 0); + + /* Find the flavor that needs a quiescent state. */ + for_each_rcu_flavor(rsp) { + rdp = raw_cpu_ptr(rsp->rda); + if (!(resched_mask & rsp->flavor_mask)) + continue; + smp_mb(); /* rcu_sched_qs_mask before cond_resched_completed. */ + if (ACCESS_ONCE(rdp->mynode->completed) != + ACCESS_ONCE(rdp->cond_resched_completed)) + continue; + + /* + * Pretend to be momentarily idle for the quiescent state. + * This allows the grace-period kthread to record the + * quiescent state, with no need for this CPU to do anything + * further. + */ + rdtp = this_cpu_ptr(&rcu_dynticks); + smp_mb__before_atomic(); /* Earlier stuff before QS. */ + atomic_add(2, &rdtp->dynticks); /* QS. */ + smp_mb__after_atomic(); /* Later stuff after QS. */ + break; + } + local_irq_restore(flags); +} + /* * Note a context switch. This is a quiescent state for RCU-sched, * and requires special handling for preemptible RCU. @@ -218,19 +280,12 @@ void rcu_note_context_switch(int cpu) trace_rcu_utilization(TPS("Start context switch")); rcu_sched_qs(cpu); rcu_preempt_note_context_switch(cpu); + if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) + rcu_momentary_dyntick_idle(); trace_rcu_utilization(TPS("End context switch")); } EXPORT_SYMBOL_GPL(rcu_note_context_switch); -static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { - .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, - .dynticks = ATOMIC_INIT(1), -#ifdef CONFIG_NO_HZ_FULL_SYSIDLE - .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE, - .dynticks_idle = ATOMIC_INIT(1), -#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ -}; - static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */ static long qhimark = 10000; /* If this many pending, ignore blimit. */ static long qlowmark = 100; /* Once only this many pending, use blimit. */ @@ -245,7 +300,14 @@ static ulong jiffies_till_next_fqs = ULONG_MAX; module_param(jiffies_till_first_fqs, ulong, 0644); module_param(jiffies_till_next_fqs, ulong, 0644); -static void rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, +/* + * How long the grace period must be before we start recruiting + * quiescent-state help from rcu_note_context_switch(). + */ +static ulong jiffies_till_sched_qs = HZ / 20; +module_param(jiffies_till_sched_qs, ulong, 0644); + +static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp); static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *rsp, bool *isidle, @@ -273,6 +335,15 @@ long rcu_batches_completed_bh(void) EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); /* + * Force a quiescent state. + */ +void rcu_force_quiescent_state(void) +{ + force_quiescent_state(rcu_state_p); +} +EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); + +/* * Force a quiescent state for RCU BH. */ void rcu_bh_force_quiescent_state(void) @@ -282,6 +353,21 @@ void rcu_bh_force_quiescent_state(void) EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); /* + * Show the state of the grace-period kthreads. + */ +void show_rcu_gp_kthreads(void) +{ + struct rcu_state *rsp; + + for_each_rcu_flavor(rsp) { + pr_info("%s: wait state: %d ->state: %#lx\n", + rsp->name, rsp->gp_state, rsp->gp_kthread->state); + /* sched_show_task(rsp->gp_kthread); */ + } +} +EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads); + +/* * Record the number of times rcutorture tests have been initiated and * terminated. This information allows the debugfs tracing stats to be * correlated to the rcutorture messages, even when the rcutorture module @@ -296,6 +382,39 @@ void rcutorture_record_test_transition(void) EXPORT_SYMBOL_GPL(rcutorture_record_test_transition); /* + * Send along grace-period-related data for rcutorture diagnostics. + */ +void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, + unsigned long *gpnum, unsigned long *completed) +{ + struct rcu_state *rsp = NULL; + + switch (test_type) { + case RCU_FLAVOR: + rsp = rcu_state_p; + break; + case RCU_BH_FLAVOR: + rsp = &rcu_bh_state; + break; + case RCU_SCHED_FLAVOR: + rsp = &rcu_sched_state; + break; + default: + break; + } + if (rsp != NULL) { + *flags = ACCESS_ONCE(rsp->gp_flags); + *gpnum = ACCESS_ONCE(rsp->gpnum); + *completed = ACCESS_ONCE(rsp->completed); + return; + } + *flags = 0; + *gpnum = 0; + *completed = 0; +} +EXPORT_SYMBOL_GPL(rcutorture_get_gp_data); + +/* * Record the number of writer passes through the current rcutorture test. * This is also used to correlate debugfs tracing stats with the rcutorture * messages. @@ -326,6 +445,28 @@ cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp) } /* + * Return the root node of the specified rcu_state structure. + */ +static struct rcu_node *rcu_get_root(struct rcu_state *rsp) +{ + return &rsp->node[0]; +} + +/* + * Is there any need for future grace periods? + * Interrupts must be disabled. If the caller does not hold the root + * rnp_node structure's ->lock, the results are advisory only. + */ +static int rcu_future_needs_gp(struct rcu_state *rsp) +{ + struct rcu_node *rnp = rcu_get_root(rsp); + int idx = (ACCESS_ONCE(rnp->completed) + 1) & 0x1; + int *fp = &rnp->need_future_gp[idx]; + + return ACCESS_ONCE(*fp); +} + +/* * Does the current CPU require a not-yet-started grace period? * The caller must have disabled interrupts to prevent races with * normal callback registry. @@ -337,7 +478,7 @@ cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) if (rcu_gp_in_progress(rsp)) return 0; /* No, a grace period is already in progress. */ - if (rcu_nocb_needs_gp(rsp)) + if (rcu_future_needs_gp(rsp)) return 1; /* Yes, a no-CBs CPU needs one. */ if (!rdp->nxttail[RCU_NEXT_TAIL]) return 0; /* No, this is a no-CBs (or offline) CPU. */ @@ -352,14 +493,6 @@ cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) } /* - * Return the root node of the specified rcu_state structure. - */ -static struct rcu_node *rcu_get_root(struct rcu_state *rsp) -{ - return &rsp->node[0]; -} - -/* * rcu_eqs_enter_common - current CPU is moving towards extended quiescent state * * If the new value of the ->dynticks_nesting counter now is zero, @@ -389,9 +522,9 @@ static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, } rcu_prepare_for_idle(smp_processor_id()); /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ - smp_mb__before_atomic_inc(); /* See above. */ + smp_mb__before_atomic(); /* See above. */ atomic_inc(&rdtp->dynticks); - smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */ + smp_mb__after_atomic(); /* Force ordering with next sojourn. */ WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); /* @@ -509,10 +642,10 @@ void rcu_irq_exit(void) static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval, int user) { - smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */ + smp_mb__before_atomic(); /* Force ordering w/previous sojourn. */ atomic_inc(&rdtp->dynticks); /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ - smp_mb__after_atomic_inc(); /* See above. */ + smp_mb__after_atomic(); /* See above. */ WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); rcu_cleanup_after_idle(smp_processor_id()); trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting); @@ -637,10 +770,10 @@ void rcu_nmi_enter(void) (atomic_read(&rdtp->dynticks) & 0x1)) return; rdtp->dynticks_nmi_nesting++; - smp_mb__before_atomic_inc(); /* Force delay from prior write. */ + smp_mb__before_atomic(); /* Force delay from prior write. */ atomic_inc(&rdtp->dynticks); /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ - smp_mb__after_atomic_inc(); /* See above. */ + smp_mb__after_atomic(); /* See above. */ WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); } @@ -659,9 +792,9 @@ void rcu_nmi_exit(void) --rdtp->dynticks_nmi_nesting != 0) return; /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ - smp_mb__before_atomic_inc(); /* See above. */ + smp_mb__before_atomic(); /* See above. */ atomic_inc(&rdtp->dynticks); - smp_mb__after_atomic_inc(); /* Force delay to next write. */ + smp_mb__after_atomic(); /* Force delay to next write. */ WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); } @@ -760,7 +893,12 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp, { rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks); rcu_sysidle_check_cpu(rdp, isidle, maxj); - return (rdp->dynticks_snap & 0x1) == 0; + if ((rdp->dynticks_snap & 0x1) == 0) { + trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); + return 1; + } else { + return 0; + } } /* @@ -779,6 +917,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, bool *isidle, unsigned long *maxj) { unsigned int curr; + int *rcrmp; unsigned int snap; curr = (unsigned int)atomic_add_return(0, &rdp->dynticks->dynticks); @@ -819,27 +958,43 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, } /* - * There is a possibility that a CPU in adaptive-ticks state - * might run in the kernel with the scheduling-clock tick disabled - * for an extended time period. Invoke rcu_kick_nohz_cpu() to - * force the CPU to restart the scheduling-clock tick in this - * CPU is in this state. - */ - rcu_kick_nohz_cpu(rdp->cpu); - - /* - * Alternatively, the CPU might be running in the kernel - * for an extended period of time without a quiescent state. - * Attempt to force the CPU through the scheduler to gain the - * needed quiescent state, but only if the grace period has gone - * on for an uncommonly long time. If there are many stuck CPUs, - * we will beat on the first one until it gets unstuck, then move - * to the next. Only do this for the primary flavor of RCU. + * A CPU running for an extended time within the kernel can + * delay RCU grace periods. When the CPU is in NO_HZ_FULL mode, + * even context-switching back and forth between a pair of + * in-kernel CPU-bound tasks cannot advance grace periods. + * So if the grace period is old enough, make the CPU pay attention. + * Note that the unsynchronized assignments to the per-CPU + * rcu_sched_qs_mask variable are safe. Yes, setting of + * bits can be lost, but they will be set again on the next + * force-quiescent-state pass. So lost bit sets do not result + * in incorrect behavior, merely in a grace period lasting + * a few jiffies longer than it might otherwise. Because + * there are at most four threads involved, and because the + * updates are only once every few jiffies, the probability of + * lossage (and thus of slight grace-period extension) is + * quite low. + * + * Note that if the jiffies_till_sched_qs boot/sysfs parameter + * is set too high, we override with half of the RCU CPU stall + * warning delay. */ - if (rdp->rsp == rcu_state && - ULONG_CMP_GE(ACCESS_ONCE(jiffies), rdp->rsp->jiffies_resched)) { - rdp->rsp->jiffies_resched += 5; - resched_cpu(rdp->cpu); + rcrmp = &per_cpu(rcu_sched_qs_mask, rdp->cpu); + if (ULONG_CMP_GE(jiffies, + rdp->rsp->gp_start + jiffies_till_sched_qs) || + ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) { + if (!(ACCESS_ONCE(*rcrmp) & rdp->rsp->flavor_mask)) { + ACCESS_ONCE(rdp->cond_resched_completed) = + ACCESS_ONCE(rdp->mynode->completed); + smp_mb(); /* ->cond_resched_completed before *rcrmp. */ + ACCESS_ONCE(*rcrmp) = + ACCESS_ONCE(*rcrmp) + rdp->rsp->flavor_mask; + resched_cpu(rdp->cpu); /* Force CPU into scheduler. */ + rdp->rsp->jiffies_resched += 5; /* Enable beating. */ + } else if (ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) { + /* Time to beat on that CPU again! */ + resched_cpu(rdp->cpu); /* Force CPU into scheduler. */ + rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */ + } } return 0; @@ -847,13 +1002,13 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, static void record_gp_stall_check_time(struct rcu_state *rsp) { - unsigned long j = ACCESS_ONCE(jiffies); + unsigned long j = jiffies; unsigned long j1; rsp->gp_start = j; smp_wmb(); /* Record start time before stall time. */ j1 = rcu_jiffies_till_stall_check(); - rsp->jiffies_stall = j + j1; + ACCESS_ONCE(rsp->jiffies_stall) = j + j1; rsp->jiffies_resched = j + j1 / 2; } @@ -892,12 +1047,12 @@ static void print_other_cpu_stall(struct rcu_state *rsp) /* Only let one CPU complain about others per time interval. */ raw_spin_lock_irqsave(&rnp->lock, flags); - delta = jiffies - rsp->jiffies_stall; + delta = jiffies - ACCESS_ONCE(rsp->jiffies_stall); if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) { raw_spin_unlock_irqrestore(&rnp->lock, flags); return; } - rsp->jiffies_stall = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; + ACCESS_ONCE(rsp->jiffies_stall) = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; raw_spin_unlock_irqrestore(&rnp->lock, flags); /* @@ -934,9 +1089,9 @@ static void print_other_cpu_stall(struct rcu_state *rsp) print_cpu_stall_info_end(); for_each_possible_cpu(cpu) totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; - pr_cont("(detected by %d, t=%ld jiffies, g=%lu, c=%lu, q=%lu)\n", + pr_cont("(detected by %d, t=%ld jiffies, g=%ld, c=%ld, q=%lu)\n", smp_processor_id(), (long)(jiffies - rsp->gp_start), - rsp->gpnum, rsp->completed, totqlen); + (long)rsp->gpnum, (long)rsp->completed, totqlen); if (ndetected == 0) pr_err("INFO: Stall ended before state dump start\n"); else if (!trigger_all_cpu_backtrace()) @@ -949,12 +1104,6 @@ static void print_other_cpu_stall(struct rcu_state *rsp) force_quiescent_state(rsp); /* Kick them all. */ } -/* - * This function really isn't for public consumption, but RCU is special in - * that context switches can allow the state machine to make progress. - */ -extern void resched_cpu(int cpu); - static void print_cpu_stall(struct rcu_state *rsp) { int cpu; @@ -973,14 +1122,15 @@ static void print_cpu_stall(struct rcu_state *rsp) print_cpu_stall_info_end(); for_each_possible_cpu(cpu) totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; - pr_cont(" (t=%lu jiffies g=%lu c=%lu q=%lu)\n", - jiffies - rsp->gp_start, rsp->gpnum, rsp->completed, totqlen); + pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n", + jiffies - rsp->gp_start, + (long)rsp->gpnum, (long)rsp->completed, totqlen); if (!trigger_all_cpu_backtrace()) dump_stack(); raw_spin_lock_irqsave(&rnp->lock, flags); - if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall)) - rsp->jiffies_stall = jiffies + + if (ULONG_CMP_GE(jiffies, ACCESS_ONCE(rsp->jiffies_stall))) + ACCESS_ONCE(rsp->jiffies_stall) = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; raw_spin_unlock_irqrestore(&rnp->lock, flags); @@ -1005,7 +1155,7 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) if (rcu_cpu_stall_suppress || !rcu_gp_in_progress(rsp)) return; - j = ACCESS_ONCE(jiffies); + j = jiffies; /* * Lots of memory barriers to reject false positives. @@ -1064,7 +1214,7 @@ void rcu_cpu_stall_reset(void) struct rcu_state *rsp; for_each_rcu_flavor(rsp) - rsp->jiffies_stall = jiffies + ULONG_MAX / 2; + ACCESS_ONCE(rsp->jiffies_stall) = jiffies + ULONG_MAX / 2; } /* @@ -1125,15 +1275,18 @@ static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, /* * Start some future grace period, as needed to handle newly arrived * callbacks. The required future grace periods are recorded in each - * rcu_node structure's ->need_future_gp field. + * rcu_node structure's ->need_future_gp field. Returns true if there + * is reason to awaken the grace-period kthread. * * The caller must hold the specified rcu_node structure's ->lock. */ -static unsigned long __maybe_unused -rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp) +static bool __maybe_unused +rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, + unsigned long *c_out) { unsigned long c; int i; + bool ret = false; struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); /* @@ -1144,7 +1297,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp) trace_rcu_future_gp(rnp, rdp, c, TPS("Startleaf")); if (rnp->need_future_gp[c & 0x1]) { trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartleaf")); - return c; + goto out; } /* @@ -1158,7 +1311,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp) ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) { rnp->need_future_gp[c & 0x1]++; trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf")); - return c; + goto out; } /* @@ -1199,12 +1352,15 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp) trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleafroot")); } else { trace_rcu_future_gp(rnp, rdp, c, TPS("Startedroot")); - rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp); + ret = rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp); } unlock_out: if (rnp != rnp_root) raw_spin_unlock(&rnp_root->lock); - return c; +out: + if (c_out != NULL) + *c_out = c; + return ret; } /* @@ -1228,25 +1384,43 @@ static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) } /* + * Awaken the grace-period kthread for the specified flavor of RCU. + * Don't do a self-awaken, and don't bother awakening when there is + * nothing for the grace-period kthread to do (as in several CPUs + * raced to awaken, and we lost), and finally don't try to awaken + * a kthread that has not yet been created. + */ +static void rcu_gp_kthread_wake(struct rcu_state *rsp) +{ + if (current == rsp->gp_kthread || + !ACCESS_ONCE(rsp->gp_flags) || + !rsp->gp_kthread) + return; + wake_up(&rsp->gp_wq); +} + +/* * If there is room, assign a ->completed number to any callbacks on * this CPU that have not already been assigned. Also accelerate any * callbacks that were previously assigned a ->completed number that has * since proven to be too conservative, which can happen if callbacks get * assigned a ->completed number while RCU is idle, but with reference to * a non-root rcu_node structure. This function is idempotent, so it does - * not hurt to call it repeatedly. + * not hurt to call it repeatedly. Returns an flag saying that we should + * awaken the RCU grace-period kthread. * * The caller must hold rnp->lock with interrupts disabled. */ -static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, +static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { unsigned long c; int i; + bool ret; /* If the CPU has no callbacks, nothing to do. */ if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL]) - return; + return false; /* * Starting from the sublist containing the callbacks most @@ -1275,7 +1449,7 @@ static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, * be grouped into. */ if (++i >= RCU_NEXT_TAIL) - return; + return false; /* * Assign all subsequent callbacks' ->completed number to the next @@ -1287,13 +1461,14 @@ static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, rdp->nxtcompleted[i] = c; } /* Record any needed additional grace periods. */ - rcu_start_future_gp(rnp, rdp); + ret = rcu_start_future_gp(rnp, rdp, NULL); /* Trace depending on how much we were able to accelerate. */ if (!*rdp->nxttail[RCU_WAIT_TAIL]) trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccWaitCB")); else trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccReadyCB")); + return ret; } /* @@ -1302,17 +1477,18 @@ static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, * assign ->completed numbers to any callbacks in the RCU_NEXT_TAIL * sublist. This function is idempotent, so it does not hurt to * invoke it repeatedly. As long as it is not invoked -too- often... + * Returns true if the RCU grace-period kthread needs to be awakened. * * The caller must hold rnp->lock with interrupts disabled. */ -static void rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, +static bool rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { int i, j; /* If the CPU has no callbacks, nothing to do. */ if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL]) - return; + return false; /* * Find all callbacks whose ->completed numbers indicate that they @@ -1336,26 +1512,30 @@ static void rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, } /* Classify any remaining callbacks. */ - rcu_accelerate_cbs(rsp, rnp, rdp); + return rcu_accelerate_cbs(rsp, rnp, rdp); } /* * Update CPU-local rcu_data state to record the beginnings and ends of * grace periods. The caller must hold the ->lock of the leaf rcu_node * structure corresponding to the current CPU, and must have irqs disabled. + * Returns true if the grace-period kthread needs to be awakened. */ -static void __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) +static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, + struct rcu_data *rdp) { + bool ret; + /* Handle the ends of any preceding grace periods first. */ if (rdp->completed == rnp->completed) { /* No grace period end, so just accelerate recent callbacks. */ - rcu_accelerate_cbs(rsp, rnp, rdp); + ret = rcu_accelerate_cbs(rsp, rnp, rdp); } else { /* Advance callbacks. */ - rcu_advance_cbs(rsp, rnp, rdp); + ret = rcu_advance_cbs(rsp, rnp, rdp); /* Remember that we saw this grace-period completion. */ rdp->completed = rnp->completed; @@ -1374,11 +1554,13 @@ static void __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, struc rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); zero_cpu_stall_ticks(rdp); } + return ret; } static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) { unsigned long flags; + bool needwake; struct rcu_node *rnp; local_irq_save(flags); @@ -1390,8 +1572,10 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) return; } smp_mb__after_unlock_lock(); - __note_gp_changes(rsp, rnp, rdp); + needwake = __note_gp_changes(rsp, rnp, rdp); raw_spin_unlock_irqrestore(&rnp->lock, flags); + if (needwake) + rcu_gp_kthread_wake(rsp); } /* @@ -1405,12 +1589,12 @@ static int rcu_gp_init(struct rcu_state *rsp) rcu_bind_gp_kthread(); raw_spin_lock_irq(&rnp->lock); smp_mb__after_unlock_lock(); - if (rsp->gp_flags == 0) { + if (!ACCESS_ONCE(rsp->gp_flags)) { /* Spurious wakeup, tell caller to go back to sleep. */ raw_spin_unlock_irq(&rnp->lock); return 0; } - rsp->gp_flags = 0; /* Clear all flags: New grace period. */ + ACCESS_ONCE(rsp->gp_flags) = 0; /* Clear all flags: New grace period. */ if (WARN_ON_ONCE(rcu_gp_in_progress(rsp))) { /* @@ -1423,13 +1607,14 @@ static int rcu_gp_init(struct rcu_state *rsp) /* Advance to a new grace period and initialize state. */ record_gp_stall_check_time(rsp); - smp_wmb(); /* Record GP times before starting GP. */ - rsp->gpnum++; + /* Record GP times before starting GP, hence smp_store_release(). */ + smp_store_release(&rsp->gpnum, rsp->gpnum + 1); trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start")); raw_spin_unlock_irq(&rnp->lock); /* Exclude any concurrent CPU-hotplug operations. */ mutex_lock(&rsp->onoff_mutex); + smp_mb__after_unlock_lock(); /* ->gpnum increment before GP! */ /* * Set the quiescent-state-needed bits in all the rcu_node @@ -1454,7 +1639,7 @@ static int rcu_gp_init(struct rcu_state *rsp) WARN_ON_ONCE(rnp->completed != rsp->completed); ACCESS_ONCE(rnp->completed) = rsp->completed; if (rnp == rdp->mynode) - __note_gp_changes(rsp, rnp, rdp); + (void)__note_gp_changes(rsp, rnp, rdp); rcu_preempt_boost_start_gp(rnp); trace_rcu_grace_period_init(rsp->name, rnp->gpnum, rnp->level, rnp->grplo, @@ -1502,7 +1687,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { raw_spin_lock_irq(&rnp->lock); smp_mb__after_unlock_lock(); - rsp->gp_flags &= ~RCU_GP_FLAG_FQS; + ACCESS_ONCE(rsp->gp_flags) &= ~RCU_GP_FLAG_FQS; raw_spin_unlock_irq(&rnp->lock); } return fqs_state; @@ -1514,6 +1699,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) static void rcu_gp_cleanup(struct rcu_state *rsp) { unsigned long gp_duration; + bool needgp = false; int nocb = 0; struct rcu_data *rdp; struct rcu_node *rnp = rcu_get_root(rsp); @@ -1549,7 +1735,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) ACCESS_ONCE(rnp->completed) = rsp->gpnum; rdp = this_cpu_ptr(rsp->rda); if (rnp == rdp->mynode) - __note_gp_changes(rsp, rnp, rdp); + needgp = __note_gp_changes(rsp, rnp, rdp) || needgp; /* smp_mb() provided by prior unlock-lock pair. */ nocb += rcu_future_gp_cleanup(rsp, rnp); raw_spin_unlock_irq(&rnp->lock); @@ -1557,16 +1743,18 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) } rnp = rcu_get_root(rsp); raw_spin_lock_irq(&rnp->lock); - smp_mb__after_unlock_lock(); + smp_mb__after_unlock_lock(); /* Order GP before ->completed update. */ rcu_nocb_gp_set(rnp, nocb); - rsp->completed = rsp->gpnum; /* Declare grace period done. */ + /* Declare grace period done. */ + ACCESS_ONCE(rsp->completed) = rsp->gpnum; trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end")); rsp->fqs_state = RCU_GP_IDLE; rdp = this_cpu_ptr(rsp->rda); - rcu_advance_cbs(rsp, rnp, rdp); /* Reduce false positives below. */ - if (cpu_needs_another_gp(rsp, rdp)) { - rsp->gp_flags = RCU_GP_FLAG_INIT; + /* Advance CBs to reduce false positives below. */ + needgp = rcu_advance_cbs(rsp, rnp, rdp) || needgp; + if (needgp || cpu_needs_another_gp(rsp, rdp)) { + ACCESS_ONCE(rsp->gp_flags) = RCU_GP_FLAG_INIT; trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum), TPS("newreq")); @@ -1593,6 +1781,7 @@ static int __noreturn rcu_gp_kthread(void *arg) trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum), TPS("reqwait")); + rsp->gp_state = RCU_GP_WAIT_GPS; wait_event_interruptible(rsp->gp_wq, ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_INIT); @@ -1620,6 +1809,7 @@ static int __noreturn rcu_gp_kthread(void *arg) trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum), TPS("fqswait")); + rsp->gp_state = RCU_GP_WAIT_FQS; ret = wait_event_interruptible_timeout(rsp->gp_wq, ((gf = ACCESS_ONCE(rsp->gp_flags)) & RCU_GP_FLAG_FQS) || @@ -1665,14 +1855,6 @@ static int __noreturn rcu_gp_kthread(void *arg) } } -static void rsp_wakeup(struct irq_work *work) -{ - struct rcu_state *rsp = container_of(work, struct rcu_state, wakeup_work); - - /* Wake up rcu_gp_kthread() to start the grace period. */ - wake_up(&rsp->gp_wq); -} - /* * Start a new RCU grace period if warranted, re-initializing the hierarchy * in preparation for detecting the next grace period. The caller must hold @@ -1681,8 +1863,10 @@ static void rsp_wakeup(struct irq_work *work) * Note that it is legal for a dying CPU (which is marked as offline) to * invoke this function. This can happen when the dying CPU reports its * quiescent state. + * + * Returns true if the grace-period kthread must be awakened. */ -static void +static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { @@ -1693,20 +1877,18 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, * or a grace period is already in progress. * Either way, don't start a new grace period. */ - return; + return false; } - rsp->gp_flags = RCU_GP_FLAG_INIT; + ACCESS_ONCE(rsp->gp_flags) = RCU_GP_FLAG_INIT; trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum), TPS("newreq")); /* * We can't do wakeups while holding the rnp->lock, as that * could cause possible deadlocks with the rq->lock. Defer - * the wakeup to interrupt context. And don't bother waking - * up the running kthread. + * the wakeup to our caller. */ - if (current != rsp->gp_kthread) - irq_work_queue(&rsp->wakeup_work); + return true; } /* @@ -1715,12 +1897,14 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, * is invoked indirectly from rcu_advance_cbs(), which would result in * endless recursion -- or would do so if it wasn't for the self-deadlock * that is encountered beforehand. + * + * Returns true if the grace-period kthread needs to be awakened. */ -static void -rcu_start_gp(struct rcu_state *rsp) +static bool rcu_start_gp(struct rcu_state *rsp) { struct rcu_data *rdp = this_cpu_ptr(rsp->rda); struct rcu_node *rnp = rcu_get_root(rsp); + bool ret = false; /* * If there is no grace period in progress right now, any @@ -1730,8 +1914,9 @@ rcu_start_gp(struct rcu_state *rsp) * resulting in pointless grace periods. So, advance callbacks * then start the grace period! */ - rcu_advance_cbs(rsp, rnp, rdp); - rcu_start_gp_advanced(rsp, rnp, rdp); + ret = rcu_advance_cbs(rsp, rnp, rdp) || ret; + ret = rcu_start_gp_advanced(rsp, rnp, rdp) || ret; + return ret; } /* @@ -1820,6 +2005,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) { unsigned long flags; unsigned long mask; + bool needwake; struct rcu_node *rnp; rnp = rdp->mynode; @@ -1848,9 +2034,11 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) * This GP can't end until cpu checks in, so all of our * callbacks can be processed during the next GP. */ - rcu_accelerate_cbs(rsp, rnp, rdp); + needwake = rcu_accelerate_cbs(rsp, rnp, rdp); rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */ + if (needwake) + rcu_gp_kthread_wake(rsp); } } @@ -1951,7 +2139,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) { int i; - struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); + struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); /* No-CBs CPUs are handled specially. */ if (rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags)) @@ -2304,7 +2492,7 @@ static void force_quiescent_state(struct rcu_state *rsp) if (rnp_old != NULL) raw_spin_unlock(&rnp_old->fqslock); if (ret) { - rsp->n_force_qs_lh++; + ACCESS_ONCE(rsp->n_force_qs_lh)++; return; } rnp_old = rnp; @@ -2316,11 +2504,11 @@ static void force_quiescent_state(struct rcu_state *rsp) smp_mb__after_unlock_lock(); raw_spin_unlock(&rnp_old->fqslock); if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { - rsp->n_force_qs_lh++; + ACCESS_ONCE(rsp->n_force_qs_lh)++; raw_spin_unlock_irqrestore(&rnp_old->lock, flags); return; /* Someone beat us to it. */ } - rsp->gp_flags |= RCU_GP_FLAG_FQS; + ACCESS_ONCE(rsp->gp_flags) |= RCU_GP_FLAG_FQS; raw_spin_unlock_irqrestore(&rnp_old->lock, flags); wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */ } @@ -2334,7 +2522,8 @@ static void __rcu_process_callbacks(struct rcu_state *rsp) { unsigned long flags; - struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); + bool needwake; + struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); WARN_ON_ONCE(rdp->beenonline == 0); @@ -2345,8 +2534,10 @@ __rcu_process_callbacks(struct rcu_state *rsp) local_irq_save(flags); if (cpu_needs_another_gp(rsp, rdp)) { raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */ - rcu_start_gp(rsp); + needwake = rcu_start_gp(rsp); raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); + if (needwake) + rcu_gp_kthread_wake(rsp); } else { local_irq_restore(flags); } @@ -2404,6 +2595,8 @@ static void invoke_rcu_core(void) static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, struct rcu_head *head, unsigned long flags) { + bool needwake; + /* * If called from an extended quiescent state, invoke the RCU * core in order to force a re-evaluation of RCU's idleness. @@ -2433,8 +2626,10 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, raw_spin_lock(&rnp_root->lock); smp_mb__after_unlock_lock(); - rcu_start_gp(rsp); + needwake = rcu_start_gp(rsp); raw_spin_unlock(&rnp_root->lock); + if (needwake) + rcu_gp_kthread_wake(rsp); } else { /* Give the grace period a kick. */ rdp->blimit = LONG_MAX; @@ -2537,6 +2732,20 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) EXPORT_SYMBOL_GPL(call_rcu_bh); /* + * Queue an RCU callback for lazy invocation after a grace period. + * This will likely be later named something like "call_rcu_lazy()", + * but this change will require some way of tagging the lazy RCU + * callbacks in the list of pending callbacks. Until then, this + * function may only be called from __kfree_rcu(). + */ +void kfree_call_rcu(struct rcu_head *head, + void (*func)(struct rcu_head *rcu)) +{ + __call_rcu(head, func, rcu_state_p, -1, 1); +} +EXPORT_SYMBOL_GPL(kfree_call_rcu); + +/* * Because a context switch is a grace period for RCU-sched and RCU-bh, * any blocking grace-period wait automatically implies a grace period * if there is only one CPU online at any point time during execution @@ -2639,6 +2848,58 @@ void synchronize_rcu_bh(void) } EXPORT_SYMBOL_GPL(synchronize_rcu_bh); +/** + * get_state_synchronize_rcu - Snapshot current RCU state + * + * Returns a cookie that is used by a later call to cond_synchronize_rcu() + * to determine whether or not a full grace period has elapsed in the + * meantime. + */ +unsigned long get_state_synchronize_rcu(void) +{ + /* + * Any prior manipulation of RCU-protected data must happen + * before the load from ->gpnum. + */ + smp_mb(); /* ^^^ */ + + /* + * Make sure this load happens before the purportedly + * time-consuming work between get_state_synchronize_rcu() + * and cond_synchronize_rcu(). + */ + return smp_load_acquire(&rcu_state_p->gpnum); +} +EXPORT_SYMBOL_GPL(get_state_synchronize_rcu); + +/** + * cond_synchronize_rcu - Conditionally wait for an RCU grace period + * + * @oldstate: return value from earlier call to get_state_synchronize_rcu() + * + * If a full RCU grace period has elapsed since the earlier call to + * get_state_synchronize_rcu(), just return. Otherwise, invoke + * synchronize_rcu() to wait for a full grace period. + * + * Yes, this function does not take counter wrap into account. But + * counter wrap is harmless. If the counter wraps, we have waited for + * more than 2 billion grace periods (and way more on a 64-bit system!), + * so waiting for one additional grace period should be just fine. + */ +void cond_synchronize_rcu(unsigned long oldstate) +{ + unsigned long newstate; + + /* + * Ensure that this load happens before any RCU-destructive + * actions the caller might carry out after we return. + */ + newstate = smp_load_acquire(&rcu_state_p->completed); + if (ULONG_CMP_GE(oldstate, newstate)) + synchronize_rcu(); +} +EXPORT_SYMBOL_GPL(cond_synchronize_rcu); + static int synchronize_sched_expedited_cpu_stop(void *data) { /* @@ -2738,7 +2999,7 @@ void synchronize_sched_expedited(void) s = atomic_long_read(&rsp->expedited_done); if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) { /* ensure test happens before caller kfree */ - smp_mb__before_atomic_inc(); /* ^^^ */ + smp_mb__before_atomic(); /* ^^^ */ atomic_long_inc(&rsp->expedited_workdone1); return; } @@ -2756,7 +3017,7 @@ void synchronize_sched_expedited(void) s = atomic_long_read(&rsp->expedited_done); if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) { /* ensure test happens before caller kfree */ - smp_mb__before_atomic_inc(); /* ^^^ */ + smp_mb__before_atomic(); /* ^^^ */ atomic_long_inc(&rsp->expedited_workdone2); return; } @@ -2785,7 +3046,7 @@ void synchronize_sched_expedited(void) s = atomic_long_read(&rsp->expedited_done); if (ULONG_CMP_GE((ulong)s, (ulong)snap)) { /* ensure test happens before caller kfree */ - smp_mb__before_atomic_inc(); /* ^^^ */ + smp_mb__before_atomic(); /* ^^^ */ atomic_long_inc(&rsp->expedited_done_lost); break; } @@ -2880,7 +3141,7 @@ static int rcu_pending(int cpu) * non-NULL, store an indication of whether all callbacks are lazy. * (If there are no callbacks, all of them are deemed to be lazy.) */ -static int rcu_cpu_has_callbacks(int cpu, bool *all_lazy) +static int __maybe_unused rcu_cpu_has_callbacks(int cpu, bool *all_lazy) { bool al = true; bool hc = false; @@ -2936,7 +3197,7 @@ static void rcu_barrier_callback(struct rcu_head *rhp) static void rcu_barrier_func(void *type) { struct rcu_state *rsp = type; - struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); + struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); _rcu_barrier_trace(rsp, "IRQ", -1, rsp->n_barrier_done); atomic_inc(&rsp->barrier_cpu_count); @@ -3108,7 +3369,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) * that this CPU cannot possibly have any RCU callbacks in flight yet. */ static void -rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) +rcu_init_percpu_data(int cpu, struct rcu_state *rsp) { unsigned long flags; unsigned long mask; @@ -3121,7 +3382,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) /* Set up local state, ensuring consistent view of global state. */ raw_spin_lock_irqsave(&rnp->lock, flags); rdp->beenonline = 1; /* We have now been online. */ - rdp->preemptible = preemptible; rdp->qlen_last_fqs_check = 0; rdp->n_force_qs_snap = rsp->n_force_qs; rdp->blimit = blimit; @@ -3165,8 +3425,7 @@ static void rcu_prepare_cpu(int cpu) struct rcu_state *rsp; for_each_rcu_flavor(rsp) - rcu_init_percpu_data(cpu, rsp, - strcmp(rsp->name, "rcu_preempt") == 0); + rcu_init_percpu_data(cpu, rsp); } /* @@ -3176,7 +3435,7 @@ static int rcu_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { long cpu = (long)hcpu; - struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); + struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu); struct rcu_node *rnp = rdp->mynode; struct rcu_state *rsp; @@ -3313,6 +3572,7 @@ static void __init rcu_init_one(struct rcu_state *rsp, "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */ + static u8 fl_mask = 0x1; int cpustride = 1; int i; int j; @@ -3331,6 +3591,8 @@ static void __init rcu_init_one(struct rcu_state *rsp, for (i = 1; i < rcu_num_lvls; i++) rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1]; rcu_init_levelspread(rsp); + rsp->flavor_mask = fl_mask; + fl_mask <<= 1; /* Initialize the elements themselves, starting from the leaves. */ @@ -3350,8 +3612,8 @@ static void __init rcu_init_one(struct rcu_state *rsp, rnp->qsmaskinit = 0; rnp->grplo = j * cpustride; rnp->grphi = (j + 1) * cpustride - 1; - if (rnp->grphi >= NR_CPUS) - rnp->grphi = NR_CPUS - 1; + if (rnp->grphi >= nr_cpu_ids) + rnp->grphi = nr_cpu_ids - 1; if (i == 0) { rnp->grpnum = 0; rnp->grpmask = 0; @@ -3370,7 +3632,6 @@ static void __init rcu_init_one(struct rcu_state *rsp, rsp->rda = rda; init_waitqueue_head(&rsp->gp_wq); - init_irq_work(&rsp->wakeup_work, rsp_wakeup); rnp = rsp->level[rcu_num_lvls - 1]; for_each_possible_cpu(i) { while (i > rnp->grphi) diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 8c19873f1ac..0f69a79c5b7 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -13,8 +13,8 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. * * Copyright IBM Corporation, 2008 * @@ -252,7 +252,6 @@ struct rcu_data { bool passed_quiesce; /* User-mode/idle loop etc. */ bool qs_pending; /* Core waits for quiesc state. */ bool beenonline; /* CPU online at least once. */ - bool preemptible; /* Preemptible RCU? */ struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ unsigned long grpmask; /* Mask to apply to leaf qsmask. */ #ifdef CONFIG_RCU_CPU_STALL_INFO @@ -308,6 +307,9 @@ struct rcu_data { /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */ unsigned long offline_fqs; /* Kicked due to being offline. */ + unsigned long cond_resched_completed; + /* Grace period that needs help */ + /* from cond_resched(). */ /* 5) __rcu_pending() statistics. */ unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */ @@ -393,6 +395,7 @@ struct rcu_state { struct rcu_node *level[RCU_NUM_LVLS]; /* Hierarchy levels. */ u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */ u8 levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */ + u8 flavor_mask; /* bit in flavor mask. */ struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ void (*call)(struct rcu_head *head, /* call_rcu() flavor. */ void (*func)(struct rcu_head *head)); @@ -406,7 +409,8 @@ struct rcu_state { unsigned long completed; /* # of last completed gp. */ struct task_struct *gp_kthread; /* Task for grace periods. */ wait_queue_head_t gp_wq; /* Where GP task waits. */ - int gp_flags; /* Commands for GP task. */ + short gp_flags; /* Commands for GP task. */ + short gp_state; /* GP kthread sleep state. */ /* End of fields guarded by root rcu_node's lock. */ @@ -462,13 +466,17 @@ struct rcu_state { const char *name; /* Name of structure. */ char abbr; /* Abbreviated name. */ struct list_head flavors; /* List of RCU flavors. */ - struct irq_work wakeup_work; /* Postponed wakeups */ }; /* Values for rcu_state structure's gp_flags field. */ #define RCU_GP_FLAG_INIT 0x1 /* Need grace-period initialization. */ #define RCU_GP_FLAG_FQS 0x2 /* Need grace-period quiescent-state forcing. */ +/* Values for rcu_state structure's gp_flags field. */ +#define RCU_GP_WAIT_INIT 0 /* Initial state. */ +#define RCU_GP_WAIT_GPS 1 /* Wait for grace-period start. */ +#define RCU_GP_WAIT_FQS 2 /* Wait for force-quiescent-state time. */ + extern struct list_head rcu_struct_flavors; /* Sequence through rcu_state structures for each RCU flavor. */ @@ -547,7 +555,6 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); static void print_cpu_stall_info_end(void); static void zero_cpu_stall_ticks(struct rcu_data *rdp); static void increment_cpu_stall_ticks(void); -static int rcu_nocb_needs_gp(struct rcu_state *rsp); static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq); static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp); static void rcu_init_one_nocb(struct rcu_node *rnp); @@ -560,7 +567,7 @@ static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp); static void do_nocb_deferred_wakeup(struct rcu_data *rdp); static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); -static void rcu_kick_nohz_cpu(int cpu); +static void __maybe_unused rcu_kick_nohz_cpu(int cpu); static bool init_nocb_callback_list(struct rcu_data *rdp); static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq); static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 6e2ef4b2b92..02ac0fb186b 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -14,8 +14,8 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. * * Copyright Red Hat, 2009 * Copyright IBM Corporation, 2009 @@ -116,7 +116,7 @@ static void __init rcu_bootup_announce_oddness(void) #ifdef CONFIG_TREE_PREEMPT_RCU RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu); -static struct rcu_state *rcu_state = &rcu_preempt_state; +static struct rcu_state *rcu_state_p = &rcu_preempt_state; static int rcu_preempted_readers_exp(struct rcu_node *rnp); @@ -149,15 +149,6 @@ long rcu_batches_completed(void) EXPORT_SYMBOL_GPL(rcu_batches_completed); /* - * Force a quiescent state for preemptible RCU. - */ -void rcu_force_quiescent_state(void) -{ - force_quiescent_state(&rcu_preempt_state); -} -EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); - -/* * Record a preemptible-RCU quiescent state for the specified CPU. Note * that this just means that the task currently running on the CPU is * not in a quiescent state. There might be any number of tasks blocked @@ -688,20 +679,6 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) } EXPORT_SYMBOL_GPL(call_rcu); -/* - * Queue an RCU callback for lazy invocation after a grace period. - * This will likely be later named something like "call_rcu_lazy()", - * but this change will require some way of tagging the lazy RCU - * callbacks in the list of pending callbacks. Until then, this - * function may only be called from __kfree_rcu(). - */ -void kfree_call_rcu(struct rcu_head *head, - void (*func)(struct rcu_head *rcu)) -{ - __call_rcu(head, func, &rcu_preempt_state, -1, 1); -} -EXPORT_SYMBOL_GPL(kfree_call_rcu); - /** * synchronize_rcu - wait until a grace period has elapsed. * @@ -970,7 +947,7 @@ void exit_rcu(void) #else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ -static struct rcu_state *rcu_state = &rcu_sched_state; +static struct rcu_state *rcu_state_p = &rcu_sched_state; /* * Tell them what RCU they are running. @@ -991,16 +968,6 @@ long rcu_batches_completed(void) EXPORT_SYMBOL_GPL(rcu_batches_completed); /* - * Force a quiescent state for RCU, which, because there is no preemptible - * RCU, becomes the same as rcu-sched. - */ -void rcu_force_quiescent_state(void) -{ - rcu_sched_force_quiescent_state(); -} -EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); - -/* * Because preemptible RCU does not exist, we never have to check for * CPUs being in quiescent states. */ @@ -1080,22 +1047,6 @@ static void rcu_preempt_check_callbacks(int cpu) } /* - * Queue an RCU callback for lazy invocation after a grace period. - * This will likely be later named something like "call_rcu_lazy()", - * but this change will require some way of tagging the lazy RCU - * callbacks in the list of pending callbacks. Until then, this - * function may only be called from __kfree_rcu(). - * - * Because there is no preemptible RCU, we use RCU-sched instead. - */ -void kfree_call_rcu(struct rcu_head *head, - void (*func)(struct rcu_head *rcu)) -{ - __call_rcu(head, func, &rcu_sched_state, -1, 1); -} -EXPORT_SYMBOL_GPL(kfree_call_rcu); - -/* * Wait for an rcu-preempt grace period, but make it happen quickly. * But because preemptible RCU does not exist, map to rcu-sched. */ @@ -1517,11 +1468,11 @@ static int __init rcu_spawn_kthreads(void) for_each_possible_cpu(cpu) per_cpu(rcu_cpu_has_work, cpu) = 0; BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec)); - rnp = rcu_get_root(rcu_state); - (void)rcu_spawn_one_boost_kthread(rcu_state, rnp); + rnp = rcu_get_root(rcu_state_p); + (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp); if (NUM_RCU_NODES > 1) { - rcu_for_each_leaf_node(rcu_state, rnp) - (void)rcu_spawn_one_boost_kthread(rcu_state, rnp); + rcu_for_each_leaf_node(rcu_state_p, rnp) + (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp); } return 0; } @@ -1529,12 +1480,12 @@ early_initcall(rcu_spawn_kthreads); static void rcu_prepare_kthreads(int cpu) { - struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); + struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu); struct rcu_node *rnp = rdp->mynode; /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */ if (rcu_scheduler_fully_active) - (void)rcu_spawn_one_boost_kthread(rcu_state, rnp); + (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp); } #else /* #ifdef CONFIG_RCU_BOOST */ @@ -1586,11 +1537,13 @@ static void rcu_prepare_kthreads(int cpu) * Because we not have RCU_FAST_NO_HZ, just check whether this CPU needs * any flavor of RCU. */ +#ifndef CONFIG_RCU_NOCB_CPU_ALL int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) { *delta_jiffies = ULONG_MAX; return rcu_cpu_has_callbacks(cpu, NULL); } +#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ /* * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up @@ -1656,7 +1609,7 @@ extern int tick_nohz_active; * only if it has been awhile since the last time we did so. Afterwards, * if there are any callbacks ready for immediate invocation, return true. */ -static bool rcu_try_advance_all_cbs(void) +static bool __maybe_unused rcu_try_advance_all_cbs(void) { bool cbs_ready = false; struct rcu_data *rdp; @@ -1696,6 +1649,7 @@ static bool rcu_try_advance_all_cbs(void) * * The caller must have disabled interrupts. */ +#ifndef CONFIG_RCU_NOCB_CPU_ALL int rcu_needs_cpu(int cpu, unsigned long *dj) { struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); @@ -1726,6 +1680,7 @@ int rcu_needs_cpu(int cpu, unsigned long *dj) } return 0; } +#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ /* * Prepare a CPU for idle from an RCU perspective. The first major task @@ -1739,6 +1694,8 @@ int rcu_needs_cpu(int cpu, unsigned long *dj) */ static void rcu_prepare_for_idle(int cpu) { +#ifndef CONFIG_RCU_NOCB_CPU_ALL + bool needwake; struct rcu_data *rdp; struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); struct rcu_node *rnp; @@ -1787,9 +1744,12 @@ static void rcu_prepare_for_idle(int cpu) rnp = rdp->mynode; raw_spin_lock(&rnp->lock); /* irqs already disabled. */ smp_mb__after_unlock_lock(); - rcu_accelerate_cbs(rsp, rnp, rdp); + needwake = rcu_accelerate_cbs(rsp, rnp, rdp); raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + if (needwake) + rcu_gp_kthread_wake(rsp); } +#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ } /* @@ -1799,11 +1759,12 @@ static void rcu_prepare_for_idle(int cpu) */ static void rcu_cleanup_after_idle(int cpu) { - +#ifndef CONFIG_RCU_NOCB_CPU_ALL if (rcu_is_nocb_cpu(cpu)) return; if (rcu_try_advance_all_cbs()) invoke_rcu_core(); +#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ } /* @@ -1848,7 +1809,7 @@ static void rcu_oom_notify_cpu(void *unused) struct rcu_data *rdp; for_each_rcu_flavor(rsp) { - rdp = __this_cpu_ptr(rsp->rda); + rdp = raw_cpu_ptr(rsp->rda); if (rdp->qlen_lazy != 0) { atomic_inc(&oom_callback_count); rsp->call(&rdp->oom_head, rcu_oom_callback); @@ -1990,7 +1951,7 @@ static void increment_cpu_stall_ticks(void) struct rcu_state *rsp; for_each_rcu_flavor(rsp) - __this_cpu_ptr(rsp->rda)->ticks_this_gp++; + raw_cpu_inc(rsp->rda->ticks_this_gp); } #else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ @@ -2061,19 +2022,6 @@ static int __init parse_rcu_nocb_poll(char *arg) early_param("rcu_nocb_poll", parse_rcu_nocb_poll); /* - * Do any no-CBs CPUs need another grace period? - * - * Interrupts must be disabled. If the caller does not hold the root - * rnp_node structure's ->lock, the results are advisory only. - */ -static int rcu_nocb_needs_gp(struct rcu_state *rsp) -{ - struct rcu_node *rnp = rcu_get_root(rsp); - - return rnp->need_future_gp[(ACCESS_ONCE(rnp->completed) + 1) & 0x1]; -} - -/* * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended * grace period. */ @@ -2101,13 +2049,15 @@ static void rcu_init_one_nocb(struct rcu_node *rnp) init_waitqueue_head(&rnp->nocb_gp_wq[1]); } -/* Is the specified CPU a no-CPUs CPU? */ +#ifndef CONFIG_RCU_NOCB_CPU_ALL +/* Is the specified CPU a no-CBs CPU? */ bool rcu_is_nocb_cpu(int cpu) { if (have_rcu_nocb_mask) return cpumask_test_cpu(cpu, rcu_nocb_mask); return false; } +#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ /* * Enqueue the specified string of rcu_head structures onto the specified @@ -2234,12 +2184,15 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) unsigned long c; bool d; unsigned long flags; + bool needwake; struct rcu_node *rnp = rdp->mynode; raw_spin_lock_irqsave(&rnp->lock, flags); smp_mb__after_unlock_lock(); - c = rcu_start_future_gp(rnp, rdp); + needwake = rcu_start_future_gp(rnp, rdp, &c); raw_spin_unlock_irqrestore(&rnp->lock, flags); + if (needwake) + rcu_gp_kthread_wake(rdp->rsp); /* * Wait for the grace period. Do so interruptibly to avoid messing @@ -2393,11 +2346,6 @@ static bool init_nocb_callback_list(struct rcu_data *rdp) #else /* #ifdef CONFIG_RCU_NOCB_CPU */ -static int rcu_nocb_needs_gp(struct rcu_state *rsp) -{ - return 0; -} - static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) { } @@ -2456,7 +2404,7 @@ static bool init_nocb_callback_list(struct rcu_data *rdp) * if an adaptive-ticks CPU is failing to respond to the current grace * period and has not be idle from an RCU perspective, kick it. */ -static void rcu_kick_nohz_cpu(int cpu) +static void __maybe_unused rcu_kick_nohz_cpu(int cpu) { #ifdef CONFIG_NO_HZ_FULL if (tick_nohz_full_cpu(cpu)) @@ -2514,9 +2462,9 @@ static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq) /* Record start of fully idle period. */ j = jiffies; ACCESS_ONCE(rdtp->dynticks_idle_jiffies) = j; - smp_mb__before_atomic_inc(); + smp_mb__before_atomic(); atomic_inc(&rdtp->dynticks_idle); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); WARN_ON_ONCE(atomic_read(&rdtp->dynticks_idle) & 0x1); } @@ -2581,9 +2529,9 @@ static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq) } /* Record end of idle period. */ - smp_mb__before_atomic_inc(); + smp_mb__before_atomic(); atomic_inc(&rdtp->dynticks_idle); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks_idle) & 0x1)); /* @@ -2648,20 +2596,6 @@ static bool is_sysidle_rcu_state(struct rcu_state *rsp) } /* - * Bind the grace-period kthread for the sysidle flavor of RCU to the - * timekeeping CPU. - */ -static void rcu_bind_gp_kthread(void) -{ - int cpu = ACCESS_ONCE(tick_do_timer_cpu); - - if (cpu < 0 || cpu >= nr_cpu_ids) - return; - if (raw_smp_processor_id() != cpu) - set_cpus_allowed_ptr(current, cpumask_of(cpu)); -} - -/* * Return a delay in jiffies based on the number of CPUs, rcu_node * leaf fanout, and jiffies tick rate. The idea is to allow larger * systems more time to transition to full-idle state in order to @@ -2725,7 +2659,8 @@ static void rcu_sysidle(unsigned long j) static void rcu_sysidle_cancel(void) { smp_mb(); - ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_NOT; + if (full_sysidle_state > RCU_SYSIDLE_SHORT) + ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_NOT; } /* @@ -2871,10 +2806,6 @@ static bool is_sysidle_rcu_state(struct rcu_state *rsp) return false; } -static void rcu_bind_gp_kthread(void) -{ -} - static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, unsigned long maxj) { @@ -2893,7 +2824,7 @@ static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp) * CPU unless the grace period has extended for too long. * * This code relies on the fact that all NO_HZ_FULL CPUs are also - * CONFIG_RCU_NOCB_CPUs. + * CONFIG_RCU_NOCB_CPU CPUs. */ static bool rcu_nohz_full_cpu(struct rcu_state *rsp) { @@ -2905,3 +2836,19 @@ static bool rcu_nohz_full_cpu(struct rcu_state *rsp) #endif /* #ifdef CONFIG_NO_HZ_FULL */ return 0; } + +/* + * Bind the grace-period kthread for the sysidle flavor of RCU to the + * timekeeping CPU. + */ +static void rcu_bind_gp_kthread(void) +{ +#ifdef CONFIG_NO_HZ_FULL + int cpu = ACCESS_ONCE(tick_do_timer_cpu); + + if (cpu < 0 || cpu >= nr_cpu_ids) + return; + if (raw_smp_processor_id() != cpu) + set_cpus_allowed_ptr(current, cpumask_of(cpu)); +#endif /* #ifdef CONFIG_NO_HZ_FULL */ +} diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index 4def475336d..5cdc62e1bee 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -12,8 +12,8 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. * * Copyright IBM Corporation, 2008 * @@ -273,7 +273,7 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", rsp->n_force_qs, rsp->n_force_qs_ngp, rsp->n_force_qs - rsp->n_force_qs_ngp, - rsp->n_force_qs_lh, rsp->qlen_lazy, rsp->qlen); + ACCESS_ONCE(rsp->n_force_qs_lh), rsp->qlen_lazy, rsp->qlen); for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < rcu_num_nodes; rnp++) { if (rnp->level != level) { seq_puts(m, "\n"); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index c54609faf23..bc788357053 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -12,8 +12,8 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. * * Copyright IBM Corporation, 2001 * @@ -49,7 +49,6 @@ #include <linux/module.h> #define CREATE_TRACE_POINTS -#include <trace/events/rcu.h> #include "rcu.h" @@ -201,12 +200,12 @@ void wait_rcu_gp(call_rcu_func_t crf) EXPORT_SYMBOL_GPL(wait_rcu_gp); #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD -static inline void debug_init_rcu_head(struct rcu_head *head) +void init_rcu_head(struct rcu_head *head) { debug_object_init(head, &rcuhead_debug_descr); } -static inline void debug_rcu_head_free(struct rcu_head *head) +void destroy_rcu_head(struct rcu_head *head) { debug_object_free(head, &rcuhead_debug_descr); } @@ -321,6 +320,18 @@ int rcu_jiffies_till_stall_check(void) return till_stall_check * HZ + RCU_STALL_DELAY_DELTA; } +void rcu_sysrq_start(void) +{ + if (!rcu_cpu_stall_suppress) + rcu_cpu_stall_suppress = 2; +} + +void rcu_sysrq_end(void) +{ + if (rcu_cpu_stall_suppress == 2) + rcu_cpu_stall_suppress = 0; +} + static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) { rcu_cpu_stall_suppress = 1; diff --git a/kernel/reboot.c b/kernel/reboot.c index 662c83fc16b..a3a9e240fcd 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -388,15 +388,22 @@ static int __init reboot_setup(char *str) break; case 's': - if (isdigit(*(str+1))) - reboot_cpu = simple_strtoul(str+1, NULL, 0); - else if (str[1] == 'm' && str[2] == 'p' && - isdigit(*(str+3))) - reboot_cpu = simple_strtoul(str+3, NULL, 0); - else + { + int rc; + + if (isdigit(*(str+1))) { + rc = kstrtoint(str+1, 0, &reboot_cpu); + if (rc) + return rc; + } else if (str[1] == 'm' && str[2] == 'p' && + isdigit(*(str+3))) { + rc = kstrtoint(str+3, 0, &reboot_cpu); + if (rc) + return rc; + } else reboot_mode = REBOOT_SOFT; break; - + } case 'g': reboot_mode = REBOOT_GPIO; break; diff --git a/kernel/relay.c b/kernel/relay.c index 5001c9887db..5a56d3c8dc0 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -227,7 +227,7 @@ static void relay_destroy_buf(struct rchan_buf *buf) * relay_remove_buf - remove a channel buffer * @kref: target kernel reference that contains the relay buffer * - * Removes the file from the fileystem, which also frees the + * Removes the file from the filesystem, which also frees the * rchan_buf_struct and the channel buffer. Should only be called from * kref_put(). */ @@ -1195,8 +1195,6 @@ static void relay_pipe_buf_release(struct pipe_inode_info *pipe, static const struct pipe_buf_operations relay_pipe_buf_ops = { .can_merge = 0, - .map = generic_pipe_buf_map, - .unmap = generic_pipe_buf_unmap, .confirm = generic_pipe_buf_confirm, .release = relay_pipe_buf_release, .steal = generic_pipe_buf_steal, @@ -1253,7 +1251,7 @@ static ssize_t subbuf_splice_actor(struct file *in, subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT; pidx = (read_start / PAGE_SIZE) % subbuf_pages; poff = read_start & ~PAGE_MASK; - nr_pages = min_t(unsigned int, subbuf_pages, pipe->buffers); + nr_pages = min_t(unsigned int, subbuf_pages, spd.nr_pages_max); for (total_len = 0; spd.nr_pages < nr_pages; spd.nr_pages++) { unsigned int this_len, this_end, private; diff --git a/kernel/res_counter.c b/kernel/res_counter.c index 4aa8a305aed..e791130f85a 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c @@ -22,8 +22,18 @@ void res_counter_init(struct res_counter *counter, struct res_counter *parent) counter->parent = parent; } -int res_counter_charge_locked(struct res_counter *counter, unsigned long val, - bool force) +static u64 res_counter_uncharge_locked(struct res_counter *counter, + unsigned long val) +{ + if (WARN_ON(counter->usage < val)) + val = counter->usage; + + counter->usage -= val; + return counter->usage; +} + +static int res_counter_charge_locked(struct res_counter *counter, + unsigned long val, bool force) { int ret = 0; @@ -86,15 +96,6 @@ int res_counter_charge_nofail(struct res_counter *counter, unsigned long val, return __res_counter_charge(counter, val, limit_fail_at, true); } -u64 res_counter_uncharge_locked(struct res_counter *counter, unsigned long val) -{ - if (WARN_ON(counter->usage < val)) - val = counter->usage; - - counter->usage -= val; - return counter->usage; -} - u64 res_counter_uncharge_until(struct res_counter *counter, struct res_counter *top, unsigned long val) @@ -185,8 +186,11 @@ int res_counter_memparse_write_strategy(const char *buf, /* return RES_COUNTER_MAX(unlimited) if "-1" is specified */ if (*buf == '-') { - res = simple_strtoull(buf + 1, &end, 10); - if (res != 1 || *end != '\0') + int rc = kstrtoull(buf + 1, 10, &res); + + if (rc) + return rc; + if (res != 1) return -EINVAL; *resp = RES_COUNTER_MAX; return 0; diff --git a/kernel/resource.c b/kernel/resource.c index a8344dda704..3c2237ac32d 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -511,7 +511,7 @@ static int find_resource(struct resource *root, struct resource *new, * @newsize: new size of the resource descriptor * @constraint: the size and alignment constraints to be met. */ -int reallocate_resource(struct resource *root, struct resource *old, +static int reallocate_resource(struct resource *root, struct resource *old, resource_size_t newsize, struct resource_constraint *constraint) { @@ -945,8 +945,8 @@ struct resource * __request_region(struct resource *parent, res->name = name; res->start = start; res->end = start + n - 1; - res->flags = IORESOURCE_BUSY; - res->flags |= flags; + res->flags = resource_type(parent); + res->flags |= IORESOURCE_BUSY | flags; write_lock(&resource_lock); @@ -1288,13 +1288,10 @@ int iomem_map_sanity_check(resource_size_t addr, unsigned long size) if (p->flags & IORESOURCE_BUSY) continue; - printk(KERN_WARNING "resource map sanity check conflict: " - "0x%llx 0x%llx 0x%llx 0x%llx %s\n", + printk(KERN_WARNING "resource sanity check: requesting [mem %#010llx-%#010llx], which spans more than %s %pR\n", (unsigned long long)addr, (unsigned long long)(addr + size - 1), - (unsigned long long)p->start, - (unsigned long long)p->end, - p->name); + p->name, p); err = -1; break; } diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 9a95c8c2af2..ab32b7b0db5 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -13,7 +13,7 @@ endif obj-y += core.o proc.o clock.o cputime.o obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o -obj-y += wait.o completion.o +obj-y += wait.o completion.o idle.o obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index 4a073539c58..e73efba9830 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c @@ -203,7 +203,7 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice) struct autogroup *ag; int err; - if (nice < -20 || nice > 19) + if (nice < MIN_NICE || nice > MAX_NICE) return -EINVAL; err = security_task_setnice(current, nice); diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index 43c2bcc3576..3ef6451e972 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -60,13 +60,14 @@ #include <linux/sched.h> #include <linux/static_key.h> #include <linux/workqueue.h> +#include <linux/compiler.h> /* * Scheduler clock - returns current time in nanosec units. * This is default implementation. * Architectures and sub-architectures can override this. */ -unsigned long long __attribute__((weak)) sched_clock(void) +unsigned long long __weak sched_clock(void) { return (unsigned long long)(jiffies - INITIAL_JIFFIES) * (NSEC_PER_SEC / HZ); @@ -301,14 +302,14 @@ u64 sched_clock_cpu(int cpu) if (unlikely(!sched_clock_running)) return 0ull; - preempt_disable(); + preempt_disable_notrace(); scd = cpu_sdc(cpu); if (cpu != smp_processor_id()) clock = sched_clock_remote(scd); else clock = sched_clock_local(scd); - preempt_enable(); + preempt_enable_notrace(); return clock; } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b46131ef6aa..bc1638b3344 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -73,6 +73,7 @@ #include <linux/init_task.h> #include <linux/binfmts.h> #include <linux/context_tracking.h> +#include <linux/compiler.h> #include <asm/switch_to.h> #include <asm/tlb.h> @@ -89,6 +90,22 @@ #define CREATE_TRACE_POINTS #include <trace/events/sched.h> +#ifdef smp_mb__before_atomic +void __smp_mb__before_atomic(void) +{ + smp_mb__before_atomic(); +} +EXPORT_SYMBOL(__smp_mb__before_atomic); +#endif + +#ifdef smp_mb__after_atomic +void __smp_mb__after_atomic(void) +{ + smp_mb__after_atomic(); +} +EXPORT_SYMBOL(__smp_mb__after_atomic); +#endif + void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) { unsigned long delta; @@ -432,7 +449,7 @@ void hrtick_start(struct rq *rq, u64 delay) if (rq == this_rq()) { __hrtick_restart(rq); } else if (!rq->hrtick_csd_pending) { - __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0); + smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd); rq->hrtick_csd_pending = 1; } } @@ -505,6 +522,71 @@ static inline void init_hrtick(void) #endif /* CONFIG_SCHED_HRTICK */ /* + * cmpxchg based fetch_or, macro so it works for different integer types + */ +#define fetch_or(ptr, val) \ +({ typeof(*(ptr)) __old, __val = *(ptr); \ + for (;;) { \ + __old = cmpxchg((ptr), __val, __val | (val)); \ + if (__old == __val) \ + break; \ + __val = __old; \ + } \ + __old; \ +}) + +#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG) +/* + * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, + * this avoids any races wrt polling state changes and thereby avoids + * spurious IPIs. + */ +static bool set_nr_and_not_polling(struct task_struct *p) +{ + struct thread_info *ti = task_thread_info(p); + return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); +} + +/* + * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set. + * + * If this returns true, then the idle task promises to call + * sched_ttwu_pending() and reschedule soon. + */ +static bool set_nr_if_polling(struct task_struct *p) +{ + struct thread_info *ti = task_thread_info(p); + typeof(ti->flags) old, val = ACCESS_ONCE(ti->flags); + + for (;;) { + if (!(val & _TIF_POLLING_NRFLAG)) + return false; + if (val & _TIF_NEED_RESCHED) + return true; + old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED); + if (old == val) + break; + val = old; + } + return true; +} + +#else +static bool set_nr_and_not_polling(struct task_struct *p) +{ + set_tsk_need_resched(p); + return true; +} + +#ifdef CONFIG_SMP +static bool set_nr_if_polling(struct task_struct *p) +{ + return false; +} +#endif +#endif + +/* * resched_task - mark a task 'to be rescheduled now'. * * On UP this means the setting of the need_resched flag, on SMP it @@ -520,18 +602,18 @@ void resched_task(struct task_struct *p) if (test_tsk_need_resched(p)) return; - set_tsk_need_resched(p); - cpu = task_cpu(p); + if (cpu == smp_processor_id()) { + set_tsk_need_resched(p); set_preempt_need_resched(); return; } - /* NEED_RESCHED must be visible before we test polling */ - smp_mb(); - if (!tsk_is_polling(p)) + if (set_nr_and_not_polling(p)) smp_send_reschedule(cpu); + else + trace_sched_wake_idle_without_ipi(cpu); } void resched_cpu(int cpu) @@ -555,12 +637,15 @@ void resched_cpu(int cpu) * selecting an idle cpu will add more delays to the timers than intended * (as that cpu's timer base may not be uptodate wrt jiffies etc). */ -int get_nohz_timer_target(void) +int get_nohz_timer_target(int pinned) { int cpu = smp_processor_id(); int i; struct sched_domain *sd; + if (pinned || !get_sysctl_timer_migration() || !idle_cpu(cpu)) + return cpu; + rcu_read_lock(); for_each_domain(cpu, sd) { for_each_cpu(i, sched_domain_span(sd)) { @@ -591,27 +676,10 @@ static void wake_up_idle_cpu(int cpu) if (cpu == smp_processor_id()) return; - /* - * This is safe, as this function is called with the timer - * wheel base lock of (cpu) held. When the CPU is on the way - * to idle and has not yet set rq->curr to idle then it will - * be serialized on the timer wheel base lock and take the new - * timer into account automatically. - */ - if (rq->curr != rq->idle) - return; - - /* - * We can set TIF_RESCHED on the idle task of the other CPU - * lockless. The worst case is that the other CPU runs the - * idle task through an additional NOOP schedule() - */ - set_tsk_need_resched(rq->idle); - - /* NEED_RESCHED must be visible before we test polling */ - smp_mb(); - if (!tsk_is_polling(rq->idle)) + if (set_nr_and_not_polling(rq->idle)) smp_send_reschedule(cpu); + else + trace_sched_wake_idle_without_ipi(cpu); } static bool wake_up_full_nohz_cpu(int cpu) @@ -823,19 +891,13 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) #endif #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING if (static_key_false((¶virt_steal_rq_enabled))) { - u64 st; - steal = paravirt_steal_clock(cpu_of(rq)); steal -= rq->prev_steal_time_rq; if (unlikely(steal > delta)) steal = delta; - st = steal_ticks(steal); - steal = st * TICK_NSEC; - rq->prev_steal_time_rq += steal; - delta -= steal; } #endif @@ -843,7 +905,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) rq->clock_task += delta; #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) - if ((irq_delta + steal) && sched_feat(NONTASK_POWER)) + if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY)) sched_rt_avg_update(rq, irq_delta + steal); #endif } @@ -1322,7 +1384,7 @@ out: * leave kernel. */ if (p->mm && printk_ratelimit()) { - printk_sched("process %d (%s) no longer affine to cpu%d\n", + printk_deferred("process %d (%s) no longer affine to cpu%d\n", task_pid_nr(p), p->comm, cpu); } } @@ -1476,13 +1538,17 @@ static int ttwu_remote(struct task_struct *p, int wake_flags) } #ifdef CONFIG_SMP -static void sched_ttwu_pending(void) +void sched_ttwu_pending(void) { struct rq *rq = this_rq(); struct llist_node *llist = llist_del_all(&rq->wake_list); struct task_struct *p; + unsigned long flags; - raw_spin_lock(&rq->lock); + if (!llist) + return; + + raw_spin_lock_irqsave(&rq->lock, flags); while (llist) { p = llist_entry(llist, struct task_struct, wake_entry); @@ -1490,7 +1556,7 @@ static void sched_ttwu_pending(void) ttwu_do_activate(rq, p, 0); } - raw_spin_unlock(&rq->lock); + raw_spin_unlock_irqrestore(&rq->lock, flags); } void scheduler_ipi(void) @@ -1536,8 +1602,14 @@ void scheduler_ipi(void) static void ttwu_queue_remote(struct task_struct *p, int cpu) { - if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) - smp_send_reschedule(cpu); + struct rq *rq = cpu_rq(cpu); + + if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) { + if (!set_nr_if_polling(rq->idle)) + smp_send_reschedule(cpu); + else + trace_sched_wake_idle_without_ipi(cpu); + } } bool cpus_share_cache(int this_cpu, int that_cpu) @@ -1745,8 +1817,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; p->numa_scan_period = sysctl_numa_balancing_scan_delay; p->numa_work.next = &p->numa_work; - p->numa_faults = NULL; - p->numa_faults_buffer = NULL; + p->numa_faults_memory = NULL; + p->numa_faults_buffer_memory = NULL; + p->last_task_numa_placement = 0; + p->last_sum_exec_runtime = 0; INIT_LIST_HEAD(&p->numa_entry); p->numa_group = NULL; @@ -1952,7 +2026,7 @@ static int dl_overflow(struct task_struct *p, int policy, { struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); - u64 period = attr->sched_period; + u64 period = attr->sched_period ?: attr->sched_deadline; u64 runtime = attr->sched_runtime; u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0; int cpus, err = -1; @@ -2149,8 +2223,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) if (mm) mmdrop(mm); if (unlikely(prev_state == TASK_DEAD)) { - task_numa_free(prev); - if (prev->sched_class->task_dead) prev->sched_class->task_dead(prev); @@ -2167,13 +2239,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) #ifdef CONFIG_SMP -/* assumes rq->lock is held */ -static inline void pre_schedule(struct rq *rq, struct task_struct *prev) -{ - if (prev->sched_class->pre_schedule) - prev->sched_class->pre_schedule(rq, prev); -} - /* rq->lock is NOT held, but preemption is disabled */ static inline void post_schedule(struct rq *rq) { @@ -2191,10 +2256,6 @@ static inline void post_schedule(struct rq *rq) #else -static inline void pre_schedule(struct rq *rq, struct task_struct *p) -{ -} - static inline void post_schedule(struct rq *rq) { } @@ -2205,7 +2266,7 @@ static inline void post_schedule(struct rq *rq) * schedule_tail - first thing a freshly forked thread must call. * @prev: the thread we just switched away from. */ -asmlinkage void schedule_tail(struct task_struct *prev) +asmlinkage __visible void schedule_tail(struct task_struct *prev) __releases(rq->lock) { struct rq *rq = this_rq(); @@ -2493,7 +2554,7 @@ notrace unsigned long get_parent_ip(unsigned long addr) #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ defined(CONFIG_PREEMPT_TRACER)) -void __kprobes preempt_count_add(int val) +void preempt_count_add(int val) { #ifdef CONFIG_DEBUG_PREEMPT /* @@ -2510,12 +2571,18 @@ void __kprobes preempt_count_add(int val) DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK - 10); #endif - if (preempt_count() == val) - trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); + if (preempt_count() == val) { + unsigned long ip = get_parent_ip(CALLER_ADDR1); +#ifdef CONFIG_DEBUG_PREEMPT + current->preempt_disable_ip = ip; +#endif + trace_preempt_off(CALLER_ADDR0, ip); + } } EXPORT_SYMBOL(preempt_count_add); +NOKPROBE_SYMBOL(preempt_count_add); -void __kprobes preempt_count_sub(int val) +void preempt_count_sub(int val) { #ifdef CONFIG_DEBUG_PREEMPT /* @@ -2536,6 +2603,7 @@ void __kprobes preempt_count_sub(int val) __preempt_count_sub(val); } EXPORT_SYMBOL(preempt_count_sub); +NOKPROBE_SYMBOL(preempt_count_sub); #endif @@ -2554,6 +2622,13 @@ static noinline void __schedule_bug(struct task_struct *prev) print_modules(); if (irqs_disabled()) print_irqtrace_events(prev); +#ifdef CONFIG_DEBUG_PREEMPT + if (in_atomic_preempt_off()) { + pr_err("Preemption disabled at:"); + print_ip_sym(current->preempt_disable_ip); + pr_cont("\n"); + } +#endif dump_stack(); add_taint(TAINT_WARN, LOCKDEP_STILL_OK); } @@ -2577,36 +2652,40 @@ static inline void schedule_debug(struct task_struct *prev) schedstat_inc(this_rq(), sched_count); } -static void put_prev_task(struct rq *rq, struct task_struct *prev) -{ - if (prev->on_rq || rq->skip_clock_update < 0) - update_rq_clock(rq); - prev->sched_class->put_prev_task(rq, prev); -} - /* * Pick up the highest-prio task: */ static inline struct task_struct * -pick_next_task(struct rq *rq) +pick_next_task(struct rq *rq, struct task_struct *prev) { - const struct sched_class *class; + const struct sched_class *class = &fair_sched_class; struct task_struct *p; /* * Optimization: we know that if all tasks are in * the fair class we can call that function directly: */ - if (likely(rq->nr_running == rq->cfs.h_nr_running)) { - p = fair_sched_class.pick_next_task(rq); - if (likely(p)) - return p; + if (likely(prev->sched_class == class && + rq->nr_running == rq->cfs.h_nr_running)) { + p = fair_sched_class.pick_next_task(rq, prev); + if (unlikely(p == RETRY_TASK)) + goto again; + + /* assumes fair_sched_class->next == idle_sched_class */ + if (unlikely(!p)) + p = idle_sched_class.pick_next_task(rq, prev); + + return p; } +again: for_each_class(class) { - p = class->pick_next_task(rq); - if (p) + p = class->pick_next_task(rq, prev); + if (p) { + if (unlikely(p == RETRY_TASK)) + goto again; return p; + } } BUG(); /* the idle class will always have a runnable task */ @@ -2700,13 +2779,10 @@ need_resched: switch_count = &prev->nvcsw; } - pre_schedule(rq, prev); - - if (unlikely(!rq->nr_running)) - idle_balance(cpu, rq); + if (prev->on_rq || rq->skip_clock_update < 0) + update_rq_clock(rq); - put_prev_task(rq, prev); - next = pick_next_task(rq); + next = pick_next_task(rq, prev); clear_tsk_need_resched(prev); clear_preempt_need_resched(); rq->skip_clock_update = 0; @@ -2747,7 +2823,7 @@ static inline void sched_submit_work(struct task_struct *tsk) blk_schedule_flush_plug(tsk); } -asmlinkage void __sched schedule(void) +asmlinkage __visible void __sched schedule(void) { struct task_struct *tsk = current; @@ -2757,7 +2833,7 @@ asmlinkage void __sched schedule(void) EXPORT_SYMBOL(schedule); #ifdef CONFIG_CONTEXT_TRACKING -asmlinkage void __sched schedule_user(void) +asmlinkage __visible void __sched schedule_user(void) { /* * If we come here after a random call to set_need_resched(), @@ -2789,7 +2865,7 @@ void __sched schedule_preempt_disabled(void) * off of preempt_enable. Kernel preemptions off return from interrupt * occur there and call schedule directly. */ -asmlinkage void __sched notrace preempt_schedule(void) +asmlinkage __visible void __sched notrace preempt_schedule(void) { /* * If there is a non-zero preempt_count or interrupts are disabled, @@ -2810,6 +2886,7 @@ asmlinkage void __sched notrace preempt_schedule(void) barrier(); } while (need_resched()); } +NOKPROBE_SYMBOL(preempt_schedule); EXPORT_SYMBOL(preempt_schedule); #endif /* CONFIG_PREEMPT */ @@ -2819,7 +2896,7 @@ EXPORT_SYMBOL(preempt_schedule); * Note, that this is called and return with irqs disabled. This will * protect us against recursive calling from irq. */ -asmlinkage void __sched preempt_schedule_irq(void) +asmlinkage __visible void __sched preempt_schedule_irq(void) { enum ctx_state prev_state; @@ -2852,52 +2929,6 @@ int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, } EXPORT_SYMBOL(default_wake_function); -static long __sched -sleep_on_common(wait_queue_head_t *q, int state, long timeout) -{ - unsigned long flags; - wait_queue_t wait; - - init_waitqueue_entry(&wait, current); - - __set_current_state(state); - - spin_lock_irqsave(&q->lock, flags); - __add_wait_queue(q, &wait); - spin_unlock(&q->lock); - timeout = schedule_timeout(timeout); - spin_lock_irq(&q->lock); - __remove_wait_queue(q, &wait); - spin_unlock_irqrestore(&q->lock, flags); - - return timeout; -} - -void __sched interruptible_sleep_on(wait_queue_head_t *q) -{ - sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); -} -EXPORT_SYMBOL(interruptible_sleep_on); - -long __sched -interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) -{ - return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout); -} -EXPORT_SYMBOL(interruptible_sleep_on_timeout); - -void __sched sleep_on(wait_queue_head_t *q) -{ - sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); -} -EXPORT_SYMBOL(sleep_on); - -long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) -{ - return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout); -} -EXPORT_SYMBOL(sleep_on_timeout); - #ifdef CONFIG_RT_MUTEXES /* @@ -2908,7 +2939,8 @@ EXPORT_SYMBOL(sleep_on_timeout); * This function changes the 'effective' priority of a task. It does * not touch ->normal_prio like __setscheduler(). * - * Used by the rt_mutex code to implement priority inheritance logic. + * Used by the rt_mutex code to implement priority inheritance + * logic. Call site only calls if the priority of the task changed. */ void rt_mutex_setprio(struct task_struct *p, int prio) { @@ -2998,7 +3030,7 @@ void set_user_nice(struct task_struct *p, long nice) unsigned long flags; struct rq *rq; - if (TASK_NICE(p) == nice || nice < -20 || nice > 19) + if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) return; /* * We have to be careful, if called from sys_setpriority(), @@ -3047,7 +3079,7 @@ EXPORT_SYMBOL(set_user_nice); int can_nice(const struct task_struct *p, const int nice) { /* convert nice value [19,-20] to rlimit style value [1,40] */ - int nice_rlim = 20 - nice; + int nice_rlim = nice_to_rlimit(nice); return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || capable(CAP_SYS_NICE)); @@ -3071,17 +3103,10 @@ SYSCALL_DEFINE1(nice, int, increment) * We don't have to worry. Conceptually one call occurs first * and we have a single winner. */ - if (increment < -40) - increment = -40; - if (increment > 40) - increment = 40; - - nice = TASK_NICE(current) + increment; - if (nice < -20) - nice = -20; - if (nice > 19) - nice = 19; + increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH); + nice = task_nice(current) + increment; + nice = clamp_val(nice, MIN_NICE, MAX_NICE); if (increment < 0 && !can_nice(current, nice)) return -EPERM; @@ -3109,18 +3134,6 @@ int task_prio(const struct task_struct *p) } /** - * task_nice - return the nice value of a given task. - * @p: the task in question. - * - * Return: The nice value [ -20 ... 0 ... 19 ]. - */ -int task_nice(const struct task_struct *p) -{ - return TASK_NICE(p); -} -EXPORT_SYMBOL(task_nice); - -/** * idle_cpu - is a given cpu idle currently? * @cpu: the processor in question. * @@ -3187,11 +3200,11 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr) dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); dl_se->dl_throttled = 0; dl_se->dl_new = 1; + dl_se->dl_yielded = 0; } -/* Actually do priority change: must hold pi & rq lock. */ -static void __setscheduler(struct rq *rq, struct task_struct *p, - const struct sched_attr *attr) +static void __setscheduler_params(struct task_struct *p, + const struct sched_attr *attr) { int policy = attr->sched_policy; @@ -3211,9 +3224,21 @@ static void __setscheduler(struct rq *rq, struct task_struct *p, * getparam()/getattr() don't report silly values for !rt tasks. */ p->rt_priority = attr->sched_priority; - p->normal_prio = normal_prio(p); - p->prio = rt_mutex_getprio(p); + set_load_weight(p); +} + +/* Actually do priority change: must hold pi & rq lock. */ +static void __setscheduler(struct rq *rq, struct task_struct *p, + const struct sched_attr *attr) +{ + __setscheduler_params(p, attr); + + /* + * If we get here, there was no pi waiters boosting the + * task. It is safe to use the normal prio. + */ + p->prio = normal_prio(p); if (dl_prio(p->prio)) p->sched_class = &dl_sched_class; @@ -3221,8 +3246,6 @@ static void __setscheduler(struct rq *rq, struct task_struct *p, p->sched_class = &rt_sched_class; else p->sched_class = &fair_sched_class; - - set_load_weight(p); } static void @@ -3242,17 +3265,40 @@ __getparam_dl(struct task_struct *p, struct sched_attr *attr) * We ask for the deadline not being zero, and greater or equal * than the runtime, as well as the period of being zero or * greater than deadline. Furthermore, we have to be sure that - * user parameters are above the internal resolution (1us); we - * check sched_runtime only since it is always the smaller one. + * user parameters are above the internal resolution of 1us (we + * check sched_runtime only since it is always the smaller one) and + * below 2^63 ns (we have to check both sched_deadline and + * sched_period, as the latter can be zero). */ static bool __checkparam_dl(const struct sched_attr *attr) { - return attr && attr->sched_deadline != 0 && - (attr->sched_period == 0 || - (s64)(attr->sched_period - attr->sched_deadline) >= 0) && - (s64)(attr->sched_deadline - attr->sched_runtime ) >= 0 && - attr->sched_runtime >= (2 << (DL_SCALE - 1)); + /* deadline != 0 */ + if (attr->sched_deadline == 0) + return false; + + /* + * Since we truncate DL_SCALE bits, make sure we're at least + * that big. + */ + if (attr->sched_runtime < (1ULL << DL_SCALE)) + return false; + + /* + * Since we use the MSB for wrap-around and sign issues, make + * sure it's not set (mind that period can be equal to zero). + */ + if (attr->sched_deadline & (1ULL << 63) || + attr->sched_period & (1ULL << 63)) + return false; + + /* runtime <= deadline <= period (if period != 0) */ + if ((attr->sched_period != 0 && + attr->sched_period < attr->sched_deadline) || + attr->sched_deadline < attr->sched_runtime) + return false; + + return true; } /* @@ -3275,6 +3321,8 @@ static int __sched_setscheduler(struct task_struct *p, const struct sched_attr *attr, bool user) { + int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 : + MAX_RT_PRIO - 1 - attr->sched_priority; int retval, oldprio, oldpolicy = -1, on_rq, running; int policy = attr->sched_policy; unsigned long flags; @@ -3319,7 +3367,7 @@ recheck: */ if (user && !capable(CAP_SYS_NICE)) { if (fair_policy(policy)) { - if (attr->sched_nice < TASK_NICE(p) && + if (attr->sched_nice < task_nice(p) && !can_nice(p, attr->sched_nice)) return -EPERM; } @@ -3338,12 +3386,21 @@ recheck: return -EPERM; } + /* + * Can't set/change SCHED_DEADLINE policy at all for now + * (safest behavior); in the future we would like to allow + * unprivileged DL tasks to increase their relative deadline + * or reduce their runtime (both ways reducing utilization) + */ + if (dl_policy(policy)) + return -EPERM; + /* * Treat SCHED_IDLE as nice 20. Only allow a switch to * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. */ if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) { - if (!can_nice(p, TASK_NICE(p))) + if (!can_nice(p, task_nice(p))) return -EPERM; } @@ -3380,16 +3437,18 @@ recheck: } /* - * If not changing anything there's no need to proceed further: + * If not changing anything there's no need to proceed further, + * but store a possible modification of reset_on_fork. */ if (unlikely(policy == p->policy)) { - if (fair_policy(policy) && attr->sched_nice != TASK_NICE(p)) + if (fair_policy(policy) && attr->sched_nice != task_nice(p)) goto change; if (rt_policy(policy) && attr->sched_priority != p->rt_priority) goto change; if (dl_policy(policy)) goto change; + p->sched_reset_on_fork = reset_on_fork; task_rq_unlock(rq, p, &flags); return 0; } @@ -3443,6 +3502,24 @@ change: return -EBUSY; } + p->sched_reset_on_fork = reset_on_fork; + oldprio = p->prio; + + /* + * Special case for priority boosted tasks. + * + * If the new priority is lower or equal (user space view) + * than the current (boosted) priority, we just store the new + * normal parameters and do not touch the scheduler class and + * the runqueue. This will be done when the task deboost + * itself. + */ + if (rt_mutex_check_prio(p, newprio)) { + __setscheduler_params(p, attr); + task_rq_unlock(rq, p, &flags); + return 0; + } + on_rq = p->on_rq; running = task_current(rq, p); if (on_rq) @@ -3450,16 +3527,18 @@ change: if (running) p->sched_class->put_prev_task(rq, p); - p->sched_reset_on_fork = reset_on_fork; - - oldprio = p->prio; prev_class = p->sched_class; __setscheduler(rq, p, attr); if (running) p->sched_class->set_curr_task(rq); - if (on_rq) - enqueue_task(rq, p, 0); + if (on_rq) { + /* + * We enqueue to tail when the priority of a task is + * increased (user space view). + */ + enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0); + } check_class_changed(rq, p, prev_class, oldprio); task_rq_unlock(rq, p, &flags); @@ -3615,15 +3694,13 @@ static int sched_copy_attr(struct sched_attr __user *uattr, * XXX: do we want to be lenient like existing syscalls; or do we want * to be strict and return an error on out-of-bounds values? */ - attr->sched_nice = clamp(attr->sched_nice, -20, 19); + attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE); -out: - return ret; + return 0; err_size: put_user(sizeof(*attr), &uattr->size); - ret = -E2BIG; - goto out; + return -E2BIG; } /** @@ -3660,18 +3737,24 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) * sys_sched_setattr - same as above, but with extended sched_attr * @pid: the pid in question. * @uattr: structure containing the extended parameters. + * @flags: for future extension. */ -SYSCALL_DEFINE2(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr) +SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, + unsigned int, flags) { struct sched_attr attr; struct task_struct *p; int retval; - if (!uattr || pid < 0) + if (!uattr || pid < 0 || flags) return -EINVAL; - if (sched_copy_attr(uattr, &attr)) - return -EFAULT; + retval = sched_copy_attr(uattr, &attr); + if (retval) + return retval; + + if ((int)attr.sched_policy < 0) + return -EINVAL; rcu_read_lock(); retval = -ESRCH; @@ -3721,7 +3804,7 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) */ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) { - struct sched_param lp; + struct sched_param lp = { .sched_priority = 0 }; struct task_struct *p; int retval; @@ -3738,11 +3821,8 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) if (retval) goto out_unlock; - if (task_has_dl_policy(p)) { - retval = -EINVAL; - goto out_unlock; - } - lp.sched_priority = p->rt_priority; + if (task_has_rt_policy(p)) + lp.sched_priority = p->rt_priority; rcu_read_unlock(); /* @@ -3780,22 +3860,17 @@ static int sched_read_attr(struct sched_attr __user *uattr, for (; addr < end; addr++) { if (*addr) - goto err_size; + return -EFBIG; } attr->size = usize; } - ret = copy_to_user(uattr, attr, usize); + ret = copy_to_user(uattr, attr, attr->size); if (ret) return -EFAULT; -out: - return ret; - -err_size: - ret = -E2BIG; - goto out; + return 0; } /** @@ -3803,9 +3878,10 @@ err_size: * @pid: the pid in question. * @uattr: structure containing the extended parameters. * @size: sizeof(attr) for fwd/bwd comp. + * @flags: for future extension. */ -SYSCALL_DEFINE3(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, - unsigned int, size) +SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, + unsigned int, size, unsigned int, flags) { struct sched_attr attr = { .size = sizeof(struct sched_attr), @@ -3814,7 +3890,7 @@ SYSCALL_DEFINE3(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, int retval; if (!uattr || pid < 0 || size > PAGE_SIZE || - size < SCHED_ATTR_SIZE_VER0) + size < SCHED_ATTR_SIZE_VER0 || flags) return -EINVAL; rcu_read_lock(); @@ -3835,7 +3911,7 @@ SYSCALL_DEFINE3(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, else if (task_has_rt_policy(p)) attr.sched_priority = p->rt_priority; else - attr.sched_nice = TASK_NICE(p); + attr.sched_nice = task_nice(p); rcu_read_unlock(); @@ -4165,7 +4241,7 @@ EXPORT_SYMBOL(yield); * false (0) if we failed to boost the target. * -ESRCH if there's no task to yield to. */ -bool __sched yield_to(struct task_struct *p, bool preempt) +int __sched yield_to(struct task_struct *p, bool preempt) { struct task_struct *curr = current; struct rq *rq, *p_rq; @@ -4473,6 +4549,7 @@ void init_idle(struct task_struct *idle, int cpu) rcu_read_unlock(); rq->curr = rq->idle = idle; + idle->on_rq = 1; #if defined(CONFIG_SMP) idle->on_cpu = 1; #endif @@ -4692,8 +4769,10 @@ void idle_task_exit(void) BUG_ON(cpu_online(smp_processor_id())); - if (mm != &init_mm) + if (mm != &init_mm) { switch_mm(mm, &init_mm, current); + finish_arch_post_lock_switch(); + } mmdrop(mm); } @@ -4711,6 +4790,22 @@ static void calc_load_migrate(struct rq *rq) atomic_long_add(delta, &calc_load_tasks); } +static void put_prev_task_fake(struct rq *rq, struct task_struct *prev) +{ +} + +static const struct sched_class fake_sched_class = { + .put_prev_task = put_prev_task_fake, +}; + +static struct task_struct fake_task = { + /* + * Avoid pull_{rt,dl}_task() + */ + .prio = MAX_PRIO + 1, + .sched_class = &fake_sched_class, +}; + /* * Migrate all tasks from the rq, sleeping tasks will be migrated by * try_to_wake_up()->select_task_rq(). @@ -4751,7 +4846,7 @@ static void migrate_tasks(unsigned int dead_cpu) if (rq->nr_running == 1) break; - next = pick_next_task(rq); + next = pick_next_task(rq, &fake_task); BUG_ON(!next); next->sched_class->put_prev_task(rq, next); @@ -4841,7 +4936,7 @@ set_table_entry(struct ctl_table *entry, static struct ctl_table * sd_alloc_ctl_domain_table(struct sched_domain *sd) { - struct ctl_table *table = sd_alloc_ctl_entry(13); + struct ctl_table *table = sd_alloc_ctl_entry(14); if (table == NULL) return NULL; @@ -4869,9 +4964,12 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd) sizeof(int), 0644, proc_dointvec_minmax, false); set_table_entry(&table[10], "flags", &sd->flags, sizeof(int), 0644, proc_dointvec_minmax, false); - set_table_entry(&table[11], "name", sd->name, + set_table_entry(&table[11], "max_newidle_lb_cost", + &sd->max_newidle_lb_cost, + sizeof(long), 0644, proc_doulongvec_minmax, false); + set_table_entry(&table[12], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring, false); - /* &table[12] is terminator */ + /* &table[13] is terminator */ return table; } @@ -5037,11 +5135,20 @@ static struct notifier_block migration_notifier = { .priority = CPU_PRI_MIGRATION, }; +static void __cpuinit set_cpu_rq_start_time(void) +{ + int cpu = smp_processor_id(); + struct rq *rq = cpu_rq(cpu); + rq->age_stamp = sched_clock_cpu(cpu); +} + static int sched_cpu_active(struct notifier_block *nfb, unsigned long action, void *hcpu) { switch (action & ~CPU_TASKS_FROZEN) { case CPU_STARTING: + set_cpu_rq_start_time(); + return NOTIFY_OK; case CPU_DOWN_FAILED: set_cpu_active((long)hcpu, true); return NOTIFY_OK; @@ -5160,14 +5267,13 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, } /* - * Even though we initialize ->power to something semi-sane, - * we leave power_orig unset. This allows us to detect if + * Even though we initialize ->capacity to something semi-sane, + * we leave capacity_orig unset. This allows us to detect if * domain iteration is still funny without causing /0 traps. */ - if (!group->sgp->power_orig) { + if (!group->sgc->capacity_orig) { printk(KERN_CONT "\n"); - printk(KERN_ERR "ERROR: domain->cpu_power not " - "set\n"); + printk(KERN_ERR "ERROR: domain->cpu_capacity not set\n"); break; } @@ -5189,9 +5295,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); printk(KERN_CONT " %s", str); - if (group->sgp->power != SCHED_POWER_SCALE) { - printk(KERN_CONT " (cpu_power = %d)", - group->sgp->power); + if (group->sgc->capacity != SCHED_CAPACITY_SCALE) { + printk(KERN_CONT " (cpu_capacity = %d)", + group->sgc->capacity); } group = group->next; @@ -5249,8 +5355,9 @@ static int sd_degenerate(struct sched_domain *sd) SD_BALANCE_NEWIDLE | SD_BALANCE_FORK | SD_BALANCE_EXEC | - SD_SHARE_CPUPOWER | - SD_SHARE_PKG_RESOURCES)) { + SD_SHARE_CPUCAPACITY | + SD_SHARE_PKG_RESOURCES | + SD_SHARE_POWERDOMAIN)) { if (sd->groups != sd->groups->next) return 0; } @@ -5279,9 +5386,10 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) SD_BALANCE_NEWIDLE | SD_BALANCE_FORK | SD_BALANCE_EXEC | - SD_SHARE_CPUPOWER | + SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES | - SD_PREFER_SIBLING); + SD_PREFER_SIBLING | + SD_SHARE_POWERDOMAIN); if (nr_node_ids == 1) pflags &= ~SD_SERIALIZE; } @@ -5403,7 +5511,7 @@ static struct root_domain *alloc_rootdomain(void) return rd; } -static void free_sched_groups(struct sched_group *sg, int free_sgp) +static void free_sched_groups(struct sched_group *sg, int free_sgc) { struct sched_group *tmp, *first; @@ -5414,8 +5522,8 @@ static void free_sched_groups(struct sched_group *sg, int free_sgp) do { tmp = sg->next; - if (free_sgp && atomic_dec_and_test(&sg->sgp->ref)) - kfree(sg->sgp); + if (free_sgc && atomic_dec_and_test(&sg->sgc->ref)) + kfree(sg->sgc); kfree(sg); sg = tmp; @@ -5433,7 +5541,7 @@ static void free_sched_domain(struct rcu_head *rcu) if (sd->flags & SD_OVERLAP) { free_sched_groups(sd->groups, 1); } else if (atomic_dec_and_test(&sd->groups->ref)) { - kfree(sd->groups->sgp); + kfree(sd->groups->sgc); kfree(sd->groups); } kfree(sd); @@ -5555,17 +5663,6 @@ static int __init isolated_cpu_setup(char *str) __setup("isolcpus=", isolated_cpu_setup); -static const struct cpumask *cpu_cpu_mask(int cpu) -{ - return cpumask_of_node(cpu_to_node(cpu)); -} - -struct sd_data { - struct sched_domain **__percpu sd; - struct sched_group **__percpu sg; - struct sched_group_power **__percpu sgp; -}; - struct s_data { struct sched_domain ** __percpu sd; struct root_domain *rd; @@ -5578,21 +5675,6 @@ enum s_alloc { sa_none, }; -struct sched_domain_topology_level; - -typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu); -typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); - -#define SDTL_OVERLAP 0x01 - -struct sched_domain_topology_level { - sched_domain_init_f init; - sched_domain_mask_f mask; - int flags; - int numa_level; - struct sd_data data; -}; - /* * Build an iteration mask that can exclude certain CPUs from the upwards * domain traversal. @@ -5670,17 +5752,17 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) cpumask_or(covered, covered, sg_span); - sg->sgp = *per_cpu_ptr(sdd->sgp, i); - if (atomic_inc_return(&sg->sgp->ref) == 1) + sg->sgc = *per_cpu_ptr(sdd->sgc, i); + if (atomic_inc_return(&sg->sgc->ref) == 1) build_group_mask(sd, sg); /* - * Initialize sgp->power such that even if we mess up the + * Initialize sgc->capacity such that even if we mess up the * domains and no possible iteration will get us here, we won't * die on a /0 trap. */ - sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span); - sg->sgp->power_orig = sg->sgp->power; + sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); + sg->sgc->capacity_orig = sg->sgc->capacity; /* * Make sure the first group of this domain contains the @@ -5718,8 +5800,8 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) if (sg) { *sg = *per_cpu_ptr(sdd->sg, cpu); - (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu); - atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */ + (*sg)->sgc = *per_cpu_ptr(sdd->sgc, cpu); + atomic_set(&(*sg)->sgc->ref, 1); /* for claim_allocations */ } return cpu; @@ -5728,7 +5810,7 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) /* * build_sched_groups will build a circular linked list of the groups * covered by the given span, and will set each group's ->cpumask correctly, - * and ->cpu_power to 0. + * and ->cpu_capacity to 0. * * Assumes the sched_domain tree is fully constructed */ @@ -5760,8 +5842,6 @@ build_sched_groups(struct sched_domain *sd, int cpu) continue; group = get_group(i, sdd, &sg); - cpumask_clear(sched_group_cpus(sg)); - sg->sgp->power = 0; cpumask_setall(sched_group_mask(sg)); for_each_cpu(j, span) { @@ -5784,16 +5864,16 @@ build_sched_groups(struct sched_domain *sd, int cpu) } /* - * Initialize sched groups cpu_power. + * Initialize sched groups cpu_capacity. * - * cpu_power indicates the capacity of sched group, which is used while + * cpu_capacity indicates the capacity of sched group, which is used while * distributing the load between different sched groups in a sched domain. - * Typically cpu_power for all the groups in a sched domain will be same unless - * there are asymmetries in the topology. If there are asymmetries, group - * having more cpu_power will pickup more load compared to the group having - * less cpu_power. + * Typically cpu_capacity for all the groups in a sched domain will be same + * unless there are asymmetries in the topology. If there are asymmetries, + * group having more cpu_capacity will pickup more load compared to the + * group having less cpu_capacity. */ -static void init_sched_groups_power(int cpu, struct sched_domain *sd) +static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) { struct sched_group *sg = sd->groups; @@ -5807,13 +5887,8 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) if (cpu != group_balance_cpu(sg)) return; - update_group_power(sd, cpu); - atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight); -} - -int __weak arch_sd_sibling_asym_packing(void) -{ - return 0*SD_ASYM_PACKING; + update_group_capacity(sd, cpu); + atomic_set(&sg->sgc->nr_busy_cpus, sg->group_weight); } /* @@ -5821,34 +5896,6 @@ int __weak arch_sd_sibling_asym_packing(void) * Non-inlined to reduce accumulated stack pressure in build_sched_domains() */ -#ifdef CONFIG_SCHED_DEBUG -# define SD_INIT_NAME(sd, type) sd->name = #type -#else -# define SD_INIT_NAME(sd, type) do { } while (0) -#endif - -#define SD_INIT_FUNC(type) \ -static noinline struct sched_domain * \ -sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \ -{ \ - struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \ - *sd = SD_##type##_INIT; \ - SD_INIT_NAME(sd, type); \ - sd->private = &tl->data; \ - return sd; \ -} - -SD_INIT_FUNC(CPU) -#ifdef CONFIG_SCHED_SMT - SD_INIT_FUNC(SIBLING) -#endif -#ifdef CONFIG_SCHED_MC - SD_INIT_FUNC(MC) -#endif -#ifdef CONFIG_SCHED_BOOK - SD_INIT_FUNC(BOOK) -#endif - static int default_relax_domain_level = -1; int sched_domain_level_max; @@ -5932,101 +5979,158 @@ static void claim_allocations(int cpu, struct sched_domain *sd) if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) *per_cpu_ptr(sdd->sg, cpu) = NULL; - if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref)) - *per_cpu_ptr(sdd->sgp, cpu) = NULL; + if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref)) + *per_cpu_ptr(sdd->sgc, cpu) = NULL; } -#ifdef CONFIG_SCHED_SMT -static const struct cpumask *cpu_smt_mask(int cpu) -{ - return topology_thread_cpumask(cpu); -} -#endif - -/* - * Topology list, bottom-up. - */ -static struct sched_domain_topology_level default_topology[] = { -#ifdef CONFIG_SCHED_SMT - { sd_init_SIBLING, cpu_smt_mask, }, -#endif -#ifdef CONFIG_SCHED_MC - { sd_init_MC, cpu_coregroup_mask, }, -#endif -#ifdef CONFIG_SCHED_BOOK - { sd_init_BOOK, cpu_book_mask, }, -#endif - { sd_init_CPU, cpu_cpu_mask, }, - { NULL, }, -}; - -static struct sched_domain_topology_level *sched_domain_topology = default_topology; - -#define for_each_sd_topology(tl) \ - for (tl = sched_domain_topology; tl->init; tl++) - #ifdef CONFIG_NUMA - static int sched_domains_numa_levels; static int *sched_domains_numa_distance; static struct cpumask ***sched_domains_numa_masks; static int sched_domains_curr_level; +#endif -static inline int sd_local_flags(int level) -{ - if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE) - return 0; - - return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE; -} +/* + * SD_flags allowed in topology descriptions. + * + * SD_SHARE_CPUCAPACITY - describes SMT topologies + * SD_SHARE_PKG_RESOURCES - describes shared caches + * SD_NUMA - describes NUMA topologies + * SD_SHARE_POWERDOMAIN - describes shared power domain + * + * Odd one out: + * SD_ASYM_PACKING - describes SMT quirks + */ +#define TOPOLOGY_SD_FLAGS \ + (SD_SHARE_CPUCAPACITY | \ + SD_SHARE_PKG_RESOURCES | \ + SD_NUMA | \ + SD_ASYM_PACKING | \ + SD_SHARE_POWERDOMAIN) static struct sched_domain * -sd_numa_init(struct sched_domain_topology_level *tl, int cpu) +sd_init(struct sched_domain_topology_level *tl, int cpu) { struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); - int level = tl->numa_level; - int sd_weight = cpumask_weight( - sched_domains_numa_masks[level][cpu_to_node(cpu)]); + int sd_weight, sd_flags = 0; + +#ifdef CONFIG_NUMA + /* + * Ugly hack to pass state to sd_numa_mask()... + */ + sched_domains_curr_level = tl->numa_level; +#endif + + sd_weight = cpumask_weight(tl->mask(cpu)); + + if (tl->sd_flags) + sd_flags = (*tl->sd_flags)(); + if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS, + "wrong sd_flags in topology description\n")) + sd_flags &= ~TOPOLOGY_SD_FLAGS; *sd = (struct sched_domain){ .min_interval = sd_weight, .max_interval = 2*sd_weight, .busy_factor = 32, .imbalance_pct = 125, - .cache_nice_tries = 2, - .busy_idx = 3, - .idle_idx = 2, + + .cache_nice_tries = 0, + .busy_idx = 0, + .idle_idx = 0, .newidle_idx = 0, .wake_idx = 0, .forkexec_idx = 0, .flags = 1*SD_LOAD_BALANCE | 1*SD_BALANCE_NEWIDLE - | 0*SD_BALANCE_EXEC - | 0*SD_BALANCE_FORK + | 1*SD_BALANCE_EXEC + | 1*SD_BALANCE_FORK | 0*SD_BALANCE_WAKE - | 0*SD_WAKE_AFFINE - | 0*SD_SHARE_CPUPOWER + | 1*SD_WAKE_AFFINE + | 0*SD_SHARE_CPUCAPACITY | 0*SD_SHARE_PKG_RESOURCES - | 1*SD_SERIALIZE + | 0*SD_SERIALIZE | 0*SD_PREFER_SIBLING - | 1*SD_NUMA - | sd_local_flags(level) + | 0*SD_NUMA + | sd_flags , + .last_balance = jiffies, .balance_interval = sd_weight, + .smt_gain = 0, + .max_newidle_lb_cost = 0, + .next_decay_max_lb_cost = jiffies, +#ifdef CONFIG_SCHED_DEBUG + .name = tl->name, +#endif }; - SD_INIT_NAME(sd, NUMA); - sd->private = &tl->data; /* - * Ugly hack to pass state to sd_numa_mask()... + * Convert topological properties into behaviour. */ - sched_domains_curr_level = tl->numa_level; + + if (sd->flags & SD_SHARE_CPUCAPACITY) { + sd->imbalance_pct = 110; + sd->smt_gain = 1178; /* ~15% */ + + } else if (sd->flags & SD_SHARE_PKG_RESOURCES) { + sd->imbalance_pct = 117; + sd->cache_nice_tries = 1; + sd->busy_idx = 2; + +#ifdef CONFIG_NUMA + } else if (sd->flags & SD_NUMA) { + sd->cache_nice_tries = 2; + sd->busy_idx = 3; + sd->idle_idx = 2; + + sd->flags |= SD_SERIALIZE; + if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) { + sd->flags &= ~(SD_BALANCE_EXEC | + SD_BALANCE_FORK | + SD_WAKE_AFFINE); + } + +#endif + } else { + sd->flags |= SD_PREFER_SIBLING; + sd->cache_nice_tries = 1; + sd->busy_idx = 2; + sd->idle_idx = 1; + } + + sd->private = &tl->data; return sd; } +/* + * Topology list, bottom-up. + */ +static struct sched_domain_topology_level default_topology[] = { +#ifdef CONFIG_SCHED_SMT + { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, +#endif +#ifdef CONFIG_SCHED_MC + { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, +#endif + { cpu_cpu_mask, SD_INIT_NAME(DIE) }, + { NULL, }, +}; + +struct sched_domain_topology_level *sched_domain_topology = default_topology; + +#define for_each_sd_topology(tl) \ + for (tl = sched_domain_topology; tl->mask; tl++) + +void set_sched_topology(struct sched_domain_topology_level *tl) +{ + sched_domain_topology = tl; +} + +#ifdef CONFIG_NUMA + static const struct cpumask *sd_numa_mask(int cpu) { return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; @@ -6170,7 +6274,10 @@ static void sched_init_numa(void) } } - tl = kzalloc((ARRAY_SIZE(default_topology) + level) * + /* Compute default topology size */ + for (i = 0; sched_domain_topology[i].mask; i++); + + tl = kzalloc((i + level + 1) * sizeof(struct sched_domain_topology_level), GFP_KERNEL); if (!tl) return; @@ -6178,18 +6285,19 @@ static void sched_init_numa(void) /* * Copy the default topology bits.. */ - for (i = 0; default_topology[i].init; i++) - tl[i] = default_topology[i]; + for (i = 0; sched_domain_topology[i].mask; i++) + tl[i] = sched_domain_topology[i]; /* * .. and append 'j' levels of NUMA goodness. */ for (j = 0; j < level; i++, j++) { tl[i] = (struct sched_domain_topology_level){ - .init = sd_numa_init, .mask = sd_numa_mask, + .sd_flags = cpu_numa_flags, .flags = SDTL_OVERLAP, .numa_level = j, + SD_INIT_NAME(NUMA) }; } @@ -6274,14 +6382,14 @@ static int __sdt_alloc(const struct cpumask *cpu_map) if (!sdd->sg) return -ENOMEM; - sdd->sgp = alloc_percpu(struct sched_group_power *); - if (!sdd->sgp) + sdd->sgc = alloc_percpu(struct sched_group_capacity *); + if (!sdd->sgc) return -ENOMEM; for_each_cpu(j, cpu_map) { struct sched_domain *sd; struct sched_group *sg; - struct sched_group_power *sgp; + struct sched_group_capacity *sgc; sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), GFP_KERNEL, cpu_to_node(j)); @@ -6299,12 +6407,12 @@ static int __sdt_alloc(const struct cpumask *cpu_map) *per_cpu_ptr(sdd->sg, j) = sg; - sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(), + sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(), GFP_KERNEL, cpu_to_node(j)); - if (!sgp) + if (!sgc) return -ENOMEM; - *per_cpu_ptr(sdd->sgp, j) = sgp; + *per_cpu_ptr(sdd->sgc, j) = sgc; } } @@ -6331,15 +6439,15 @@ static void __sdt_free(const struct cpumask *cpu_map) if (sdd->sg) kfree(*per_cpu_ptr(sdd->sg, j)); - if (sdd->sgp) - kfree(*per_cpu_ptr(sdd->sgp, j)); + if (sdd->sgc) + kfree(*per_cpu_ptr(sdd->sgc, j)); } free_percpu(sdd->sd); sdd->sd = NULL; free_percpu(sdd->sg); sdd->sg = NULL; - free_percpu(sdd->sgp); - sdd->sgp = NULL; + free_percpu(sdd->sgc); + sdd->sgc = NULL; } } @@ -6347,7 +6455,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, const struct cpumask *cpu_map, struct sched_domain_attr *attr, struct sched_domain *child, int cpu) { - struct sched_domain *sd = tl->init(tl, cpu); + struct sched_domain *sd = sd_init(tl, cpu); if (!sd) return child; @@ -6409,14 +6517,14 @@ static int build_sched_domains(const struct cpumask *cpu_map, } } - /* Calculate CPU power for physical packages and nodes */ + /* Calculate CPU capacity for physical packages and nodes */ for (i = nr_cpumask_bits-1; i >= 0; i--) { if (!cpumask_test_cpu(i, cpu_map)) continue; for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { claim_allocations(i, sd); - init_sched_groups_power(i, sd); + init_sched_groups_capacity(i, sd); } } @@ -6451,7 +6559,7 @@ static cpumask_var_t fallback_doms; * cpu core maps. It is supposed to return 1 if the topology changed * or 0 if it stayed the same. */ -int __attribute__((weak)) arch_update_cpu_topology(void) +int __weak arch_update_cpu_topology(void) { return 0; } @@ -6848,7 +6956,6 @@ void __init sched_init(void) rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; #ifdef CONFIG_RT_GROUP_SCHED - INIT_LIST_HEAD(&rq->leaf_rt_rq_list); init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); #endif @@ -6860,7 +6967,7 @@ void __init sched_init(void) #ifdef CONFIG_SMP rq->sd = NULL; rq->rd = NULL; - rq->cpu_power = SCHED_POWER_SCALE; + rq->cpu_capacity = SCHED_CAPACITY_SCALE; rq->post_schedule = 0; rq->active_balance = 0; rq->next_balance = jiffies; @@ -6918,6 +7025,7 @@ void __init sched_init(void) if (cpu_isolated_map == NULL) zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); idle_thread_set_boot_cpu(); + set_cpu_rq_start_time(); #endif init_sched_fair_class(); @@ -6937,7 +7045,8 @@ void __might_sleep(const char *file, int line, int preempt_offset) static unsigned long prev_jiffy; /* ratelimiting */ rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */ - if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || + if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && + !is_idle_task(current)) || system_state != SYSTEM_RUNNING || oops_in_progress) return; if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) @@ -6955,6 +7064,13 @@ void __might_sleep(const char *file, int line, int preempt_offset) debug_show_held_locks(current); if (irqs_disabled()) print_irqtrace_events(current); +#ifdef CONFIG_DEBUG_PREEMPT + if (!preempt_count_equals(preempt_offset)) { + pr_err("Preemption disabled at:"); + print_ip_sym(current->preempt_disable_ip); + pr_cont("\n"); + } +#endif dump_stack(); } EXPORT_SYMBOL(__might_sleep); @@ -7008,7 +7124,7 @@ void normalize_rt_tasks(void) * Renice negative nice level userspace * tasks back to 0: */ - if (TASK_NICE(p) < 0 && p->mm) + if (task_nice(p) < 0 && p->mm) set_user_nice(p, 0); continue; } @@ -7176,7 +7292,7 @@ void sched_move_task(struct task_struct *tsk) if (unlikely(running)) tsk->sched_class->put_prev_task(rq, tsk); - tg = container_of(task_css_check(tsk, cpu_cgroup_subsys_id, + tg = container_of(task_css_check(tsk, cpu_cgrp_id, lockdep_is_held(&tsk->sighand->siglock)), struct task_group, css); tg = autogroup_task_group(tsk, tg); @@ -7422,6 +7538,7 @@ static int sched_dl_global_constraints(void) u64 period = global_rt_period(); u64 new_bw = to_ratio(period, runtime); int cpu, ret = 0; + unsigned long flags; /* * Here we want to check the bandwidth not being set to some @@ -7435,10 +7552,10 @@ static int sched_dl_global_constraints(void) for_each_possible_cpu(cpu) { struct dl_bw *dl_b = dl_bw_of(cpu); - raw_spin_lock(&dl_b->lock); + raw_spin_lock_irqsave(&dl_b->lock, flags); if (new_bw < dl_b->total_bw) ret = -EBUSY; - raw_spin_unlock(&dl_b->lock); + raw_spin_unlock_irqrestore(&dl_b->lock, flags); if (ret) break; @@ -7451,6 +7568,7 @@ static void sched_dl_do_global(void) { u64 new_bw = -1; int cpu; + unsigned long flags; def_dl_bandwidth.dl_period = global_rt_period(); def_dl_bandwidth.dl_runtime = global_rt_runtime(); @@ -7464,9 +7582,9 @@ static void sched_dl_do_global(void) for_each_possible_cpu(cpu) { struct dl_bw *dl_b = dl_bw_of(cpu); - raw_spin_lock(&dl_b->lock); + raw_spin_lock_irqsave(&dl_b->lock, flags); dl_b->bw = new_bw; - raw_spin_unlock(&dl_b->lock); + raw_spin_unlock_irqrestore(&dl_b->lock, flags); } } @@ -7475,7 +7593,8 @@ static int sched_rt_global_validate(void) if (sysctl_sched_rt_period <= 0) return -EINVAL; - if (sysctl_sched_rt_runtime > sysctl_sched_rt_period) + if ((sysctl_sched_rt_runtime != RUNTIME_INF) && + (sysctl_sched_rt_runtime > sysctl_sched_rt_period)) return -EINVAL; return 0; @@ -7574,7 +7693,7 @@ cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) { struct task_group *tg = css_tg(css); - struct task_group *parent = css_tg(css_parent(css)); + struct task_group *parent = css_tg(css->parent); if (parent) sched_online_group(tg, parent); @@ -7600,7 +7719,7 @@ static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css, { struct task_struct *task; - cgroup_taskset_for_each(task, css, tset) { + cgroup_taskset_for_each(task, tset) { #ifdef CONFIG_RT_GROUP_SCHED if (!sched_rt_can_attach(css_tg(css), task)) return -EINVAL; @@ -7618,7 +7737,7 @@ static void cpu_cgroup_attach(struct cgroup_subsys_state *css, { struct task_struct *task; - cgroup_taskset_for_each(task, css, tset) + cgroup_taskset_for_each(task, tset) sched_move_task(task); } @@ -7705,8 +7824,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) /* restart the period timer (if active) to handle new period expiry */ if (runtime_enabled && cfs_b->timer_active) { /* force a reprogram */ - cfs_b->timer_active = 0; - __start_cfs_bandwidth(cfs_b); + __start_cfs_bandwidth(cfs_b, true); } raw_spin_unlock_irq(&cfs_b->lock); @@ -7957,8 +8075,7 @@ static struct cftype cpu_files[] = { { } /* terminate */ }; -struct cgroup_subsys cpu_cgroup_subsys = { - .name = "cpu", +struct cgroup_subsys cpu_cgrp_subsys = { .css_alloc = cpu_cgroup_css_alloc, .css_free = cpu_cgroup_css_free, .css_online = cpu_cgroup_css_online, @@ -7966,7 +8083,6 @@ struct cgroup_subsys cpu_cgroup_subsys = { .can_attach = cpu_cgroup_can_attach, .attach = cpu_cgroup_attach, .exit = cpu_cgroup_exit, - .subsys_id = cpu_cgroup_subsys_id, .base_cftypes = cpu_files, .early_init = 1, }; diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 622e0818f90..9cf350c94ec 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -41,12 +41,12 @@ static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css) /* return cpu accounting group to which this task belongs */ static inline struct cpuacct *task_ca(struct task_struct *tsk) { - return css_ca(task_css(tsk, cpuacct_subsys_id)); + return css_ca(task_css(tsk, cpuacct_cgrp_id)); } static inline struct cpuacct *parent_ca(struct cpuacct *ca) { - return css_ca(css_parent(&ca->css)); + return css_ca(ca->css.parent); } static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage); @@ -275,11 +275,9 @@ void cpuacct_account_field(struct task_struct *p, int index, u64 val) rcu_read_unlock(); } -struct cgroup_subsys cpuacct_subsys = { - .name = "cpuacct", +struct cgroup_subsys cpuacct_cgrp_subsys = { .css_alloc = cpuacct_css_alloc, .css_free = cpuacct_css_free, - .subsys_id = cpuacct_subsys_id, .base_cftypes = files, .early_init = 1, }; diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 045fc74e3f0..bd95963dae8 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -13,6 +13,7 @@ #include <linux/gfp.h> #include <linux/kernel.h> +#include <linux/slab.h> #include "cpudeadline.h" static inline int parent(int i) @@ -39,8 +40,10 @@ static void cpudl_exchange(struct cpudl *cp, int a, int b) { int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu; - swap(cp->elements[a], cp->elements[b]); - swap(cp->cpu_to_idx[cpu_a], cp->cpu_to_idx[cpu_b]); + swap(cp->elements[a].cpu, cp->elements[b].cpu); + swap(cp->elements[a].dl , cp->elements[b].dl ); + + swap(cp->elements[cpu_a].idx, cp->elements[cpu_b].idx); } static void cpudl_heapify(struct cpudl *cp, int idx) @@ -70,7 +73,7 @@ static void cpudl_heapify(struct cpudl *cp, int idx) static void cpudl_change_key(struct cpudl *cp, int idx, u64 new_dl) { - WARN_ON(idx > num_present_cpus() || idx == IDX_INVALID); + WARN_ON(idx == IDX_INVALID || !cpu_present(idx)); if (dl_time_before(new_dl, cp->elements[idx].dl)) { cp->elements[idx].dl = new_dl; @@ -117,7 +120,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, } out: - WARN_ON(best_cpu > num_present_cpus() && best_cpu != -1); + WARN_ON(best_cpu != -1 && !cpu_present(best_cpu)); return best_cpu; } @@ -137,10 +140,10 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid) int old_idx, new_cpu; unsigned long flags; - WARN_ON(cpu > num_present_cpus()); + WARN_ON(!cpu_present(cpu)); raw_spin_lock_irqsave(&cp->lock, flags); - old_idx = cp->cpu_to_idx[cpu]; + old_idx = cp->elements[cpu].idx; if (!is_valid) { /* remove item */ if (old_idx == IDX_INVALID) { @@ -155,8 +158,8 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid) cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl; cp->elements[old_idx].cpu = new_cpu; cp->size--; - cp->cpu_to_idx[new_cpu] = old_idx; - cp->cpu_to_idx[cpu] = IDX_INVALID; + cp->elements[new_cpu].idx = old_idx; + cp->elements[cpu].idx = IDX_INVALID; while (old_idx > 0 && dl_time_before( cp->elements[parent(old_idx)].dl, cp->elements[old_idx].dl)) { @@ -173,7 +176,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid) cp->size++; cp->elements[cp->size - 1].dl = 0; cp->elements[cp->size - 1].cpu = cpu; - cp->cpu_to_idx[cpu] = cp->size - 1; + cp->elements[cpu].idx = cp->size - 1; cpudl_change_key(cp, cp->size - 1, dl); cpumask_clear_cpu(cpu, cp->free_cpus); } else { @@ -195,10 +198,21 @@ int cpudl_init(struct cpudl *cp) memset(cp, 0, sizeof(*cp)); raw_spin_lock_init(&cp->lock); cp->size = 0; - for (i = 0; i < NR_CPUS; i++) - cp->cpu_to_idx[i] = IDX_INVALID; - if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) + + cp->elements = kcalloc(nr_cpu_ids, + sizeof(struct cpudl_item), + GFP_KERNEL); + if (!cp->elements) + return -ENOMEM; + + if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) { + kfree(cp->elements); return -ENOMEM; + } + + for_each_possible_cpu(i) + cp->elements[i].idx = IDX_INVALID; + cpumask_setall(cp->free_cpus); return 0; @@ -210,7 +224,6 @@ int cpudl_init(struct cpudl *cp) */ void cpudl_cleanup(struct cpudl *cp) { - /* - * nothing to do for the moment - */ + free_cpumask_var(cp->free_cpus); + kfree(cp->elements); } diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h index a202789a412..538c9796ad4 100644 --- a/kernel/sched/cpudeadline.h +++ b/kernel/sched/cpudeadline.h @@ -5,17 +5,17 @@ #define IDX_INVALID -1 -struct array_item { +struct cpudl_item { u64 dl; int cpu; + int idx; }; struct cpudl { raw_spinlock_t lock; int size; - int cpu_to_idx[NR_CPUS]; - struct array_item elements[NR_CPUS]; cpumask_var_t free_cpus; + struct cpudl_item *elements; }; diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index 8b836b376d9..981fcd7dc39 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -30,6 +30,7 @@ #include <linux/gfp.h> #include <linux/sched.h> #include <linux/sched/rt.h> +#include <linux/slab.h> #include "cpupri.h" /* Convert between a 140 based task->prio, and our 102 based cpupri */ @@ -70,8 +71,7 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, int idx = 0; int task_pri = convert_prio(p->prio); - if (task_pri >= MAX_RT_PRIO) - return 0; + BUG_ON(task_pri >= CPUPRI_NR_PRIORITIES); for (idx = 0; idx < task_pri; idx++) { struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; @@ -165,7 +165,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) * do a write memory barrier, and then update the count, to * make sure the vector is visible when count is set. */ - smp_mb__before_atomic_inc(); + smp_mb__before_atomic(); atomic_inc(&(vec)->count); do_mb = 1; } @@ -185,14 +185,14 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) * the new priority vec. */ if (do_mb) - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); /* * When removing from the vector, we decrement the counter first * do a memory barrier and then clear the mask. */ atomic_dec(&(vec)->count); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); cpumask_clear_cpu(cpu, vec->mask); } @@ -219,8 +219,13 @@ int cpupri_init(struct cpupri *cp) goto cleanup; } + cp->cpu_to_pri = kcalloc(nr_cpu_ids, sizeof(int), GFP_KERNEL); + if (!cp->cpu_to_pri) + goto cleanup; + for_each_possible_cpu(i) cp->cpu_to_pri[i] = CPUPRI_INVALID; + return 0; cleanup: @@ -237,6 +242,7 @@ void cpupri_cleanup(struct cpupri *cp) { int i; + kfree(cp->cpu_to_pri); for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) free_cpumask_var(cp->pri_to_cpu[i].mask); } diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h index f6d75617349..6b033347fdf 100644 --- a/kernel/sched/cpupri.h +++ b/kernel/sched/cpupri.h @@ -17,7 +17,7 @@ struct cpupri_vec { struct cpupri { struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; - int cpu_to_pri[NR_CPUS]; + int *cpu_to_pri; }; #ifdef CONFIG_SMP diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 99947919e30..72fdf06ef86 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -142,7 +142,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime, p->utimescaled += cputime_scaled; account_group_user_time(p, cputime); - index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; + index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; /* Add user time to cpustat. */ task_group_account_field(p, index, (__force u64) cputime); @@ -169,7 +169,7 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime, p->gtime += cputime; /* Add guest time to cpustat. */ - if (TASK_NICE(p) > 0) { + if (task_nice(p) > 0) { cpustat[CPUTIME_NICE] += (__force u64) cputime; cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime; } else { @@ -258,16 +258,22 @@ static __always_inline bool steal_account_process_tick(void) { #ifdef CONFIG_PARAVIRT if (static_key_false(¶virt_steal_enabled)) { - u64 steal, st = 0; + u64 steal; + cputime_t steal_ct; steal = paravirt_steal_clock(smp_processor_id()); steal -= this_rq()->prev_steal_time; - st = steal_ticks(steal); - this_rq()->prev_steal_time += st * TICK_NSEC; + /* + * cputime_t may be less precise than nsecs (eg: if it's + * based on jiffies). Lets cast the result to cputime + * granularity and account the rest on the next rounds. + */ + steal_ct = nsecs_to_cputime(steal); + this_rq()->prev_steal_time += cputime_to_nsecs(steal_ct); - account_steal_time(st); - return st; + account_steal_time(steal_ct); + return steal_ct; } #endif return false; @@ -326,50 +332,50 @@ out: * softirq as those do not count in task exec_runtime any more. */ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, - struct rq *rq) + struct rq *rq, int ticks) { - cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); + cputime_t scaled = cputime_to_scaled(cputime_one_jiffy); + u64 cputime = (__force u64) cputime_one_jiffy; u64 *cpustat = kcpustat_this_cpu->cpustat; if (steal_account_process_tick()) return; + cputime *= ticks; + scaled *= ticks; + if (irqtime_account_hi_update()) { - cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy; + cpustat[CPUTIME_IRQ] += cputime; } else if (irqtime_account_si_update()) { - cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy; + cpustat[CPUTIME_SOFTIRQ] += cputime; } else if (this_cpu_ksoftirqd() == p) { /* * ksoftirqd time do not get accounted in cpu_softirq_time. * So, we have to handle it separately here. * Also, p->stime needs to be updated for ksoftirqd. */ - __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, - CPUTIME_SOFTIRQ); + __account_system_time(p, cputime, scaled, CPUTIME_SOFTIRQ); } else if (user_tick) { - account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); + account_user_time(p, cputime, scaled); } else if (p == rq->idle) { - account_idle_time(cputime_one_jiffy); + account_idle_time(cputime); } else if (p->flags & PF_VCPU) { /* System time or guest time */ - account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled); + account_guest_time(p, cputime, scaled); } else { - __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, - CPUTIME_SYSTEM); + __account_system_time(p, cputime, scaled, CPUTIME_SYSTEM); } } static void irqtime_account_idle_ticks(int ticks) { - int i; struct rq *rq = this_rq(); - for (i = 0; i < ticks; i++) - irqtime_account_process_tick(current, 0, rq); + irqtime_account_process_tick(current, 0, rq, ticks); } #else /* CONFIG_IRQ_TIME_ACCOUNTING */ static inline void irqtime_account_idle_ticks(int ticks) {} static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick, - struct rq *rq) {} + struct rq *rq, int nr_ticks) {} #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ /* @@ -458,7 +464,7 @@ void account_process_tick(struct task_struct *p, int user_tick) return; if (sched_clock_irqtime) { - irqtime_account_process_tick(p, user_tick, rq); + irqtime_account_process_tick(p, user_tick, rq, 1); return; } diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 0dd5e0971a0..fc4f98b1258 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -57,8 +57,6 @@ void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime) dl_b->dl_runtime = runtime; } -extern unsigned long to_ratio(u64 period, u64 runtime); - void init_dl_bw(struct dl_bw *dl_b) { raw_spin_lock_init(&dl_b->lock); @@ -121,7 +119,7 @@ static inline void dl_clear_overload(struct rq *rq) static void update_dl_migration(struct dl_rq *dl_rq) { - if (dl_rq->dl_nr_migratory && dl_rq->dl_nr_total > 1) { + if (dl_rq->dl_nr_migratory && dl_rq->dl_nr_running > 1) { if (!dl_rq->overloaded) { dl_set_overload(rq_of_dl_rq(dl_rq)); dl_rq->overloaded = 1; @@ -135,9 +133,7 @@ static void update_dl_migration(struct dl_rq *dl_rq) static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { struct task_struct *p = dl_task_of(dl_se); - dl_rq = &rq_of_dl_rq(dl_rq)->dl; - dl_rq->dl_nr_total++; if (p->nr_cpus_allowed > 1) dl_rq->dl_nr_migratory++; @@ -147,9 +143,7 @@ static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { struct task_struct *p = dl_task_of(dl_se); - dl_rq = &rq_of_dl_rq(dl_rq)->dl; - dl_rq->dl_nr_total--; if (p->nr_cpus_allowed > 1) dl_rq->dl_nr_migratory--; @@ -214,6 +208,16 @@ static inline int has_pushable_dl_tasks(struct rq *rq) static int push_dl_task(struct rq *rq); +static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev) +{ + return dl_task(prev); +} + +static inline void set_post_schedule(struct rq *rq) +{ + rq->post_schedule = has_pushable_dl_tasks(rq); +} + #else static inline @@ -236,6 +240,19 @@ void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { } +static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev) +{ + return false; +} + +static inline int pull_dl_task(struct rq *rq) +{ + return 0; +} + +static inline void set_post_schedule(struct rq *rq) +{ +} #endif /* CONFIG_SMP */ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags); @@ -329,12 +346,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, * entity. */ if (dl_time_before(dl_se->deadline, rq_clock(rq))) { - static bool lag_once = false; - - if (!lag_once) { - lag_once = true; - printk_sched("sched: DL replenish lagged to much\n"); - } + printk_deferred_once("sched: DL replenish lagged to much\n"); dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; dl_se->runtime = pi_se->dl_runtime; } @@ -494,14 +506,22 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) struct sched_dl_entity, dl_timer); struct task_struct *p = dl_task_of(dl_se); - struct rq *rq = task_rq(p); + struct rq *rq; +again: + rq = task_rq(p); raw_spin_lock(&rq->lock); + if (rq != task_rq(p)) { + /* Task was moved, retrying. */ + raw_spin_unlock(&rq->lock); + goto again; + } + /* * We need to take care of a possible races here. In fact, the * task might have changed its scheduling policy to something * different from SCHED_DEADLINE or changed its reservation - * parameters (through sched_setscheduler()). + * parameters (through sched_setattr()). */ if (!dl_task(p) || dl_se->dl_new) goto unlock; @@ -509,6 +529,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) sched_clock_tick(); update_rq_clock(rq); dl_se->dl_throttled = 0; + dl_se->dl_yielded = 0; if (p->on_rq) { enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); if (task_has_dl_policy(rq->curr)) @@ -566,6 +587,8 @@ int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se) return 1; } +extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); + /* * Update the current task's runtime statistics (provided it is still * a -deadline task and has not been removed from the dl_rq). @@ -588,8 +611,8 @@ static void update_curr_dl(struct rq *rq) * approach need further study. */ delta_exec = rq_clock_task(rq) - curr->se.exec_start; - if (unlikely((s64)delta_exec < 0)) - delta_exec = 0; + if (unlikely((s64)delta_exec <= 0)) + return; schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec)); @@ -629,11 +652,13 @@ static void update_curr_dl(struct rq *rq) struct rt_rq *rt_rq = &rq->rt; raw_spin_lock(&rt_rq->rt_runtime_lock); - rt_rq->rt_time += delta_exec; /* * We'll let actual RT tasks worry about the overflow here, we - * have our own CBS to keep us inline -- see above. + * have our own CBS to keep us inline; only account when RT + * bandwidth is relevant. */ + if (sched_rt_bandwidth_account(rt_rq)) + rt_rq->rt_time += delta_exec; raw_spin_unlock(&rt_rq->rt_runtime_lock); } } @@ -717,6 +742,7 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) WARN_ON(!dl_prio(prio)); dl_rq->dl_nr_running++; + add_nr_running(rq_of_dl_rq(dl_rq), 1); inc_dl_deadline(dl_rq, deadline); inc_dl_migration(dl_se, dl_rq); @@ -730,6 +756,7 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) WARN_ON(!dl_prio(prio)); WARN_ON(!dl_rq->dl_nr_running); dl_rq->dl_nr_running--; + sub_nr_running(rq_of_dl_rq(dl_rq), 1); dec_dl_deadline(dl_rq, dl_se->deadline); dec_dl_migration(dl_se, dl_rq); @@ -836,8 +863,6 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) if (!task_current(rq, p) && p->nr_cpus_allowed > 1) enqueue_pushable_dl_task(rq, p); - - inc_nr_running(rq); } static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) @@ -850,8 +875,6 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) { update_curr_dl(rq); __dequeue_task_dl(rq, p, flags); - - dec_nr_running(rq); } /* @@ -872,10 +895,10 @@ static void yield_task_dl(struct rq *rq) * We make the task go to sleep until its current deadline by * forcing its runtime to zero. This way, update_curr_dl() stops * it and the bandwidth timer will wake it up and will give it - * new scheduling parameters (thanks to dl_new=1). + * new scheduling parameters (thanks to dl_yielded=1). */ if (p->dl.runtime > 0) { - rq->curr->dl.dl_new = 1; + rq->curr->dl.dl_yielded = 1; p->dl.runtime = 0; } update_curr_dl(rq); @@ -944,6 +967,8 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) resched_task(rq->curr); } +static int pull_dl_task(struct rq *this_rq); + #endif /* CONFIG_SMP */ /* @@ -990,7 +1015,7 @@ static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq, return rb_entry(left, struct sched_dl_entity, rb_node); } -struct task_struct *pick_next_task_dl(struct rq *rq) +struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev) { struct sched_dl_entity *dl_se; struct task_struct *p; @@ -998,9 +1023,29 @@ struct task_struct *pick_next_task_dl(struct rq *rq) dl_rq = &rq->dl; + if (need_pull_dl_task(rq, prev)) { + pull_dl_task(rq); + /* + * pull_rt_task() can drop (and re-acquire) rq->lock; this + * means a stop task can slip in, in which case we need to + * re-start task selection. + */ + if (rq->stop && rq->stop->on_rq) + return RETRY_TASK; + } + + /* + * When prev is DL, we may throttle it in put_prev_task(). + * So, we update time before we check for dl_nr_running. + */ + if (prev->sched_class == &dl_sched_class) + update_curr_dl(rq); + if (unlikely(!dl_rq->dl_nr_running)) return NULL; + put_prev_task(rq, prev); + dl_se = pick_next_dl_entity(rq, dl_rq); BUG_ON(!dl_se); @@ -1015,9 +1060,7 @@ struct task_struct *pick_next_task_dl(struct rq *rq) start_hrtick_dl(rq, p); #endif -#ifdef CONFIG_SMP - rq->post_schedule = has_pushable_dl_tasks(rq); -#endif /* CONFIG_SMP */ + set_post_schedule(rq); return p; } @@ -1426,13 +1469,6 @@ skip: return ret; } -static void pre_schedule_dl(struct rq *rq, struct task_struct *prev) -{ - /* Try to pull other tasks here */ - if (dl_task(prev)) - pull_dl_task(rq); -} - static void post_schedule_dl(struct rq *rq) { push_dl_tasks(rq); @@ -1560,7 +1596,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) if (unlikely(p->dl.dl_throttled)) return; - if (p->on_rq || rq->curr != p) { + if (p->on_rq && rq->curr != p) { #ifdef CONFIG_SMP if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p)) /* Only reschedule if pushing failed */ @@ -1625,7 +1661,6 @@ const struct sched_class dl_sched_class = { .set_cpus_allowed = set_cpus_allowed_dl, .rq_online = rq_online_dl, .rq_offline = rq_offline_dl, - .pre_schedule = pre_schedule_dl, .post_schedule = post_schedule_dl, .task_woken = task_woken_dl, #endif diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index dd52e7ffb10..627b3c34b82 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -111,8 +111,7 @@ static char *task_group_path(struct task_group *tg) if (autogroup_path(tg, group_path, PATH_MAX)) return group_path; - cgroup_path(tg->css.cgroup, group_path, PATH_MAX); - return group_path; + return cgroup_path(tg->css.cgroup, group_path, PATH_MAX); } #endif @@ -321,6 +320,7 @@ do { \ P(sched_goidle); #ifdef CONFIG_SMP P64(avg_idle); + P64(max_idle_balance_cost); #endif P(ttwu_count); @@ -533,15 +533,15 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m) unsigned long nr_faults = -1; int cpu_current, home_node; - if (p->numa_faults) - nr_faults = p->numa_faults[2*node + i]; + if (p->numa_faults_memory) + nr_faults = p->numa_faults_memory[2*node + i]; cpu_current = !i ? (task_node(p) == node) : (pol && node_isset(node, pol->v.nodes)); home_node = (p->numa_preferred_nid == node); - SEQ_printf(m, "numa_faults, %d, %d, %d, %d, %ld\n", + SEQ_printf(m, "numa_faults_memory, %d, %d, %d, %d, %ld\n", i, node, cpu_current, home_node, nr_faults); } } @@ -608,7 +608,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) avg_atom = p->se.sum_exec_runtime; if (nr_switches) - do_div(avg_atom, nr_switches); + avg_atom = div64_ul(avg_atom, nr_switches); else avg_atom = -1LL; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 966cc2bfcb7..fea7d3335e1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -322,13 +322,13 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) /* Do the two (enqueued) entities belong to the same group ? */ -static inline int +static inline struct cfs_rq * is_same_group(struct sched_entity *se, struct sched_entity *pse) { if (se->cfs_rq == pse->cfs_rq) - return 1; + return se->cfs_rq; - return 0; + return NULL; } static inline struct sched_entity *parent_entity(struct sched_entity *se) @@ -336,17 +336,6 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se) return se->parent; } -/* return depth at which a sched entity is present in the hierarchy */ -static inline int depth_se(struct sched_entity *se) -{ - int depth = 0; - - for_each_sched_entity(se) - depth++; - - return depth; -} - static void find_matching_se(struct sched_entity **se, struct sched_entity **pse) { @@ -360,8 +349,8 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse) */ /* First walk up until both entities are at same depth */ - se_depth = depth_se(*se); - pse_depth = depth_se(*pse); + se_depth = (*se)->depth; + pse_depth = (*pse)->depth; while (se_depth > pse_depth) { se_depth--; @@ -426,12 +415,6 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) #define for_each_leaf_cfs_rq(rq, cfs_rq) \ for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) -static inline int -is_same_group(struct sched_entity *se, struct sched_entity *pse) -{ - return 1; -} - static inline struct sched_entity *parent_entity(struct sched_entity *se) { return NULL; @@ -819,14 +802,6 @@ unsigned int sysctl_numa_balancing_scan_size = 256; /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ unsigned int sysctl_numa_balancing_scan_delay = 1000; -/* - * After skipping a page migration on a shared page, skip N more numa page - * migrations unconditionally. This reduces the number of NUMA migrations - * in shared memory workloads, and has the effect of pulling tasks towards - * where their memory lives, over pulling the memory towards the task. - */ -unsigned int sysctl_numa_balancing_migrate_deferred = 16; - static unsigned int task_nr_scan_windows(struct task_struct *p) { unsigned long rss = 0; @@ -893,10 +868,26 @@ struct numa_group { struct list_head task_list; struct rcu_head rcu; + nodemask_t active_nodes; unsigned long total_faults; + /* + * Faults_cpu is used to decide whether memory should move + * towards the CPU. As a consequence, these stats are weighted + * more by CPU use than by memory faults. + */ + unsigned long *faults_cpu; unsigned long faults[0]; }; +/* Shared or private faults. */ +#define NR_NUMA_HINT_FAULT_TYPES 2 + +/* Memory and CPU locality */ +#define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2) + +/* Averaged statistics, and temporary buffers. */ +#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2) + pid_t task_numa_group_id(struct task_struct *p) { return p->numa_group ? p->numa_group->gid : 0; @@ -904,16 +895,16 @@ pid_t task_numa_group_id(struct task_struct *p) static inline int task_faults_idx(int nid, int priv) { - return 2 * nid + priv; + return NR_NUMA_HINT_FAULT_TYPES * nid + priv; } static inline unsigned long task_faults(struct task_struct *p, int nid) { - if (!p->numa_faults) + if (!p->numa_faults_memory) return 0; - return p->numa_faults[task_faults_idx(nid, 0)] + - p->numa_faults[task_faults_idx(nid, 1)]; + return p->numa_faults_memory[task_faults_idx(nid, 0)] + + p->numa_faults_memory[task_faults_idx(nid, 1)]; } static inline unsigned long group_faults(struct task_struct *p, int nid) @@ -925,6 +916,12 @@ static inline unsigned long group_faults(struct task_struct *p, int nid) p->numa_group->faults[task_faults_idx(nid, 1)]; } +static inline unsigned long group_faults_cpu(struct numa_group *group, int nid) +{ + return group->faults_cpu[task_faults_idx(nid, 0)] + + group->faults_cpu[task_faults_idx(nid, 1)]; +} + /* * These return the fraction of accesses done by a particular task, or * task group, on a particular numa node. The group weight is given a @@ -935,7 +932,7 @@ static inline unsigned long task_weight(struct task_struct *p, int nid) { unsigned long total_faults; - if (!p->numa_faults) + if (!p->numa_faults_memory) return 0; total_faults = p->total_numa_faults; @@ -954,10 +951,73 @@ static inline unsigned long group_weight(struct task_struct *p, int nid) return 1000 * group_faults(p, nid) / p->numa_group->total_faults; } +bool should_numa_migrate_memory(struct task_struct *p, struct page * page, + int src_nid, int dst_cpu) +{ + struct numa_group *ng = p->numa_group; + int dst_nid = cpu_to_node(dst_cpu); + int last_cpupid, this_cpupid; + + this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid); + + /* + * Multi-stage node selection is used in conjunction with a periodic + * migration fault to build a temporal task<->page relation. By using + * a two-stage filter we remove short/unlikely relations. + * + * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate + * a task's usage of a particular page (n_p) per total usage of this + * page (n_t) (in a given time-span) to a probability. + * + * Our periodic faults will sample this probability and getting the + * same result twice in a row, given these samples are fully + * independent, is then given by P(n)^2, provided our sample period + * is sufficiently short compared to the usage pattern. + * + * This quadric squishes small probabilities, making it less likely we + * act on an unlikely task<->page relation. + */ + last_cpupid = page_cpupid_xchg_last(page, this_cpupid); + if (!cpupid_pid_unset(last_cpupid) && + cpupid_to_nid(last_cpupid) != dst_nid) + return false; + + /* Always allow migrate on private faults */ + if (cpupid_match_pid(p, last_cpupid)) + return true; + + /* A shared fault, but p->numa_group has not been set up yet. */ + if (!ng) + return true; + + /* + * Do not migrate if the destination is not a node that + * is actively used by this numa group. + */ + if (!node_isset(dst_nid, ng->active_nodes)) + return false; + + /* + * Source is a node that is not actively used by this + * numa group, while the destination is. Migrate. + */ + if (!node_isset(src_nid, ng->active_nodes)) + return true; + + /* + * Both source and destination are nodes in active + * use by this numa group. Maximize memory bandwidth + * by migrating from more heavily used groups, to less + * heavily used ones, spreading the load around. + * Use a 1/4 hysteresis to avoid spurious page movement. + */ + return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4); +} + static unsigned long weighted_cpuload(const int cpu); static unsigned long source_load(int cpu, int type); static unsigned long target_load(int cpu, int type); -static unsigned long power_of(int cpu); +static unsigned long capacity_of(int cpu); static long effective_load(struct task_group *tg, int cpu, long wl, long wg); /* Cached statistics for all CPUs within a node */ @@ -966,11 +1026,11 @@ struct numa_stats { unsigned long load; /* Total compute capacity of CPUs on a node */ - unsigned long power; + unsigned long compute_capacity; /* Approximate capacity in terms of runnable tasks on a node */ - unsigned long capacity; - int has_capacity; + unsigned long task_capacity; + int has_free_capacity; }; /* @@ -986,7 +1046,7 @@ static void update_numa_stats(struct numa_stats *ns, int nid) ns->nr_running += rq->nr_running; ns->load += weighted_cpuload(cpu); - ns->power += power_of(cpu); + ns->compute_capacity += capacity_of(cpu); cpus++; } @@ -996,15 +1056,16 @@ static void update_numa_stats(struct numa_stats *ns, int nid) * the @ns structure is NULL'ed and task_numa_compare() will * not find this node attractive. * - * We'll either bail at !has_capacity, or we'll detect a huge imbalance - * and bail there. + * We'll either bail at !has_free_capacity, or we'll detect a huge + * imbalance and bail there. */ if (!cpus) return; - ns->load = (ns->load * SCHED_POWER_SCALE) / ns->power; - ns->capacity = DIV_ROUND_CLOSEST(ns->power, SCHED_POWER_SCALE); - ns->has_capacity = (ns->nr_running < ns->capacity); + ns->load = (ns->load * SCHED_CAPACITY_SCALE) / ns->compute_capacity; + ns->task_capacity = + DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE); + ns->has_free_capacity = (ns->nr_running < ns->task_capacity); } struct task_numa_env { @@ -1035,6 +1096,34 @@ static void task_numa_assign(struct task_numa_env *env, env->best_cpu = env->dst_cpu; } +static bool load_too_imbalanced(long orig_src_load, long orig_dst_load, + long src_load, long dst_load, + struct task_numa_env *env) +{ + long imb, old_imb; + + /* We care about the slope of the imbalance, not the direction. */ + if (dst_load < src_load) + swap(dst_load, src_load); + + /* Is the difference below the threshold? */ + imb = dst_load * 100 - src_load * env->imbalance_pct; + if (imb <= 0) + return false; + + /* + * The imbalance is above the allowed threshold. + * Compare it with the old imbalance. + */ + if (orig_dst_load < orig_src_load) + swap(orig_dst_load, orig_src_load); + + old_imb = orig_dst_load * 100 - orig_src_load * env->imbalance_pct; + + /* Would this change make things worse? */ + return (imb > old_imb); +} + /* * This checks if the overall compute and NUMA accesses of the system would * be improved if the source tasks was migrated to the target dst_cpu taking @@ -1047,7 +1136,8 @@ static void task_numa_compare(struct task_numa_env *env, struct rq *src_rq = cpu_rq(env->src_cpu); struct rq *dst_rq = cpu_rq(env->dst_cpu); struct task_struct *cur; - long dst_load, src_load; + long orig_src_load, src_load; + long orig_dst_load, dst_load; long load; long imp = (groupimp > 0) ? groupimp : taskimp; @@ -1106,8 +1196,8 @@ static void task_numa_compare(struct task_numa_env *env, if (!cur) { /* Is there capacity at our destination? */ - if (env->src_stats.has_capacity && - !env->dst_stats.has_capacity) + if (env->src_stats.has_free_capacity && + !env->dst_stats.has_free_capacity) goto unlock; goto balance; @@ -1121,13 +1211,13 @@ static void task_numa_compare(struct task_numa_env *env, * In the overloaded case, try and keep the load balanced. */ balance: - dst_load = env->dst_stats.load; - src_load = env->src_stats.load; + orig_dst_load = env->dst_stats.load; + orig_src_load = env->src_stats.load; - /* XXX missing power terms */ + /* XXX missing capacity terms */ load = task_h_load(env->p); - dst_load += load; - src_load -= load; + dst_load = orig_dst_load + load; + src_load = orig_src_load - load; if (cur) { load = task_h_load(cur); @@ -1135,11 +1225,8 @@ balance: src_load += load; } - /* make src_load the smaller */ - if (dst_load < src_load) - swap(dst_load, src_load); - - if (src_load * env->imbalance_pct < dst_load * 100) + if (load_too_imbalanced(orig_src_load, orig_dst_load, + src_load, dst_load, env)) goto unlock; assign: @@ -1215,8 +1302,8 @@ static int task_numa_migrate(struct task_struct *p) groupimp = group_weight(p, env.dst_nid) - groupweight; update_numa_stats(&env.dst_stats, env.dst_nid); - /* If the preferred nid has capacity, try to use it. */ - if (env.dst_stats.has_capacity) + /* If the preferred nid has free capacity, try to use it. */ + if (env.dst_stats.has_free_capacity) task_numa_find_cpu(&env, taskimp, groupimp); /* No space available on the preferred nid. Look elsewhere. */ @@ -1241,7 +1328,16 @@ static int task_numa_migrate(struct task_struct *p) if (env.best_cpu == -1) return -EAGAIN; - sched_setnuma(p, env.dst_nid); + /* + * If the task is part of a workload that spans multiple NUMA nodes, + * and is migrating into one of the workload's active nodes, remember + * this node as the task's preferred numa node, so the workload can + * settle down. + * A task that migrated to a second choice node will be better off + * trying for a better one later. Do not set the preferred node here. + */ + if (p->numa_group && node_isset(env.dst_nid, p->numa_group->active_nodes)) + sched_setnuma(p, env.dst_nid); /* * Reset the scan period if the task is being rescheduled on an @@ -1266,12 +1362,15 @@ static int task_numa_migrate(struct task_struct *p) /* Attempt to migrate a task to a CPU on the preferred node. */ static void numa_migrate_preferred(struct task_struct *p) { + unsigned long interval = HZ; + /* This task has no NUMA fault statistics yet */ - if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults)) + if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory)) return; /* Periodically retry migrating the task to the preferred node */ - p->numa_migrate_retry = jiffies + HZ; + interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16); + p->numa_migrate_retry = jiffies + interval; /* Success if task is already running on preferred CPU */ if (task_node(p) == p->numa_preferred_nid) @@ -1282,6 +1381,38 @@ static void numa_migrate_preferred(struct task_struct *p) } /* + * Find the nodes on which the workload is actively running. We do this by + * tracking the nodes from which NUMA hinting faults are triggered. This can + * be different from the set of nodes where the workload's memory is currently + * located. + * + * The bitmask is used to make smarter decisions on when to do NUMA page + * migrations, To prevent flip-flopping, and excessive page migrations, nodes + * are added when they cause over 6/16 of the maximum number of faults, but + * only removed when they drop below 3/16. + */ +static void update_numa_active_node_mask(struct numa_group *numa_group) +{ + unsigned long faults, max_faults = 0; + int nid; + + for_each_online_node(nid) { + faults = group_faults_cpu(numa_group, nid); + if (faults > max_faults) + max_faults = faults; + } + + for_each_online_node(nid) { + faults = group_faults_cpu(numa_group, nid); + if (!node_isset(nid, numa_group->active_nodes)) { + if (faults > max_faults * 6 / 16) + node_set(nid, numa_group->active_nodes); + } else if (faults < max_faults * 3 / 16) + node_clear(nid, numa_group->active_nodes); + } +} + +/* * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS * increments. The more local the fault statistics are, the higher the scan * period will be for the next scan window. If local/remote ratio is below @@ -1355,11 +1486,41 @@ static void update_task_scan_period(struct task_struct *p, memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); } +/* + * Get the fraction of time the task has been running since the last + * NUMA placement cycle. The scheduler keeps similar statistics, but + * decays those on a 32ms period, which is orders of magnitude off + * from the dozens-of-seconds NUMA balancing period. Use the scheduler + * stats only if the task is so new there are no NUMA statistics yet. + */ +static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period) +{ + u64 runtime, delta, now; + /* Use the start of this time slice to avoid calculations. */ + now = p->se.exec_start; + runtime = p->se.sum_exec_runtime; + + if (p->last_task_numa_placement) { + delta = runtime - p->last_sum_exec_runtime; + *period = now - p->last_task_numa_placement; + } else { + delta = p->se.avg.runnable_avg_sum; + *period = p->se.avg.runnable_avg_period; + } + + p->last_sum_exec_runtime = runtime; + p->last_task_numa_placement = now; + + return delta; +} + static void task_numa_placement(struct task_struct *p) { int seq, nid, max_nid = -1, max_group_nid = -1; unsigned long max_faults = 0, max_group_faults = 0; unsigned long fault_types[2] = { 0, 0 }; + unsigned long total_faults; + u64 runtime, period; spinlock_t *group_lock = NULL; seq = ACCESS_ONCE(p->mm->numa_scan_seq); @@ -1368,10 +1529,14 @@ static void task_numa_placement(struct task_struct *p) p->numa_scan_seq = seq; p->numa_scan_period_max = task_scan_max(p); + total_faults = p->numa_faults_locality[0] + + p->numa_faults_locality[1]; + runtime = numa_get_avg_runtime(p, &period); + /* If the task is part of a group prevent parallel updates to group stats */ if (p->numa_group) { group_lock = &p->numa_group->lock; - spin_lock(group_lock); + spin_lock_irq(group_lock); } /* Find the node with the highest number of faults */ @@ -1379,24 +1544,37 @@ static void task_numa_placement(struct task_struct *p) unsigned long faults = 0, group_faults = 0; int priv, i; - for (priv = 0; priv < 2; priv++) { - long diff; + for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) { + long diff, f_diff, f_weight; i = task_faults_idx(nid, priv); - diff = -p->numa_faults[i]; /* Decay existing window, copy faults since last scan */ - p->numa_faults[i] >>= 1; - p->numa_faults[i] += p->numa_faults_buffer[i]; - fault_types[priv] += p->numa_faults_buffer[i]; - p->numa_faults_buffer[i] = 0; + diff = p->numa_faults_buffer_memory[i] - p->numa_faults_memory[i] / 2; + fault_types[priv] += p->numa_faults_buffer_memory[i]; + p->numa_faults_buffer_memory[i] = 0; - faults += p->numa_faults[i]; - diff += p->numa_faults[i]; + /* + * Normalize the faults_from, so all tasks in a group + * count according to CPU use, instead of by the raw + * number of faults. Tasks with little runtime have + * little over-all impact on throughput, and thus their + * faults are less important. + */ + f_weight = div64_u64(runtime << 16, period + 1); + f_weight = (f_weight * p->numa_faults_buffer_cpu[i]) / + (total_faults + 1); + f_diff = f_weight - p->numa_faults_cpu[i] / 2; + p->numa_faults_buffer_cpu[i] = 0; + + p->numa_faults_memory[i] += diff; + p->numa_faults_cpu[i] += f_diff; + faults += p->numa_faults_memory[i]; p->total_numa_faults += diff; if (p->numa_group) { /* safe because we can only change our own group */ p->numa_group->faults[i] += diff; + p->numa_group->faults_cpu[i] += f_diff; p->numa_group->total_faults += diff; group_faults += p->numa_group->faults[i]; } @@ -1416,6 +1594,7 @@ static void task_numa_placement(struct task_struct *p) update_task_scan_period(p, fault_types[0], fault_types[1]); if (p->numa_group) { + update_numa_active_node_mask(p->numa_group); /* * If the preferred task and group nids are different, * iterate over the nodes again to find the best place. @@ -1432,7 +1611,7 @@ static void task_numa_placement(struct task_struct *p) } } - spin_unlock(group_lock); + spin_unlock_irq(group_lock); } /* Preferred node as the node with the most faults */ @@ -1465,7 +1644,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, if (unlikely(!p->numa_group)) { unsigned int size = sizeof(struct numa_group) + - 2*nr_node_ids*sizeof(unsigned long); + 4*nr_node_ids*sizeof(unsigned long); grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); if (!grp) @@ -1475,9 +1654,14 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, spin_lock_init(&grp->lock); INIT_LIST_HEAD(&grp->task_list); grp->gid = p->pid; + /* Second half of the array tracks nids where faults happen */ + grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES * + nr_node_ids; - for (i = 0; i < 2*nr_node_ids; i++) - grp->faults[i] = p->numa_faults[i]; + node_set(task_node(current), grp->active_nodes); + + for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) + grp->faults[i] = p->numa_faults_memory[i]; grp->total_faults = p->total_numa_faults; @@ -1532,11 +1716,12 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, if (!join) return; - double_lock(&my_grp->lock, &grp->lock); + BUG_ON(irqs_disabled()); + double_lock_irq(&my_grp->lock, &grp->lock); - for (i = 0; i < 2*nr_node_ids; i++) { - my_grp->faults[i] -= p->numa_faults[i]; - grp->faults[i] += p->numa_faults[i]; + for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) { + my_grp->faults[i] -= p->numa_faults_memory[i]; + grp->faults[i] += p->numa_faults_memory[i]; } my_grp->total_faults -= p->total_numa_faults; grp->total_faults += p->total_numa_faults; @@ -1546,7 +1731,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, grp->nr_tasks++; spin_unlock(&my_grp->lock); - spin_unlock(&grp->lock); + spin_unlock_irq(&grp->lock); rcu_assign_pointer(p->numa_group, grp); @@ -1561,34 +1746,39 @@ no_join: void task_numa_free(struct task_struct *p) { struct numa_group *grp = p->numa_group; + void *numa_faults = p->numa_faults_memory; + unsigned long flags; int i; - void *numa_faults = p->numa_faults; if (grp) { - spin_lock(&grp->lock); - for (i = 0; i < 2*nr_node_ids; i++) - grp->faults[i] -= p->numa_faults[i]; + spin_lock_irqsave(&grp->lock, flags); + for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) + grp->faults[i] -= p->numa_faults_memory[i]; grp->total_faults -= p->total_numa_faults; list_del(&p->numa_entry); grp->nr_tasks--; - spin_unlock(&grp->lock); + spin_unlock_irqrestore(&grp->lock, flags); rcu_assign_pointer(p->numa_group, NULL); put_numa_group(grp); } - p->numa_faults = NULL; - p->numa_faults_buffer = NULL; + p->numa_faults_memory = NULL; + p->numa_faults_buffer_memory = NULL; + p->numa_faults_cpu= NULL; + p->numa_faults_buffer_cpu = NULL; kfree(numa_faults); } /* * Got a PROT_NONE fault for a page on @node. */ -void task_numa_fault(int last_cpupid, int node, int pages, int flags) +void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) { struct task_struct *p = current; bool migrated = flags & TNF_MIGRATED; + int cpu_node = task_node(current); + int local = !!(flags & TNF_FAULT_LOCAL); int priv; if (!numabalancing_enabled) @@ -1603,16 +1793,24 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags) return; /* Allocate buffer to track faults on a per-node basis */ - if (unlikely(!p->numa_faults)) { - int size = sizeof(*p->numa_faults) * 2 * nr_node_ids; + if (unlikely(!p->numa_faults_memory)) { + int size = sizeof(*p->numa_faults_memory) * + NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids; - /* numa_faults and numa_faults_buffer share the allocation */ - p->numa_faults = kzalloc(size * 2, GFP_KERNEL|__GFP_NOWARN); - if (!p->numa_faults) + p->numa_faults_memory = kzalloc(size, GFP_KERNEL|__GFP_NOWARN); + if (!p->numa_faults_memory) return; - BUG_ON(p->numa_faults_buffer); - p->numa_faults_buffer = p->numa_faults + (2 * nr_node_ids); + BUG_ON(p->numa_faults_buffer_memory); + /* + * The averaged statistics, shared & private, memory & cpu, + * occupy the first half of the array. The second half of the + * array is for current counters, which are averaged into the + * first set by task_numa_placement. + */ + p->numa_faults_cpu = p->numa_faults_memory + (2 * nr_node_ids); + p->numa_faults_buffer_memory = p->numa_faults_memory + (4 * nr_node_ids); + p->numa_faults_buffer_cpu = p->numa_faults_memory + (6 * nr_node_ids); p->total_numa_faults = 0; memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); } @@ -1629,6 +1827,17 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags) task_numa_group(p, last_cpupid, flags, &priv); } + /* + * If a workload spans multiple NUMA nodes, a shared fault that + * occurs wholly within the set of nodes that the workload is + * actively using should be counted as local. This allows the + * scan rate to slow down when a workload has settled down. + */ + if (!priv && !local && p->numa_group && + node_isset(cpu_node, p->numa_group->active_nodes) && + node_isset(mem_node, p->numa_group->active_nodes)) + local = 1; + task_numa_placement(p); /* @@ -1641,8 +1850,9 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags) if (migrated) p->numa_pages_migrated += pages; - p->numa_faults_buffer[task_faults_idx(node, priv)] += pages; - p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages; + p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages; + p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages; + p->numa_faults_locality[local] += pages; } static void reset_ptenuma_scan(struct task_struct *p) @@ -1757,6 +1967,8 @@ void task_numa_work(struct callback_head *work) start = end; if (pages <= 0) goto out; + + cond_resched(); } while (end != vma->vm_end); } @@ -2217,13 +2429,20 @@ static inline void __update_group_entity_contrib(struct sched_entity *se) se->avg.load_avg_contrib >>= NICE_0_SHIFT; } } -#else + +static inline void update_rq_runnable_avg(struct rq *rq, int runnable) +{ + __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable); + __update_tg_runnable_avg(&rq->avg, &rq->cfs); +} +#else /* CONFIG_FAIR_GROUP_SCHED */ static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, int force_update) {} static inline void __update_tg_runnable_avg(struct sched_avg *sa, struct cfs_rq *cfs_rq) {} static inline void __update_group_entity_contrib(struct sched_entity *se) {} -#endif +static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {} +#endif /* CONFIG_FAIR_GROUP_SCHED */ static inline void __update_task_entity_contrib(struct sched_entity *se) { @@ -2321,12 +2540,6 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update) __update_cfs_rq_tg_load_contrib(cfs_rq, force_update); } -static inline void update_rq_runnable_avg(struct rq *rq, int runnable) -{ - __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable); - __update_tg_runnable_avg(&rq->avg, &rq->cfs); -} - /* Add the load generated by se into cfs_rq's child load-average */ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, @@ -2414,7 +2627,10 @@ void idle_exit_fair(struct rq *this_rq) update_rq_runnable_avg(this_rq, 0); } -#else +static int idle_balance(struct rq *this_rq); + +#else /* CONFIG_SMP */ + static inline void update_entity_load_avg(struct sched_entity *se, int update_cfs_rq) {} static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {} @@ -2426,7 +2642,13 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, int sleep) {} static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update) {} -#endif + +static inline int idle_balance(struct rq *rq) +{ + return 0; +} + +#endif /* CONFIG_SMP */ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) { @@ -2576,10 +2798,10 @@ static void __clear_buddies_last(struct sched_entity *se) { for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - if (cfs_rq->last == se) - cfs_rq->last = NULL; - else + if (cfs_rq->last != se) break; + + cfs_rq->last = NULL; } } @@ -2587,10 +2809,10 @@ static void __clear_buddies_next(struct sched_entity *se) { for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - if (cfs_rq->next == se) - cfs_rq->next = NULL; - else + if (cfs_rq->next != se) break; + + cfs_rq->next = NULL; } } @@ -2598,10 +2820,10 @@ static void __clear_buddies_skip(struct sched_entity *se) { for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - if (cfs_rq->skip == se) - cfs_rq->skip = NULL; - else + if (cfs_rq->skip != se) break; + + cfs_rq->skip = NULL; } } @@ -2744,17 +2966,36 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); * 3) pick the "last" process, for cache locality * 4) do not run the "skip" process, if something else is available */ -static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) +static struct sched_entity * +pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) { - struct sched_entity *se = __pick_first_entity(cfs_rq); - struct sched_entity *left = se; + struct sched_entity *left = __pick_first_entity(cfs_rq); + struct sched_entity *se; + + /* + * If curr is set we have to see if its left of the leftmost entity + * still in the tree, provided there was anything in the tree at all. + */ + if (!left || (curr && entity_before(curr, left))) + left = curr; + + se = left; /* ideally we run the leftmost entity */ /* * Avoid running the skip buddy, if running something else can * be done without getting too unfair. */ if (cfs_rq->skip == se) { - struct sched_entity *second = __pick_next_entity(se); + struct sched_entity *second; + + if (se == curr) { + second = __pick_first_entity(cfs_rq); + } else { + second = __pick_next_entity(se); + if (!second || (curr && entity_before(curr, second))) + second = curr; + } + if (second && wakeup_preempt_entity(second, left) < 1) se = second; } @@ -2776,7 +3017,7 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) return se; } -static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq); +static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq); static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) { @@ -2940,7 +3181,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) */ if (!cfs_b->timer_active) { __refill_cfs_bandwidth_runtime(cfs_b); - __start_cfs_bandwidth(cfs_b); + __start_cfs_bandwidth(cfs_b, false); } if (cfs_b->runtime > 0) { @@ -2985,10 +3226,12 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq) * has not truly expired. * * Fortunately we can check determine whether this the case by checking - * whether the global deadline has advanced. + * whether the global deadline has advanced. It is valid to compare + * cfs_b->runtime_expires without any locks since we only care about + * exact equality, so a partial write will still work. */ - if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) { + if (cfs_rq->runtime_expires != cfs_b->runtime_expires) { /* extend local deadline, drift is bounded above by 2 ticks */ cfs_rq->runtime_expires += TICK_NSEC; } else { @@ -3112,14 +3355,14 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) } if (!se) - rq->nr_running -= task_delta; + sub_nr_running(rq, task_delta); cfs_rq->throttled = 1; cfs_rq->throttled_clock = rq_clock(rq); raw_spin_lock(&cfs_b->lock); list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); if (!cfs_b->timer_active) - __start_cfs_bandwidth(cfs_b); + __start_cfs_bandwidth(cfs_b, false); raw_spin_unlock(&cfs_b->lock); } @@ -3163,7 +3406,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) } if (!se) - rq->nr_running += task_delta; + add_nr_running(rq, task_delta); /* determine whether we need to wake up potentially idle cpu */ if (rq->curr == rq->idle && rq->cfs.nr_running) @@ -3217,21 +3460,21 @@ next: static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) { u64 runtime, runtime_expires; - int idle = 1, throttled; + int throttled; - raw_spin_lock(&cfs_b->lock); /* no need to continue the timer with no bandwidth constraint */ if (cfs_b->quota == RUNTIME_INF) - goto out_unlock; + goto out_deactivate; throttled = !list_empty(&cfs_b->throttled_cfs_rq); - /* idle depends on !throttled (for the case of a large deficit) */ - idle = cfs_b->idle && !throttled; cfs_b->nr_periods += overrun; - /* if we're going inactive then everything else can be deferred */ - if (idle) - goto out_unlock; + /* + * idle depends on !throttled (for the case of a large deficit), and if + * we're going inactive then everything else can be deferred + */ + if (cfs_b->idle && !throttled) + goto out_deactivate; /* * if we have relooped after returning idle once, we need to update our @@ -3245,7 +3488,7 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) if (!throttled) { /* mark as potentially idle for the upcoming period */ cfs_b->idle = 1; - goto out_unlock; + return 0; } /* account preceding periods in which throttling occurred */ @@ -3285,12 +3528,12 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) * timer to remain active while there are any throttled entities.) */ cfs_b->idle = 0; -out_unlock: - if (idle) - cfs_b->timer_active = 0; - raw_spin_unlock(&cfs_b->lock); - return idle; + return 0; + +out_deactivate: + cfs_b->timer_active = 0; + return 1; } /* a cfs_rq won't donate quota below this amount */ @@ -3431,22 +3674,23 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq) } /* conditionally throttle active cfs_rq's from put_prev_entity() */ -static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) +static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { if (!cfs_bandwidth_used()) - return; + return false; if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0)) - return; + return false; /* * it's possible for a throttled entity to be forced into a running * state (e.g. set_curr_task), in this case we're finished. */ if (cfs_rq_throttled(cfs_rq)) - return; + return true; throttle_cfs_rq(cfs_rq); + return true; } static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) @@ -3466,6 +3710,7 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) int overrun; int idle = 0; + raw_spin_lock(&cfs_b->lock); for (;;) { now = hrtimer_cb_get_time(timer); overrun = hrtimer_forward(timer, now, cfs_b->period); @@ -3475,6 +3720,7 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) idle = do_sched_cfs_period_timer(cfs_b, overrun); } + raw_spin_unlock(&cfs_b->lock); return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; } @@ -3500,7 +3746,7 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) } /* requires cfs_b->lock, may release to reprogram timer */ -void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) +void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force) { /* * The timer may be active because we're trying to set a new bandwidth @@ -3515,7 +3761,7 @@ void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) cpu_relax(); raw_spin_lock(&cfs_b->lock); /* if someone else restarted the timer then we're done */ - if (cfs_b->timer_active) + if (!force && cfs_b->timer_active) return; } @@ -3534,8 +3780,6 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) struct cfs_rq *cfs_rq; for_each_leaf_cfs_rq(rq, cfs_rq) { - struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); - if (!cfs_rq->runtime_enabled) continue; @@ -3543,7 +3787,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) * clock_task is not advancing so we just need to make sure * there's some valid quota amount */ - cfs_rq->runtime_remaining = cfs_b->quota; + cfs_rq->runtime_remaining = 1; if (cfs_rq_throttled(cfs_rq)) unthrottle_cfs_rq(cfs_rq); } @@ -3556,7 +3800,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) } static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {} -static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} +static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; } static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} @@ -3694,7 +3938,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (!se) { update_rq_runnable_avg(rq, rq->nr_running); - inc_nr_running(rq); + add_nr_running(rq, 1); } hrtick_update(rq); } @@ -3754,7 +3998,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) } if (!se) { - dec_nr_running(rq); + sub_nr_running(rq, 1); update_rq_runnable_avg(rq, 1); } hrtick_update(rq); @@ -3800,9 +4044,9 @@ static unsigned long target_load(int cpu, int type) return max(rq->cpu_load[type-1], total); } -static unsigned long power_of(int cpu) +static unsigned long capacity_of(int cpu) { - return cpu_rq(cpu)->cpu_power; + return cpu_rq(cpu)->cpu_capacity; } static unsigned long cpu_avg_load_per_task(int cpu) @@ -3824,8 +4068,8 @@ static void record_wakee(struct task_struct *p) * about the boundary, really active task won't care * about the loss. */ - if (jiffies > current->wakee_flip_decay_ts + HZ) { - current->wakee_flips = 0; + if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) { + current->wakee_flips >>= 1; current->wakee_flip_decay_ts = jiffies; } @@ -4045,12 +4289,12 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) s64 this_eff_load, prev_eff_load; this_eff_load = 100; - this_eff_load *= power_of(prev_cpu); + this_eff_load *= capacity_of(prev_cpu); this_eff_load *= this_load + effective_load(tg, this_cpu, weight, weight); prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; - prev_eff_load *= power_of(this_cpu); + prev_eff_load *= capacity_of(this_cpu); prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight); balanced = this_eff_load <= prev_eff_load; @@ -4126,8 +4370,8 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, avg_load += load; } - /* Adjust by relative CPU power of the group */ - avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power; + /* Adjust by relative CPU capacity of the group */ + avg_load = (avg_load * SCHED_CAPACITY_SCALE) / group->sgc->capacity; if (local_group) { this_load = avg_load; @@ -4211,13 +4455,14 @@ done: } /* - * sched_balance_self: balance the current task (running on cpu) in domains - * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and - * SD_BALANCE_EXEC. + * select_task_rq_fair: Select target runqueue for the waking task in domains + * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, + * SD_BALANCE_FORK, or SD_BALANCE_EXEC. * - * Balance, ie. select the least loaded group. + * Balances load by selecting the idlest cpu in the idlest group, or under + * certain conditions an idle sibling cpu if the domain has SD_WAKE_AFFINE set. * - * Returns the target CPU number, or the same CPU if no balancing is needed. + * Returns the target cpu number. * * preempt must be disabled. */ @@ -4258,10 +4503,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f sd = tmp; } - if (affine_sd) { - if (cpu != prev_cpu && wake_affine(affine_sd, p, sync)) - prev_cpu = cpu; + if (affine_sd && cpu != prev_cpu && wake_affine(affine_sd, p, sync)) + prev_cpu = cpu; + if (sd_flag & SD_BALANCE_WAKE) { new_cpu = select_idle_sibling(p, prev_cpu); goto unlock; } @@ -4329,6 +4574,9 @@ migrate_task_rq_fair(struct task_struct *p, int next_cpu) atomic_long_add(se->avg.load_avg_contrib, &cfs_rq->removed_load); } + + /* We have migrated, no longer consider this task hot */ + se->exec_start = 0; } #endif /* CONFIG_SMP */ @@ -4492,26 +4740,124 @@ preempt: set_last_buddy(se); } -static struct task_struct *pick_next_task_fair(struct rq *rq) +static struct task_struct * +pick_next_task_fair(struct rq *rq, struct task_struct *prev) { - struct task_struct *p; struct cfs_rq *cfs_rq = &rq->cfs; struct sched_entity *se; + struct task_struct *p; + int new_tasks; +again: +#ifdef CONFIG_FAIR_GROUP_SCHED if (!cfs_rq->nr_running) - return NULL; + goto idle; + + if (prev->sched_class != &fair_sched_class) + goto simple; + + /* + * Because of the set_next_buddy() in dequeue_task_fair() it is rather + * likely that a next task is from the same cgroup as the current. + * + * Therefore attempt to avoid putting and setting the entire cgroup + * hierarchy, only change the part that actually changes. + */ do { - se = pick_next_entity(cfs_rq); + struct sched_entity *curr = cfs_rq->curr; + + /* + * Since we got here without doing put_prev_entity() we also + * have to consider cfs_rq->curr. If it is still a runnable + * entity, update_curr() will update its vruntime, otherwise + * forget we've ever seen it. + */ + if (curr && curr->on_rq) + update_curr(cfs_rq); + else + curr = NULL; + + /* + * This call to check_cfs_rq_runtime() will do the throttle and + * dequeue its entity in the parent(s). Therefore the 'simple' + * nr_running test will indeed be correct. + */ + if (unlikely(check_cfs_rq_runtime(cfs_rq))) + goto simple; + + se = pick_next_entity(cfs_rq, curr); + cfs_rq = group_cfs_rq(se); + } while (cfs_rq); + + p = task_of(se); + + /* + * Since we haven't yet done put_prev_entity and if the selected task + * is a different task than we started out with, try and touch the + * least amount of cfs_rqs. + */ + if (prev != p) { + struct sched_entity *pse = &prev->se; + + while (!(cfs_rq = is_same_group(se, pse))) { + int se_depth = se->depth; + int pse_depth = pse->depth; + + if (se_depth <= pse_depth) { + put_prev_entity(cfs_rq_of(pse), pse); + pse = parent_entity(pse); + } + if (se_depth >= pse_depth) { + set_next_entity(cfs_rq_of(se), se); + se = parent_entity(se); + } + } + + put_prev_entity(cfs_rq, pse); + set_next_entity(cfs_rq, se); + } + + if (hrtick_enabled(rq)) + hrtick_start_fair(rq, p); + + return p; +simple: + cfs_rq = &rq->cfs; +#endif + + if (!cfs_rq->nr_running) + goto idle; + + put_prev_task(rq, prev); + + do { + se = pick_next_entity(cfs_rq, NULL); set_next_entity(cfs_rq, se); cfs_rq = group_cfs_rq(se); } while (cfs_rq); p = task_of(se); + if (hrtick_enabled(rq)) hrtick_start_fair(rq, p); return p; + +idle: + new_tasks = idle_balance(rq); + /* + * Because idle_balance() releases (and re-acquires) rq->lock, it is + * possible for any higher priority task to appear. In that case we + * must re-start the pick_next_entity() loop. + */ + if (new_tasks < 0) + return RETRY_TASK; + + if (new_tasks > 0) + goto again; + + return NULL; } /* @@ -4605,14 +4951,14 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp * * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3) * - * P_i is the cpu power (or compute capacity) of cpu i, typically it is the + * C_i is the compute capacity of cpu i, typically it is the * fraction of 'recent' time available for SCHED_OTHER task execution. But it * can also include other factors [XXX]. * * To achieve this balance we define a measure of imbalance which follows * directly from (1): * - * imb_i,j = max{ avg(W/P), W_i/P_i } - min{ avg(W/P), W_j/P_j } (4) + * imb_i,j = max{ avg(W/C), W_i/C_i } - min{ avg(W/C), W_j/C_j } (4) * * We them move tasks around to minimize the imbalance. In the continuous * function space it is obvious this converges, in the discrete case we get @@ -4749,7 +5095,7 @@ static void move_task(struct task_struct *p, struct lb_env *env) * Is this task likely cache-hot: */ static int -task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) +task_hot(struct task_struct *p, u64 now) { s64 delta; @@ -4781,9 +5127,10 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) /* Returns true if the destination node has incurred more faults */ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) { + struct numa_group *numa_group = rcu_dereference(p->numa_group); int src_nid, dst_nid; - if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults || + if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory || !(env->sd->flags & SD_NUMA)) { return false; } @@ -4794,27 +5141,35 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) if (src_nid == dst_nid) return false; - /* Always encourage migration to the preferred node. */ - if (dst_nid == p->numa_preferred_nid) - return true; + if (numa_group) { + /* Task is already in the group's interleave set. */ + if (node_isset(src_nid, numa_group->active_nodes)) + return false; + + /* Task is moving into the group's interleave set. */ + if (node_isset(dst_nid, numa_group->active_nodes)) + return true; + + return group_faults(p, dst_nid) > group_faults(p, src_nid); + } - /* If both task and group weight improve, this move is a winner. */ - if (task_weight(p, dst_nid) > task_weight(p, src_nid) && - group_weight(p, dst_nid) > group_weight(p, src_nid)) + /* Encourage migration to the preferred node. */ + if (dst_nid == p->numa_preferred_nid) return true; - return false; + return task_faults(p, dst_nid) > task_faults(p, src_nid); } static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) { + struct numa_group *numa_group = rcu_dereference(p->numa_group); int src_nid, dst_nid; if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) return false; - if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) + if (!p->numa_faults_memory || !(env->sd->flags & SD_NUMA)) return false; src_nid = cpu_to_node(env->src_cpu); @@ -4823,16 +5178,23 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) if (src_nid == dst_nid) return false; + if (numa_group) { + /* Task is moving within/into the group's interleave set. */ + if (node_isset(dst_nid, numa_group->active_nodes)) + return false; + + /* Task is moving out of the group's interleave set. */ + if (node_isset(src_nid, numa_group->active_nodes)) + return true; + + return group_faults(p, dst_nid) < group_faults(p, src_nid); + } + /* Migrating away from the preferred node is always bad. */ if (src_nid == p->numa_preferred_nid) return true; - /* If either task or group weight get worse, don't do it. */ - if (task_weight(p, dst_nid) < task_weight(p, src_nid) || - group_weight(p, dst_nid) < group_weight(p, src_nid)) - return true; - - return false; + return task_faults(p, dst_nid) < task_faults(p, src_nid); } #else @@ -4910,7 +5272,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) * 2) task is cache cold, or * 3) too many balance attempts have failed. */ - tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd); + tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq)); if (!tsk_cache_hot) tsk_cache_hot = migrate_degrades_locality(p, env); @@ -5171,13 +5533,13 @@ struct sg_lb_stats { unsigned long group_load; /* Total load over the CPUs of the group */ unsigned long sum_weighted_load; /* Weighted load of group's tasks */ unsigned long load_per_task; - unsigned long group_power; + unsigned long group_capacity; unsigned int sum_nr_running; /* Nr tasks running in the group */ - unsigned int group_capacity; + unsigned int group_capacity_factor; unsigned int idle_cpus; unsigned int group_weight; int group_imb; /* Is there an imbalance in the group ? */ - int group_has_capacity; /* Is there extra capacity in the group? */ + int group_has_free_capacity; #ifdef CONFIG_NUMA_BALANCING unsigned int nr_numa_running; unsigned int nr_preferred_running; @@ -5192,7 +5554,7 @@ struct sd_lb_stats { struct sched_group *busiest; /* Busiest group in this sd */ struct sched_group *local; /* Local group in this sd */ unsigned long total_load; /* Total load of all groups in sd */ - unsigned long total_pwr; /* Total power of all groups in sd */ + unsigned long total_capacity; /* Total capacity of all groups in sd */ unsigned long avg_load; /* Average load across all groups in sd */ struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */ @@ -5211,7 +5573,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) .busiest = NULL, .local = NULL, .total_load = 0UL, - .total_pwr = 0UL, + .total_capacity = 0UL, .busiest_stat = { .avg_load = 0UL, }, @@ -5246,17 +5608,17 @@ static inline int get_sd_load_idx(struct sched_domain *sd, return load_idx; } -static unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) +static unsigned long default_scale_capacity(struct sched_domain *sd, int cpu) { - return SCHED_POWER_SCALE; + return SCHED_CAPACITY_SCALE; } -unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) +unsigned long __weak arch_scale_freq_capacity(struct sched_domain *sd, int cpu) { - return default_scale_freq_power(sd, cpu); + return default_scale_capacity(sd, cpu); } -static unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) +static unsigned long default_scale_smt_capacity(struct sched_domain *sd, int cpu) { unsigned long weight = sd->span_weight; unsigned long smt_gain = sd->smt_gain; @@ -5266,15 +5628,16 @@ static unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) return smt_gain; } -unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) +unsigned long __weak arch_scale_smt_capacity(struct sched_domain *sd, int cpu) { - return default_scale_smt_power(sd, cpu); + return default_scale_smt_capacity(sd, cpu); } -static unsigned long scale_rt_power(int cpu) +static unsigned long scale_rt_capacity(int cpu) { struct rq *rq = cpu_rq(cpu); u64 total, available, age_stamp, avg; + s64 delta; /* * Since we're reading these variables without serialization make sure @@ -5283,74 +5646,78 @@ static unsigned long scale_rt_power(int cpu) age_stamp = ACCESS_ONCE(rq->age_stamp); avg = ACCESS_ONCE(rq->rt_avg); - total = sched_avg_period() + (rq_clock(rq) - age_stamp); + delta = rq_clock(rq) - age_stamp; + if (unlikely(delta < 0)) + delta = 0; + + total = sched_avg_period() + delta; if (unlikely(total < avg)) { - /* Ensures that power won't end up being negative */ + /* Ensures that capacity won't end up being negative */ available = 0; } else { available = total - avg; } - if (unlikely((s64)total < SCHED_POWER_SCALE)) - total = SCHED_POWER_SCALE; + if (unlikely((s64)total < SCHED_CAPACITY_SCALE)) + total = SCHED_CAPACITY_SCALE; - total >>= SCHED_POWER_SHIFT; + total >>= SCHED_CAPACITY_SHIFT; return div_u64(available, total); } -static void update_cpu_power(struct sched_domain *sd, int cpu) +static void update_cpu_capacity(struct sched_domain *sd, int cpu) { unsigned long weight = sd->span_weight; - unsigned long power = SCHED_POWER_SCALE; + unsigned long capacity = SCHED_CAPACITY_SCALE; struct sched_group *sdg = sd->groups; - if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { - if (sched_feat(ARCH_POWER)) - power *= arch_scale_smt_power(sd, cpu); + if ((sd->flags & SD_SHARE_CPUCAPACITY) && weight > 1) { + if (sched_feat(ARCH_CAPACITY)) + capacity *= arch_scale_smt_capacity(sd, cpu); else - power *= default_scale_smt_power(sd, cpu); + capacity *= default_scale_smt_capacity(sd, cpu); - power >>= SCHED_POWER_SHIFT; + capacity >>= SCHED_CAPACITY_SHIFT; } - sdg->sgp->power_orig = power; + sdg->sgc->capacity_orig = capacity; - if (sched_feat(ARCH_POWER)) - power *= arch_scale_freq_power(sd, cpu); + if (sched_feat(ARCH_CAPACITY)) + capacity *= arch_scale_freq_capacity(sd, cpu); else - power *= default_scale_freq_power(sd, cpu); + capacity *= default_scale_capacity(sd, cpu); - power >>= SCHED_POWER_SHIFT; + capacity >>= SCHED_CAPACITY_SHIFT; - power *= scale_rt_power(cpu); - power >>= SCHED_POWER_SHIFT; + capacity *= scale_rt_capacity(cpu); + capacity >>= SCHED_CAPACITY_SHIFT; - if (!power) - power = 1; + if (!capacity) + capacity = 1; - cpu_rq(cpu)->cpu_power = power; - sdg->sgp->power = power; + cpu_rq(cpu)->cpu_capacity = capacity; + sdg->sgc->capacity = capacity; } -void update_group_power(struct sched_domain *sd, int cpu) +void update_group_capacity(struct sched_domain *sd, int cpu) { struct sched_domain *child = sd->child; struct sched_group *group, *sdg = sd->groups; - unsigned long power, power_orig; + unsigned long capacity, capacity_orig; unsigned long interval; interval = msecs_to_jiffies(sd->balance_interval); interval = clamp(interval, 1UL, max_load_balance_interval); - sdg->sgp->next_update = jiffies + interval; + sdg->sgc->next_update = jiffies + interval; if (!child) { - update_cpu_power(sd, cpu); + update_cpu_capacity(sd, cpu); return; } - power_orig = power = 0; + capacity_orig = capacity = 0; if (child->flags & SD_OVERLAP) { /* @@ -5359,31 +5726,31 @@ void update_group_power(struct sched_domain *sd, int cpu) */ for_each_cpu(cpu, sched_group_cpus(sdg)) { - struct sched_group_power *sgp; + struct sched_group_capacity *sgc; struct rq *rq = cpu_rq(cpu); /* - * build_sched_domains() -> init_sched_groups_power() + * build_sched_domains() -> init_sched_groups_capacity() * gets here before we've attached the domains to the * runqueues. * - * Use power_of(), which is set irrespective of domains - * in update_cpu_power(). + * Use capacity_of(), which is set irrespective of domains + * in update_cpu_capacity(). * - * This avoids power/power_orig from being 0 and + * This avoids capacity/capacity_orig from being 0 and * causing divide-by-zero issues on boot. * - * Runtime updates will correct power_orig. + * Runtime updates will correct capacity_orig. */ if (unlikely(!rq->sd)) { - power_orig += power_of(cpu); - power += power_of(cpu); + capacity_orig += capacity_of(cpu); + capacity += capacity_of(cpu); continue; } - sgp = rq->sd->groups->sgp; - power_orig += sgp->power_orig; - power += sgp->power; + sgc = rq->sd->groups->sgc; + capacity_orig += sgc->capacity_orig; + capacity += sgc->capacity; } } else { /* @@ -5393,14 +5760,14 @@ void update_group_power(struct sched_domain *sd, int cpu) group = child->groups; do { - power_orig += group->sgp->power_orig; - power += group->sgp->power; + capacity_orig += group->sgc->capacity_orig; + capacity += group->sgc->capacity; group = group->next; } while (group != child->groups); } - sdg->sgp->power_orig = power_orig; - sdg->sgp->power = power; + sdg->sgc->capacity_orig = capacity_orig; + sdg->sgc->capacity = capacity; } /* @@ -5414,15 +5781,15 @@ static inline int fix_small_capacity(struct sched_domain *sd, struct sched_group *group) { /* - * Only siblings can have significantly less than SCHED_POWER_SCALE + * Only siblings can have significantly less than SCHED_CAPACITY_SCALE */ - if (!(sd->flags & SD_SHARE_CPUPOWER)) + if (!(sd->flags & SD_SHARE_CPUCAPACITY)) return 0; /* - * If ~90% of the cpu_power is still there, we're good. + * If ~90% of the cpu_capacity is still there, we're good. */ - if (group->sgp->power * 32 > group->sgp->power_orig * 29) + if (group->sgc->capacity * 32 > group->sgc->capacity_orig * 29) return 1; return 0; @@ -5459,34 +5826,35 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) static inline int sg_imbalanced(struct sched_group *group) { - return group->sgp->imbalance; + return group->sgc->imbalance; } /* - * Compute the group capacity. + * Compute the group capacity factor. * - * Avoid the issue where N*frac(smt_power) >= 1 creates 'phantom' cores by + * Avoid the issue where N*frac(smt_capacity) >= 1 creates 'phantom' cores by * first dividing out the smt factor and computing the actual number of cores - * and limit power unit capacity with that. + * and limit unit capacity with that. */ -static inline int sg_capacity(struct lb_env *env, struct sched_group *group) +static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *group) { - unsigned int capacity, smt, cpus; - unsigned int power, power_orig; + unsigned int capacity_factor, smt, cpus; + unsigned int capacity, capacity_orig; - power = group->sgp->power; - power_orig = group->sgp->power_orig; + capacity = group->sgc->capacity; + capacity_orig = group->sgc->capacity_orig; cpus = group->group_weight; - /* smt := ceil(cpus / power), assumes: 1 < smt_power < 2 */ - smt = DIV_ROUND_UP(SCHED_POWER_SCALE * cpus, power_orig); - capacity = cpus / smt; /* cores */ + /* smt := ceil(cpus / capacity), assumes: 1 < smt_capacity < 2 */ + smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, capacity_orig); + capacity_factor = cpus / smt; /* cores */ - capacity = min_t(unsigned, capacity, DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE)); - if (!capacity) - capacity = fix_small_capacity(env->sd, group); + capacity_factor = min_t(unsigned, + capacity_factor, DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE)); + if (!capacity_factor) + capacity_factor = fix_small_capacity(env->sd, group); - return capacity; + return capacity_factor; } /** @@ -5526,9 +5894,9 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->idle_cpus++; } - /* Adjust by relative CPU power of the group */ - sgs->group_power = group->sgp->power; - sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power; + /* Adjust by relative CPU capacity of the group */ + sgs->group_capacity = group->sgc->capacity; + sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity; if (sgs->sum_nr_running) sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; @@ -5536,10 +5904,10 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->group_weight = group->group_weight; sgs->group_imb = sg_imbalanced(group); - sgs->group_capacity = sg_capacity(env, group); + sgs->group_capacity_factor = sg_capacity_factor(env, group); - if (sgs->group_capacity > sgs->sum_nr_running) - sgs->group_has_capacity = 1; + if (sgs->group_capacity_factor > sgs->sum_nr_running) + sgs->group_has_free_capacity = 1; } /** @@ -5563,7 +5931,7 @@ static bool update_sd_pick_busiest(struct lb_env *env, if (sgs->avg_load <= sds->busiest_stat.avg_load) return false; - if (sgs->sum_nr_running > sgs->group_capacity) + if (sgs->sum_nr_running > sgs->group_capacity_factor) return true; if (sgs->group_imb) @@ -5643,8 +6011,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd sgs = &sds->local_stat; if (env->idle != CPU_NEWLY_IDLE || - time_after_eq(jiffies, sg->sgp->next_update)) - update_group_power(env->sd, env->dst_cpu); + time_after_eq(jiffies, sg->sgc->next_update)) + update_group_capacity(env->sd, env->dst_cpu); } update_sg_lb_stats(env, sg, load_idx, local_group, sgs); @@ -5654,17 +6022,17 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd /* * In case the child domain prefers tasks go to siblings - * first, lower the sg capacity to one so that we'll try + * first, lower the sg capacity factor to one so that we'll try * and move all the excess tasks away. We lower the capacity * of a group only if the local group has the capacity to fit - * these excess tasks, i.e. nr_running < group_capacity. The + * these excess tasks, i.e. nr_running < group_capacity_factor. The * extra check prevents the case where you always pull from the * heaviest group when it is already under-utilized (possible * with a large weight task outweighs the tasks on the system). */ if (prefer_sibling && sds->local && - sds->local_stat.group_has_capacity) - sgs->group_capacity = min(sgs->group_capacity, 1U); + sds->local_stat.group_has_free_capacity) + sgs->group_capacity_factor = min(sgs->group_capacity_factor, 1U); if (update_sd_pick_busiest(env, sds, sg, sgs)) { sds->busiest = sg; @@ -5674,7 +6042,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd next_group: /* Now, start updating sd_lb_stats */ sds->total_load += sgs->group_load; - sds->total_pwr += sgs->group_power; + sds->total_capacity += sgs->group_capacity; sg = sg->next; } while (sg != env->sd->groups); @@ -5721,8 +6089,8 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) return 0; env->imbalance = DIV_ROUND_CLOSEST( - sds->busiest_stat.avg_load * sds->busiest_stat.group_power, - SCHED_POWER_SCALE); + sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity, + SCHED_CAPACITY_SCALE); return 1; } @@ -5737,7 +6105,7 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) static inline void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) { - unsigned long tmp, pwr_now = 0, pwr_move = 0; + unsigned long tmp, capa_now = 0, capa_move = 0; unsigned int imbn = 2; unsigned long scaled_busy_load_per_task; struct sg_lb_stats *local, *busiest; @@ -5751,8 +6119,8 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) imbn = 1; scaled_busy_load_per_task = - (busiest->load_per_task * SCHED_POWER_SCALE) / - busiest->group_power; + (busiest->load_per_task * SCHED_CAPACITY_SCALE) / + busiest->group_capacity; if (busiest->avg_load + scaled_busy_load_per_task >= local->avg_load + (scaled_busy_load_per_task * imbn)) { @@ -5762,40 +6130,38 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) /* * OK, we don't have enough imbalance to justify moving tasks, - * however we may be able to increase total CPU power used by + * however we may be able to increase total CPU capacity used by * moving them. */ - pwr_now += busiest->group_power * + capa_now += busiest->group_capacity * min(busiest->load_per_task, busiest->avg_load); - pwr_now += local->group_power * + capa_now += local->group_capacity * min(local->load_per_task, local->avg_load); - pwr_now /= SCHED_POWER_SCALE; + capa_now /= SCHED_CAPACITY_SCALE; /* Amount of load we'd subtract */ - tmp = (busiest->load_per_task * SCHED_POWER_SCALE) / - busiest->group_power; - if (busiest->avg_load > tmp) { - pwr_move += busiest->group_power * + if (busiest->avg_load > scaled_busy_load_per_task) { + capa_move += busiest->group_capacity * min(busiest->load_per_task, - busiest->avg_load - tmp); + busiest->avg_load - scaled_busy_load_per_task); } /* Amount of load we'd add */ - if (busiest->avg_load * busiest->group_power < - busiest->load_per_task * SCHED_POWER_SCALE) { - tmp = (busiest->avg_load * busiest->group_power) / - local->group_power; + if (busiest->avg_load * busiest->group_capacity < + busiest->load_per_task * SCHED_CAPACITY_SCALE) { + tmp = (busiest->avg_load * busiest->group_capacity) / + local->group_capacity; } else { - tmp = (busiest->load_per_task * SCHED_POWER_SCALE) / - local->group_power; + tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) / + local->group_capacity; } - pwr_move += local->group_power * + capa_move += local->group_capacity * min(local->load_per_task, local->avg_load + tmp); - pwr_move /= SCHED_POWER_SCALE; + capa_move /= SCHED_CAPACITY_SCALE; /* Move if we gain throughput */ - if (pwr_move > pwr_now) + if (capa_move > capa_now) env->imbalance = busiest->load_per_task; } @@ -5825,7 +6191,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s /* * In the presence of smp nice balancing, certain scenarios can have * max load less than avg load(as we skip the groups at or below - * its cpu_power, while calculating max_load..) + * its cpu_capacity, while calculating max_load..) */ if (busiest->avg_load <= sds->avg_load || local->avg_load >= sds->avg_load) { @@ -5840,10 +6206,10 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * have to drop below capacity to reach cpu-load equilibrium. */ load_above_capacity = - (busiest->sum_nr_running - busiest->group_capacity); + (busiest->sum_nr_running - busiest->group_capacity_factor); - load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE); - load_above_capacity /= busiest->group_power; + load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_CAPACITY_SCALE); + load_above_capacity /= busiest->group_capacity; } /* @@ -5858,9 +6224,9 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s /* How much load to actually move to equalise the imbalance */ env->imbalance = min( - max_pull * busiest->group_power, - (sds->avg_load - local->avg_load) * local->group_power - ) / SCHED_POWER_SCALE; + max_pull * busiest->group_capacity, + (sds->avg_load - local->avg_load) * local->group_capacity + ) / SCHED_CAPACITY_SCALE; /* * if *imbalance is less than the average load per runnable task @@ -5914,7 +6280,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env) if (!sds.busiest || busiest->sum_nr_running == 0) goto out_balanced; - sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr; + sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load) + / sds.total_capacity; /* * If the busiest group is imbalanced the below checks don't @@ -5925,8 +6292,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env) goto force_balance; /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ - if (env->idle == CPU_NEWLY_IDLE && local->group_has_capacity && - !busiest->group_has_capacity) + if (env->idle == CPU_NEWLY_IDLE && local->group_has_free_capacity && + !busiest->group_has_free_capacity) goto force_balance; /* @@ -5980,11 +6347,11 @@ static struct rq *find_busiest_queue(struct lb_env *env, struct sched_group *group) { struct rq *busiest = NULL, *rq; - unsigned long busiest_load = 0, busiest_power = 1; + unsigned long busiest_load = 0, busiest_capacity = 1; int i; for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { - unsigned long power, capacity, wl; + unsigned long capacity, capacity_factor, wl; enum fbq_type rt; rq = cpu_rq(i); @@ -6012,34 +6379,34 @@ static struct rq *find_busiest_queue(struct lb_env *env, if (rt > env->fbq_type) continue; - power = power_of(i); - capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE); - if (!capacity) - capacity = fix_small_capacity(env->sd, group); + capacity = capacity_of(i); + capacity_factor = DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE); + if (!capacity_factor) + capacity_factor = fix_small_capacity(env->sd, group); wl = weighted_cpuload(i); /* * When comparing with imbalance, use weighted_cpuload() - * which is not scaled with the cpu power. + * which is not scaled with the cpu capacity. */ - if (capacity && rq->nr_running == 1 && wl > env->imbalance) + if (capacity_factor && rq->nr_running == 1 && wl > env->imbalance) continue; /* * For the load comparisons with the other cpu's, consider - * the weighted_cpuload() scaled with the cpu power, so that - * the load can be moved away from the cpu that is potentially - * running at a lower capacity. + * the weighted_cpuload() scaled with the cpu capacity, so + * that the load can be moved away from the cpu that is + * potentially running at a lower capacity. * - * Thus we're looking for max(wl_i / power_i), crosswise + * Thus we're looking for max(wl_i / capacity_i), crosswise * multiplication to rid ourselves of the division works out - * to: wl_i * power_j > wl_j * power_i; where j is our - * previous maximum. + * to: wl_i * capacity_j > wl_j * capacity_i; where j is + * our previous maximum. */ - if (wl * busiest_power > busiest_load * power) { + if (wl * busiest_capacity > busiest_load * capacity) { busiest_load = wl; - busiest_power = power; + busiest_capacity = capacity; busiest = rq; } } @@ -6247,7 +6614,7 @@ more_balance: * We failed to reach balance because of affinity. */ if (sd_parent) { - int *group_imbalance = &sd_parent->groups->sgp->imbalance; + int *group_imbalance = &sd_parent->groups->sgc->imbalance; if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { *group_imbalance = 1; @@ -6353,21 +6720,62 @@ out: return ld_moved; } +static inline unsigned long +get_sd_balance_interval(struct sched_domain *sd, int cpu_busy) +{ + unsigned long interval = sd->balance_interval; + + if (cpu_busy) + interval *= sd->busy_factor; + + /* scale ms to jiffies */ + interval = msecs_to_jiffies(interval); + interval = clamp(interval, 1UL, max_load_balance_interval); + + return interval; +} + +static inline void +update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance) +{ + unsigned long interval, next; + + interval = get_sd_balance_interval(sd, cpu_busy); + next = sd->last_balance + interval; + + if (time_after(*next_balance, next)) + *next_balance = next; +} + /* * idle_balance is called by schedule() if this_cpu is about to become * idle. Attempts to pull tasks from other CPUs. */ -void idle_balance(int this_cpu, struct rq *this_rq) +static int idle_balance(struct rq *this_rq) { + unsigned long next_balance = jiffies + HZ; + int this_cpu = this_rq->cpu; struct sched_domain *sd; int pulled_task = 0; - unsigned long next_balance = jiffies + HZ; u64 curr_cost = 0; + idle_enter_fair(this_rq); + + /* + * We must set idle_stamp _before_ calling idle_balance(), such that we + * measure the duration of idle_balance() as idle time. + */ this_rq->idle_stamp = rq_clock(this_rq); - if (this_rq->avg_idle < sysctl_sched_migration_cost) - return; + if (this_rq->avg_idle < sysctl_sched_migration_cost) { + rcu_read_lock(); + sd = rcu_dereference_check_sched_domain(this_rq->sd); + if (sd) + update_next_balance(sd, 0, &next_balance); + rcu_read_unlock(); + + goto out; + } /* * Drop the rq->lock, but keep IRQ/preempt disabled. @@ -6377,20 +6785,20 @@ void idle_balance(int this_cpu, struct rq *this_rq) update_blocked_averages(this_cpu); rcu_read_lock(); for_each_domain(this_cpu, sd) { - unsigned long interval; int continue_balancing = 1; u64 t0, domain_cost; if (!(sd->flags & SD_LOAD_BALANCE)) continue; - if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) + if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) { + update_next_balance(sd, 0, &next_balance); break; + } if (sd->flags & SD_BALANCE_NEWIDLE) { t0 = sched_clock_cpu(this_cpu); - /* If we've pulled tasks over stop searching: */ pulled_task = load_balance(this_cpu, this_rq, sd, CPU_NEWLY_IDLE, &continue_balancing); @@ -6402,28 +6810,45 @@ void idle_balance(int this_cpu, struct rq *this_rq) curr_cost += domain_cost; } - interval = msecs_to_jiffies(sd->balance_interval); - if (time_after(next_balance, sd->last_balance + interval)) - next_balance = sd->last_balance + interval; - if (pulled_task) { - this_rq->idle_stamp = 0; + update_next_balance(sd, 0, &next_balance); + + /* + * Stop searching for tasks to pull if there are + * now runnable tasks on this rq. + */ + if (pulled_task || this_rq->nr_running > 0) break; - } } rcu_read_unlock(); raw_spin_lock(&this_rq->lock); - if (pulled_task || time_after(jiffies, this_rq->next_balance)) { - /* - * We are going idle. next_balance may be set based on - * a busy processor. So reset next_balance. - */ + if (curr_cost > this_rq->max_idle_balance_cost) + this_rq->max_idle_balance_cost = curr_cost; + + /* + * While browsing the domains, we released the rq lock, a task could + * have been enqueued in the meantime. Since we're not going idle, + * pretend we pulled a task. + */ + if (this_rq->cfs.h_nr_running && !pulled_task) + pulled_task = 1; + +out: + /* Move the next balance forward */ + if (time_after(this_rq->next_balance, next_balance)) this_rq->next_balance = next_balance; + + /* Is there a task of a high priority class? */ + if (this_rq->nr_running != this_rq->cfs.h_nr_running) + pulled_task = -1; + + if (pulled_task) { + idle_exit_fair(this_rq); + this_rq->idle_stamp = 0; } - if (curr_cost > this_rq->max_idle_balance_cost) - this_rq->max_idle_balance_cost = curr_cost; + return pulled_task; } /* @@ -6494,6 +6919,11 @@ out_unlock: return 0; } +static inline int on_null_domain(struct rq *rq) +{ + return unlikely(!rcu_dereference_sched(rq->sd)); +} + #ifdef CONFIG_NO_HZ_COMMON /* * idle load balancing details @@ -6548,8 +6978,13 @@ static void nohz_balancer_kick(void) static inline void nohz_balance_exit_idle(int cpu) { if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { - cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); - atomic_dec(&nohz.nr_cpus); + /* + * Completely isolated CPUs don't ever set, so we must test. + */ + if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) { + cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); + atomic_dec(&nohz.nr_cpus); + } clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); } } @@ -6566,7 +7001,7 @@ static inline void set_cpu_sd_state_busy(void) goto unlock; sd->nohz_idle = 0; - atomic_inc(&sd->groups->sgp->nr_busy_cpus); + atomic_inc(&sd->groups->sgc->nr_busy_cpus); unlock: rcu_read_unlock(); } @@ -6583,7 +7018,7 @@ void set_cpu_sd_state_idle(void) goto unlock; sd->nohz_idle = 1; - atomic_dec(&sd->groups->sgp->nr_busy_cpus); + atomic_dec(&sd->groups->sgc->nr_busy_cpus); unlock: rcu_read_unlock(); } @@ -6603,6 +7038,12 @@ void nohz_balance_enter_idle(int cpu) if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) return; + /* + * If we're a completely isolated CPU, we don't play. + */ + if (on_null_domain(cpu_rq(cpu))) + return; + cpumask_set_cpu(cpu, nohz.idle_cpus_mask); atomic_inc(&nohz.nr_cpus); set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); @@ -6680,16 +7121,9 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) break; } - interval = sd->balance_interval; - if (idle != CPU_IDLE) - interval *= sd->busy_factor; - - /* scale ms to jiffies */ - interval = msecs_to_jiffies(interval); - interval = clamp(interval, 1UL, max_load_balance_interval); + interval = get_sd_balance_interval(sd, idle != CPU_IDLE); need_serialize = sd->flags & SD_SERIALIZE; - if (need_serialize) { if (!spin_trylock(&balancing)) goto out; @@ -6705,6 +7139,7 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE; } sd->last_balance = jiffies; + interval = get_sd_balance_interval(sd, idle != CPU_IDLE); } if (need_serialize) spin_unlock(&balancing); @@ -6762,12 +7197,17 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) rq = cpu_rq(balance_cpu); - raw_spin_lock_irq(&rq->lock); - update_rq_clock(rq); - update_idle_cpu_load(rq); - raw_spin_unlock_irq(&rq->lock); - - rebalance_domains(rq, CPU_IDLE); + /* + * If time for next balance is due, + * do the balance. + */ + if (time_after_eq(jiffies, rq->next_balance)) { + raw_spin_lock_irq(&rq->lock); + update_rq_clock(rq); + update_idle_cpu_load(rq); + raw_spin_unlock_irq(&rq->lock); + rebalance_domains(rq, CPU_IDLE); + } if (time_after(this_rq->next_balance, rq->next_balance)) this_rq->next_balance = rq->next_balance; @@ -6782,7 +7222,7 @@ end: * of an idle cpu is the system. * - This rq has more than one task. * - At any scheduler domain level, this cpu's scheduler group has multiple - * busy cpu's exceeding the group's power. + * busy cpu's exceeding the group's capacity. * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler * domain span are idle. */ @@ -6790,7 +7230,7 @@ static inline int nohz_kick_needed(struct rq *rq) { unsigned long now = jiffies; struct sched_domain *sd; - struct sched_group_power *sgp; + struct sched_group_capacity *sgc; int nr_busy, cpu = rq->cpu; if (unlikely(rq->idle_balance)) @@ -6820,8 +7260,8 @@ static inline int nohz_kick_needed(struct rq *rq) sd = rcu_dereference(per_cpu(sd_busy, cpu)); if (sd) { - sgp = sd->groups->sgp; - nr_busy = atomic_read(&sgp->nr_busy_cpus); + sgc = sd->groups->sgc; + nr_busy = atomic_read(&sgc->nr_busy_cpus); if (nr_busy > 1) goto need_kick_unlock; @@ -6865,11 +7305,6 @@ static void run_rebalance_domains(struct softirq_action *h) nohz_idle_balance(this_rq, idle); } -static inline int on_null_domain(struct rq *rq) -{ - return !rcu_dereference_sched(rq->sd); -} - /* * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. */ @@ -6999,15 +7434,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) struct cfs_rq *cfs_rq = cfs_rq_of(se); /* - * Ensure the task's vruntime is normalized, so that when its + * Ensure the task's vruntime is normalized, so that when it's * switched back to the fair class the enqueue_entity(.flags=0) will * do the right thing. * - * If it was on_rq, then the dequeue_entity(.flags=0) will already - * have normalized the vruntime, if it was !on_rq, then only when + * If it's on_rq, then the dequeue_entity(.flags=0) will already + * have normalized the vruntime, if it's !on_rq, then only when * the task is sleeping will it still have non-normalized vruntime. */ - if (!se->on_rq && p->state != TASK_RUNNING) { + if (!p->on_rq && p->state != TASK_RUNNING) { /* * Fix up our vruntime so that the current sleep doesn't * cause 'unlimited' sleep bonus. @@ -7034,7 +7469,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) */ static void switched_to_fair(struct rq *rq, struct task_struct *p) { - if (!p->se.on_rq) + struct sched_entity *se = &p->se; +#ifdef CONFIG_FAIR_GROUP_SCHED + /* + * Since the real-depth could have been changed (only FAIR + * class maintain depth value), reset depth properly. + */ + se->depth = se->parent ? se->parent->depth + 1 : 0; +#endif + if (!se->on_rq) return; /* @@ -7082,7 +7525,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) #ifdef CONFIG_FAIR_GROUP_SCHED static void task_move_group_fair(struct task_struct *p, int on_rq) { + struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq; + /* * If the task was not on the rq at the time of this cgroup movement * it must have been asleep, sleeping tasks keep their ->vruntime @@ -7108,23 +7553,24 @@ static void task_move_group_fair(struct task_struct *p, int on_rq) * To prevent boost or penalty in the new cfs_rq caused by delta * min_vruntime between the two cfs_rqs, we skip vruntime adjustment. */ - if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING)) + if (!on_rq && (!se->sum_exec_runtime || p->state == TASK_WAKING)) on_rq = 1; if (!on_rq) - p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime; + se->vruntime -= cfs_rq_of(se)->min_vruntime; set_task_rq(p, task_cpu(p)); + se->depth = se->parent ? se->parent->depth + 1 : 0; if (!on_rq) { - cfs_rq = cfs_rq_of(&p->se); - p->se.vruntime += cfs_rq->min_vruntime; + cfs_rq = cfs_rq_of(se); + se->vruntime += cfs_rq->min_vruntime; #ifdef CONFIG_SMP /* * migrate_task_rq_fair() will have removed our previous * contribution, but we must synchronize for ongoing future * decay. */ - p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter); - cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib; + se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); + cfs_rq->blocked_load_avg += se->avg.load_avg_contrib; #endif } } @@ -7220,10 +7666,13 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, if (!se) return; - if (!parent) + if (!parent) { se->cfs_rq = &rq->cfs; - else + se->depth = 0; + } else { se->cfs_rq = parent->my_q; + se->depth = parent->depth + 1; + } se->my_q = cfs_rq; /* guarantee group entities always have weight */ diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 5716929a2e3..90284d117fe 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -37,18 +37,18 @@ SCHED_FEAT(CACHE_HOT_BUDDY, true) SCHED_FEAT(WAKEUP_PREEMPTION, true) /* - * Use arch dependent cpu power functions + * Use arch dependent cpu capacity functions */ -SCHED_FEAT(ARCH_POWER, true) +SCHED_FEAT(ARCH_CAPACITY, true) SCHED_FEAT(HRTICK, false) SCHED_FEAT(DOUBLE_TICK, false) SCHED_FEAT(LB_BIAS, true) /* - * Decrement CPU power based on time not spent running tasks + * Decrement CPU capacity based on time not spent running tasks */ -SCHED_FEAT(NONTASK_POWER, true) +SCHED_FEAT(NONTASK_CAPACITY, true) /* * Queue remote wakeups on the target CPU and process them diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c new file mode 100644 index 00000000000..cf009fb0bc2 --- /dev/null +++ b/kernel/sched/idle.c @@ -0,0 +1,273 @@ +/* + * Generic entry point for the idle threads + */ +#include <linux/sched.h> +#include <linux/cpu.h> +#include <linux/cpuidle.h> +#include <linux/tick.h> +#include <linux/mm.h> +#include <linux/stackprotector.h> + +#include <asm/tlb.h> + +#include <trace/events/power.h> + +#include "sched.h" + +static int __read_mostly cpu_idle_force_poll; + +void cpu_idle_poll_ctrl(bool enable) +{ + if (enable) { + cpu_idle_force_poll++; + } else { + cpu_idle_force_poll--; + WARN_ON_ONCE(cpu_idle_force_poll < 0); + } +} + +#ifdef CONFIG_GENERIC_IDLE_POLL_SETUP +static int __init cpu_idle_poll_setup(char *__unused) +{ + cpu_idle_force_poll = 1; + return 1; +} +__setup("nohlt", cpu_idle_poll_setup); + +static int __init cpu_idle_nopoll_setup(char *__unused) +{ + cpu_idle_force_poll = 0; + return 1; +} +__setup("hlt", cpu_idle_nopoll_setup); +#endif + +static inline int cpu_idle_poll(void) +{ + rcu_idle_enter(); + trace_cpu_idle_rcuidle(0, smp_processor_id()); + local_irq_enable(); + while (!tif_need_resched()) + cpu_relax(); + trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); + rcu_idle_exit(); + return 1; +} + +/* Weak implementations for optional arch specific functions */ +void __weak arch_cpu_idle_prepare(void) { } +void __weak arch_cpu_idle_enter(void) { } +void __weak arch_cpu_idle_exit(void) { } +void __weak arch_cpu_idle_dead(void) { } +void __weak arch_cpu_idle(void) +{ + cpu_idle_force_poll = 1; + local_irq_enable(); +} + +/** + * cpuidle_idle_call - the main idle function + * + * NOTE: no locks or semaphores should be used here + * + * On archs that support TIF_POLLING_NRFLAG, is called with polling + * set, and it returns with polling set. If it ever stops polling, it + * must clear the polling bit. + */ +static void cpuidle_idle_call(void) +{ + struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices); + struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); + int next_state, entered_state; + bool broadcast; + + /* + * Check if the idle task must be rescheduled. If it is the + * case, exit the function after re-enabling the local irq. + */ + if (need_resched()) { + local_irq_enable(); + return; + } + + /* + * During the idle period, stop measuring the disabled irqs + * critical sections latencies + */ + stop_critical_timings(); + + /* + * Tell the RCU framework we are entering an idle section, + * so no more rcu read side critical sections and one more + * step to the grace period + */ + rcu_idle_enter(); + + /* + * Ask the cpuidle framework to choose a convenient idle state. + * Fall back to the default arch idle method on errors. + */ + next_state = cpuidle_select(drv, dev); + if (next_state < 0) { +use_default: + /* + * We can't use the cpuidle framework, let's use the default + * idle routine. + */ + if (current_clr_polling_and_test()) + local_irq_enable(); + else + arch_cpu_idle(); + + goto exit_idle; + } + + + /* + * The idle task must be scheduled, it is pointless to + * go to idle, just update no idle residency and get + * out of this function + */ + if (current_clr_polling_and_test()) { + dev->last_residency = 0; + entered_state = next_state; + local_irq_enable(); + goto exit_idle; + } + + broadcast = !!(drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP); + + /* + * Tell the time framework to switch to a broadcast timer + * because our local timer will be shutdown. If a local timer + * is used from another cpu as a broadcast timer, this call may + * fail if it is not available + */ + if (broadcast && + clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu)) + goto use_default; + + trace_cpu_idle_rcuidle(next_state, dev->cpu); + + /* + * Enter the idle state previously returned by the governor decision. + * This function will block until an interrupt occurs and will take + * care of re-enabling the local interrupts + */ + entered_state = cpuidle_enter(drv, dev, next_state); + + trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu); + + if (broadcast) + clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu); + + /* + * Give the governor an opportunity to reflect on the outcome + */ + cpuidle_reflect(dev, entered_state); + +exit_idle: + __current_set_polling(); + + /* + * It is up to the idle functions to reenable local interrupts + */ + if (WARN_ON_ONCE(irqs_disabled())) + local_irq_enable(); + + rcu_idle_exit(); + start_critical_timings(); +} + +/* + * Generic idle loop implementation + * + * Called with polling cleared. + */ +static void cpu_idle_loop(void) +{ + while (1) { + /* + * If the arch has a polling bit, we maintain an invariant: + * + * Our polling bit is clear if we're not scheduled (i.e. if + * rq->curr != rq->idle). This means that, if rq->idle has + * the polling bit set, then setting need_resched is + * guaranteed to cause the cpu to reschedule. + */ + + __current_set_polling(); + tick_nohz_idle_enter(); + + while (!need_resched()) { + check_pgt_cache(); + rmb(); + + if (cpu_is_offline(smp_processor_id())) + arch_cpu_idle_dead(); + + local_irq_disable(); + arch_cpu_idle_enter(); + + /* + * In poll mode we reenable interrupts and spin. + * + * Also if we detected in the wakeup from idle + * path that the tick broadcast device expired + * for us, we don't want to go deep idle as we + * know that the IPI is going to arrive right + * away + */ + if (cpu_idle_force_poll || tick_check_broadcast_expired()) + cpu_idle_poll(); + else + cpuidle_idle_call(); + + arch_cpu_idle_exit(); + } + + /* + * Since we fell out of the loop above, we know + * TIF_NEED_RESCHED must be set, propagate it into + * PREEMPT_NEED_RESCHED. + * + * This is required because for polling idle loops we will + * not have had an IPI to fold the state for us. + */ + preempt_set_need_resched(); + tick_nohz_idle_exit(); + __current_clr_polling(); + + /* + * We promise to call sched_ttwu_pending and reschedule + * if need_resched is set while polling is set. That + * means that clearing polling needs to be visible + * before doing these things. + */ + smp_mb__after_atomic(); + + sched_ttwu_pending(); + schedule_preempt_disabled(); + } +} + +void cpu_startup_entry(enum cpuhp_state state) +{ + /* + * This #ifdef needs to die, but it's too late in the cycle to + * make this generic (arm and sh have never invoked the canary + * init for the non boot cpus!). Will be fixed in 3.11 + */ +#ifdef CONFIG_X86 + /* + * If we're the non-boot CPU, nothing set the stack canary up + * for us. The boot CPU already has it initialized but no harm + * in doing it again. This is a good place for updating it, as + * we wont ever return from this function (so the invalid + * canaries already on the stack wont ever trigger). + */ + boot_init_stack_canary(); +#endif + arch_cpu_idle_prepare(); + cpu_idle_loop(); +} diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index 516c3d9ceea..879f2b75266 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -13,18 +13,8 @@ select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags) { return task_cpu(p); /* IDLE tasks as never migrated */ } - -static void pre_schedule_idle(struct rq *rq, struct task_struct *prev) -{ - idle_exit_fair(rq); - rq_last_tick_reset(rq); -} - -static void post_schedule_idle(struct rq *rq) -{ - idle_enter_fair(rq); -} #endif /* CONFIG_SMP */ + /* * Idle tasks are unconditionally rescheduled: */ @@ -33,13 +23,12 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl resched_task(rq->idle); } -static struct task_struct *pick_next_task_idle(struct rq *rq) +static struct task_struct * +pick_next_task_idle(struct rq *rq, struct task_struct *prev) { + put_prev_task(rq, prev); + schedstat_inc(rq, sched_goidle); -#ifdef CONFIG_SMP - /* Trigger the post schedule to do an idle_enter for CFS */ - rq->post_schedule = 1; -#endif return rq->idle; } @@ -58,6 +47,8 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags) static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) { + idle_exit_fair(rq); + rq_last_tick_reset(rq); } static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) @@ -101,8 +92,6 @@ const struct sched_class idle_sched_class = { #ifdef CONFIG_SMP .select_task_rq = select_task_rq_idle, - .pre_schedule = pre_schedule_idle, - .post_schedule = post_schedule_idle, #endif .set_curr_task = set_curr_task_idle, diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index a2740b775b4..a49083192c6 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -79,6 +79,8 @@ void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) rt_rq->overloaded = 0; plist_head_init(&rt_rq->pushable_tasks); #endif + /* We start is dequeued state, because no RT tasks are queued */ + rt_rq->rt_queued = 0; rt_rq->rt_time = 0; rt_rq->rt_throttled = 0; @@ -112,6 +114,13 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) return rt_se->rt_rq; } +static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se) +{ + struct rt_rq *rt_rq = rt_se->rt_rq; + + return rt_rq->rq; +} + void free_rt_sched_group(struct task_group *tg) { int i; @@ -211,10 +220,16 @@ static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) return container_of(rt_rq, struct rq, rt); } -static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) +static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se) { struct task_struct *p = rt_task_of(rt_se); - struct rq *rq = task_rq(p); + + return task_rq(p); +} + +static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) +{ + struct rq *rq = rq_of_rt_se(rt_se); return &rq->rt; } @@ -229,6 +244,14 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) #ifdef CONFIG_SMP +static int pull_rt_task(struct rq *this_rq); + +static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev) +{ + /* Try to pull RT tasks here if we lower this rq's prio */ + return rq->rt.highest_prio.curr > prev->prio; +} + static inline int rt_overloaded(struct rq *rq) { return atomic_read(&rq->rd->rto_count); @@ -315,6 +338,15 @@ static inline int has_pushable_tasks(struct rq *rq) return !plist_head_empty(&rq->rt.pushable_tasks); } +static inline void set_post_schedule(struct rq *rq) +{ + /* + * We detect this state here so that we can avoid taking the RQ + * lock again later if there is no need to push + */ + rq->post_schedule = has_pushable_tasks(rq); +} + static void enqueue_pushable_task(struct rq *rq, struct task_struct *p) { plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); @@ -359,8 +391,24 @@ void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) { } +static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev) +{ + return false; +} + +static inline int pull_rt_task(struct rq *this_rq) +{ + return 0; +} + +static inline void set_post_schedule(struct rq *rq) +{ +} #endif /* CONFIG_SMP */ +static void enqueue_top_rt_rq(struct rt_rq *rt_rq); +static void dequeue_top_rt_rq(struct rt_rq *rt_rq); + static inline int on_rt_rq(struct sched_rt_entity *rt_se) { return !list_empty(&rt_se->run_list); @@ -422,8 +470,11 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) rt_se = rt_rq->tg->rt_se[cpu]; if (rt_rq->rt_nr_running) { - if (rt_se && !on_rt_rq(rt_se)) + if (!rt_se) + enqueue_top_rt_rq(rt_rq); + else if (!on_rt_rq(rt_se)) enqueue_rt_entity(rt_se, false); + if (rt_rq->highest_prio.curr < curr->prio) resched_task(curr); } @@ -436,7 +487,9 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) rt_se = rt_rq->tg->rt_se[cpu]; - if (rt_se && on_rt_rq(rt_se)) + if (!rt_se) + dequeue_top_rt_rq(rt_rq); + else if (on_rt_rq(rt_se)) dequeue_rt_entity(rt_se); } @@ -507,12 +560,18 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq) { - if (rt_rq->rt_nr_running) - resched_task(rq_of_rt_rq(rt_rq)->curr); + struct rq *rq = rq_of_rt_rq(rt_rq); + + if (!rt_rq->rt_nr_running) + return; + + enqueue_top_rt_rq(rt_rq); + resched_task(rq->curr); } static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) { + dequeue_top_rt_rq(rt_rq); } static inline int rt_rq_throttled(struct rt_rq *rt_rq) @@ -538,6 +597,14 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) #endif /* CONFIG_RT_GROUP_SCHED */ +bool sched_rt_bandwidth_account(struct rt_rq *rt_rq) +{ + struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); + + return (hrtimer_active(&rt_b->rt_period_timer) || + rt_rq->rt_time < rt_b->rt_runtime); +} + #ifdef CONFIG_SMP /* * We ran out of runtime, see if we can borrow some from our neighbours. @@ -823,14 +890,8 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) * but accrue some time due to boosting. */ if (likely(rt_b->rt_runtime)) { - static bool once = false; - rt_rq->rt_throttled = 1; - - if (!once) { - once = true; - printk_sched("sched: RT throttling activated\n"); - } + printk_deferred_once("sched: RT throttling activated\n"); } else { /* * In case we did anyway, make it go away, @@ -857,7 +918,6 @@ static void update_curr_rt(struct rq *rq) { struct task_struct *curr = rq->curr; struct sched_rt_entity *rt_se = &curr->rt; - struct rt_rq *rt_rq = rt_rq_of_se(rt_se); u64 delta_exec; if (curr->sched_class != &rt_sched_class) @@ -882,7 +942,7 @@ static void update_curr_rt(struct rq *rq) return; for_each_sched_rt_entity(rt_se) { - rt_rq = rt_rq_of_se(rt_se); + struct rt_rq *rt_rq = rt_rq_of_se(rt_se); if (sched_rt_runtime(rt_rq) != RUNTIME_INF) { raw_spin_lock(&rt_rq->rt_runtime_lock); @@ -894,6 +954,38 @@ static void update_curr_rt(struct rq *rq) } } +static void +dequeue_top_rt_rq(struct rt_rq *rt_rq) +{ + struct rq *rq = rq_of_rt_rq(rt_rq); + + BUG_ON(&rq->rt != rt_rq); + + if (!rt_rq->rt_queued) + return; + + BUG_ON(!rq->nr_running); + + sub_nr_running(rq, rt_rq->rt_nr_running); + rt_rq->rt_queued = 0; +} + +static void +enqueue_top_rt_rq(struct rt_rq *rt_rq) +{ + struct rq *rq = rq_of_rt_rq(rt_rq); + + BUG_ON(&rq->rt != rt_rq); + + if (rt_rq->rt_queued) + return; + if (rt_rq_throttled(rt_rq) || !rt_rq->rt_nr_running) + return; + + add_nr_running(rq, rt_rq->rt_nr_running); + rt_rq->rt_queued = 1; +} + #if defined CONFIG_SMP static void @@ -1017,12 +1109,23 @@ void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {} #endif /* CONFIG_RT_GROUP_SCHED */ static inline +unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se) +{ + struct rt_rq *group_rq = group_rt_rq(rt_se); + + if (group_rq) + return group_rq->rt_nr_running; + else + return 1; +} + +static inline void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) { int prio = rt_se_prio(rt_se); WARN_ON(!rt_prio(prio)); - rt_rq->rt_nr_running++; + rt_rq->rt_nr_running += rt_se_nr_running(rt_se); inc_rt_prio(rt_rq, prio); inc_rt_migration(rt_se, rt_rq); @@ -1034,7 +1137,7 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) { WARN_ON(!rt_prio(rt_se_prio(rt_se))); WARN_ON(!rt_rq->rt_nr_running); - rt_rq->rt_nr_running--; + rt_rq->rt_nr_running -= rt_se_nr_running(rt_se); dec_rt_prio(rt_rq, rt_se_prio(rt_se)); dec_rt_migration(rt_se, rt_rq); @@ -1091,6 +1194,8 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se) back = rt_se; } + dequeue_top_rt_rq(rt_rq_of_se(back)); + for (rt_se = back; rt_se; rt_se = rt_se->back) { if (on_rt_rq(rt_se)) __dequeue_rt_entity(rt_se); @@ -1099,13 +1204,18 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se) static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) { + struct rq *rq = rq_of_rt_se(rt_se); + dequeue_rt_stack(rt_se); for_each_sched_rt_entity(rt_se) __enqueue_rt_entity(rt_se, head); + enqueue_top_rt_rq(&rq->rt); } static void dequeue_rt_entity(struct sched_rt_entity *rt_se) { + struct rq *rq = rq_of_rt_se(rt_se); + dequeue_rt_stack(rt_se); for_each_sched_rt_entity(rt_se) { @@ -1114,6 +1224,7 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se) if (rt_rq && rt_rq->rt_nr_running) __enqueue_rt_entity(rt_se, false); } + enqueue_top_rt_rq(&rq->rt); } /* @@ -1131,8 +1242,6 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) if (!task_current(rq, p) && p->nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); - - inc_nr_running(rq); } static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) @@ -1143,8 +1252,6 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) dequeue_rt_entity(rt_se); dequeue_pushable_task(rq, p); - - dec_nr_running(rq); } /* @@ -1310,15 +1417,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq) { struct sched_rt_entity *rt_se; struct task_struct *p; - struct rt_rq *rt_rq; - - rt_rq = &rq->rt; - - if (!rt_rq->rt_nr_running) - return NULL; - - if (rt_rq_throttled(rt_rq)) - return NULL; + struct rt_rq *rt_rq = &rq->rt; do { rt_se = pick_next_rt_entity(rq, rt_rq); @@ -1332,21 +1431,43 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq) return p; } -static struct task_struct *pick_next_task_rt(struct rq *rq) +static struct task_struct * +pick_next_task_rt(struct rq *rq, struct task_struct *prev) { - struct task_struct *p = _pick_next_task_rt(rq); + struct task_struct *p; + struct rt_rq *rt_rq = &rq->rt; + + if (need_pull_rt_task(rq, prev)) { + pull_rt_task(rq); + /* + * pull_rt_task() can drop (and re-acquire) rq->lock; this + * means a dl or stop task can slip in, in which case we need + * to re-start task selection. + */ + if (unlikely((rq->stop && rq->stop->on_rq) || + rq->dl.dl_nr_running)) + return RETRY_TASK; + } + + /* + * We may dequeue prev's rt_rq in put_prev_task(). + * So, we update time before rt_nr_running check. + */ + if (prev->sched_class == &rt_sched_class) + update_curr_rt(rq); + + if (!rt_rq->rt_queued) + return NULL; + + put_prev_task(rq, prev); + + p = _pick_next_task_rt(rq); /* The running task is never eligible for pushing */ if (p) dequeue_pushable_task(rq, p); -#ifdef CONFIG_SMP - /* - * We detect this state here so that we can avoid taking the RQ - * lock again later if there is no need to push - */ - rq->post_schedule = has_pushable_tasks(rq); -#endif + set_post_schedule(rq); return p; } @@ -1716,13 +1837,6 @@ skip: return ret; } -static void pre_schedule_rt(struct rq *rq, struct task_struct *prev) -{ - /* Try to pull RT tasks here if we lower this rq's prio */ - if (rq->rt.highest_prio.curr > prev->prio) - pull_rt_task(rq); -} - static void post_schedule_rt(struct rq *rq) { push_rt_tasks(rq); @@ -1825,7 +1939,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p) resched_task(rq->curr); } -void init_sched_rt_class(void) +void __init init_sched_rt_class(void) { unsigned int i; @@ -1854,9 +1968,9 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) */ if (p->on_rq && rq->curr != p) { #ifdef CONFIG_SMP - if (rq->rt.overloaded && push_rt_task(rq) && + if (p->nr_cpus_allowed > 1 && rq->rt.overloaded && /* Don't resched if we changed runqueues */ - rq != task_rq(p)) + push_rt_task(rq) && rq != task_rq(p)) check_resched = 0; #endif /* CONFIG_SMP */ if (check_resched && p->prio < rq->curr->prio) @@ -1999,7 +2113,6 @@ const struct sched_class rt_sched_class = { .set_cpus_allowed = set_cpus_allowed_rt, .rq_online = rq_online_rt, .rq_offline = rq_offline_rt, - .pre_schedule = pre_schedule_rt, .post_schedule = post_schedule_rt, .task_woken = task_woken_rt, .switched_from = switched_from_rt, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c2119fd20f8..31cc02ebc54 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -24,24 +24,6 @@ extern long calc_load_fold_active(struct rq *this_rq); extern void update_cpu_load_active(struct rq *this_rq); /* - * Convert user-nice values [ -20 ... 0 ... 19 ] - * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], - * and back. - */ -#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) -#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) -#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) - -/* - * 'User priority' is the nice value converted to something we - * can work with better when scaling various scheduler parameters, - * it's a [ 0 ... 39 ] range. - */ -#define USER_PRIO(p) ((p)-MAX_RT_PRIO) -#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) -#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) - -/* * Helpers for converting nanosecond timing to jiffy resolution */ #define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) @@ -296,7 +278,7 @@ extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b); extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b); -extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b); +extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force); extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq); extern void free_rt_sched_group(struct task_group *tg); @@ -427,6 +409,8 @@ struct rt_rq { int overloaded; struct plist_head pushable_tasks; #endif + int rt_queued; + int rt_throttled; u64 rt_time; u64 rt_runtime; @@ -462,7 +446,6 @@ struct dl_rq { } earliest_dl; unsigned long dl_nr_migratory; - unsigned long dl_nr_total; int overloaded; /* @@ -559,11 +542,9 @@ struct rq { #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this cpu: */ struct list_head leaf_cfs_rq_list; -#endif /* CONFIG_FAIR_GROUP_SCHED */ -#ifdef CONFIG_RT_GROUP_SCHED - struct list_head leaf_rt_rq_list; -#endif + struct sched_avg avg; +#endif /* CONFIG_FAIR_GROUP_SCHED */ /* * This is part of a global counter where only the total sum @@ -586,7 +567,7 @@ struct rq { struct root_domain *rd; struct sched_domain *sd; - unsigned long cpu_power; + unsigned long cpu_capacity; unsigned char idle_balance; /* For active balancing */ @@ -652,8 +633,6 @@ struct rq { #ifdef CONFIG_SMP struct llist_head wake_list; #endif - - struct sched_avg avg; }; static inline int cpu_of(struct rq *rq) @@ -691,6 +670,8 @@ extern int migrate_swap(struct task_struct *, struct task_struct *); #ifdef CONFIG_SMP +extern void sched_ttwu_pending(void); + #define rcu_dereference_check_sched_domain(p) \ rcu_dereference_check((p), \ lockdep_is_held(&sched_domains_mutex)) @@ -749,15 +730,15 @@ DECLARE_PER_CPU(struct sched_domain *, sd_numa); DECLARE_PER_CPU(struct sched_domain *, sd_busy); DECLARE_PER_CPU(struct sched_domain *, sd_asym); -struct sched_group_power { +struct sched_group_capacity { atomic_t ref; /* - * CPU power of this group, SCHED_LOAD_SCALE being max power for a - * single CPU. + * CPU capacity of this group, SCHED_LOAD_SCALE being max capacity + * for a single CPU. */ - unsigned int power, power_orig; + unsigned int capacity, capacity_orig; unsigned long next_update; - int imbalance; /* XXX unrelated to power but shared group state */ + int imbalance; /* XXX unrelated to capacity but shared group state */ /* * Number of busy cpus in this group. */ @@ -771,7 +752,7 @@ struct sched_group { atomic_t ref; unsigned int group_weight; - struct sched_group_power *sgp; + struct sched_group_capacity *sgc; /* * The CPUs this group covers. @@ -794,7 +775,7 @@ static inline struct cpumask *sched_group_cpus(struct sched_group *sg) */ static inline struct cpumask *sched_group_mask(struct sched_group *sg) { - return to_cpumask(sg->sgp->cpumask); + return to_cpumask(sg->sgc->cpumask); } /** @@ -808,6 +789,10 @@ static inline unsigned int group_first_cpu(struct sched_group *group) extern int group_balance_cpu(struct sched_group *sg); +#else + +static inline void sched_ttwu_pending(void) { } + #endif /* CONFIG_SMP */ #include "stats.h" @@ -1113,6 +1098,8 @@ static const u32 prio_to_wmult[40] = { #define DEQUEUE_SLEEP 1 +#define RETRY_TASK ((void *)-1UL) + struct sched_class { const struct sched_class *next; @@ -1123,14 +1110,22 @@ struct sched_class { void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags); - struct task_struct * (*pick_next_task) (struct rq *rq); + /* + * It is the responsibility of the pick_next_task() method that will + * return the next task to call put_prev_task() on the @prev task or + * something equivalent. + * + * May return RETRY_TASK when it finds a higher prio class has runnable + * tasks. + */ + struct task_struct * (*pick_next_task) (struct rq *rq, + struct task_struct *prev); void (*put_prev_task) (struct rq *rq, struct task_struct *p); #ifdef CONFIG_SMP int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); void (*migrate_task_rq)(struct task_struct *p, int next_cpu); - void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); void (*post_schedule) (struct rq *this_rq); void (*task_waking) (struct task_struct *task); void (*task_woken) (struct rq *this_rq, struct task_struct *task); @@ -1160,6 +1155,11 @@ struct sched_class { #endif }; +static inline void put_prev_task(struct rq *rq, struct task_struct *prev) +{ + prev->sched_class->put_prev_task(rq, prev); +} + #define sched_class_highest (&stop_sched_class) #define for_each_class(class) \ for (class = sched_class_highest; class; class = class->next) @@ -1173,19 +1173,17 @@ extern const struct sched_class idle_sched_class; #ifdef CONFIG_SMP -extern void update_group_power(struct sched_domain *sd, int cpu); +extern void update_group_capacity(struct sched_domain *sd, int cpu); extern void trigger_load_balance(struct rq *rq); -extern void idle_balance(int this_cpu, struct rq *this_rq); extern void idle_enter_fair(struct rq *this_rq); extern void idle_exit_fair(struct rq *this_rq); -#else /* CONFIG_SMP */ +#else -static inline void idle_balance(int cpu, struct rq *rq) -{ -} +static inline void idle_enter_fair(struct rq *rq) { } +static inline void idle_exit_fair(struct rq *rq) { } #endif @@ -1214,22 +1212,14 @@ extern void update_idle_cpu_load(struct rq *this_rq); extern void init_task_runnable_average(struct task_struct *p); -#ifdef CONFIG_PARAVIRT -static inline u64 steal_ticks(u64 steal) +static inline void add_nr_running(struct rq *rq, unsigned count) { - if (unlikely(steal > NSEC_PER_SEC)) - return div_u64(steal, TICK_NSEC); - - return __iter_div_u64_rem(steal, TICK_NSEC, &steal); -} -#endif + unsigned prev_nr = rq->nr_running; -static inline void inc_nr_running(struct rq *rq) -{ - rq->nr_running++; + rq->nr_running = prev_nr + count; #ifdef CONFIG_NO_HZ_FULL - if (rq->nr_running == 2) { + if (prev_nr < 2 && rq->nr_running >= 2) { if (tick_nohz_full_cpu(rq->cpu)) { /* Order rq->nr_running write against the IPI */ smp_wmb(); @@ -1239,9 +1229,9 @@ static inline void inc_nr_running(struct rq *rq) #endif } -static inline void dec_nr_running(struct rq *rq) +static inline void sub_nr_running(struct rq *rq, unsigned count) { - rq->nr_running--; + rq->nr_running -= count; } static inline void rq_last_tick_reset(struct rq *rq) @@ -1393,6 +1383,15 @@ static inline void double_lock(spinlock_t *l1, spinlock_t *l2) spin_lock_nested(l2, SINGLE_DEPTH_NESTING); } +static inline void double_lock_irq(spinlock_t *l1, spinlock_t *l2) +{ + if (l1 > l2) + swap(l1, l2); + + spin_lock_irq(l1); + spin_lock_nested(l2, SINGLE_DEPTH_NESTING); +} + static inline void double_raw_lock(raw_spinlock_t *l1, raw_spinlock_t *l2) { if (l1 > l2) diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index da98af347e8..a476bea17fb 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -142,4 +142,4 @@ static int __init proc_schedstat_init(void) proc_create("schedstat", 0, NULL, &proc_schedstat_operations); return 0; } -module_init(proc_schedstat_init); +subsys_initcall(proc_schedstat_init); diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index fdb6bb0b335..bfe0edadbfb 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -23,28 +23,31 @@ check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags) /* we're never preempted */ } -static struct task_struct *pick_next_task_stop(struct rq *rq) +static struct task_struct * +pick_next_task_stop(struct rq *rq, struct task_struct *prev) { struct task_struct *stop = rq->stop; - if (stop && stop->on_rq) { - stop->se.exec_start = rq_clock_task(rq); - return stop; - } + if (!stop || !stop->on_rq) + return NULL; - return NULL; + put_prev_task(rq, prev); + + stop->se.exec_start = rq_clock_task(rq); + + return stop; } static void enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags) { - inc_nr_running(rq); + add_nr_running(rq, 1); } static void dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags) { - dec_nr_running(rq); + sub_nr_running(rq, 1); } static void yield_task_stop(struct rq *rq) diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 7d50f794e24..0ffa20ae657 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -394,7 +394,7 @@ EXPORT_SYMBOL(__wake_up_bit); * * In order for this to function properly, as it uses waitqueue_active() * internally, some kind of memory barrier must be done prior to calling - * this. Typically, this will be smp_mb__after_clear_bit(), but in some + * this. Typically, this will be smp_mb__after_atomic(), but in some * cases where bitflags are manipulated non-atomically under a lock, one * may need to use a less regular barrier, such fs/inode.c's smp_mb(), * because spin_unlock() does not guarantee a memory barrier. diff --git a/kernel/seccomp.c b/kernel/seccomp.c index b7a10048a32..301bbc24739 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -39,7 +39,7 @@ * is only needed for handling filters shared across tasks. * @prev: points to a previously installed, or inherited, filter * @len: the number of instructions in the program - * @insns: the BPF program instructions to evaluate + * @insnsi: the BPF program instructions to evaluate * * seccomp_filter objects are organized in a tree linked via the @prev * pointer. For any task, it appears to be a singly-linked list starting @@ -54,61 +54,32 @@ struct seccomp_filter { atomic_t usage; struct seccomp_filter *prev; - unsigned short len; /* Instruction count */ - struct sock_filter insns[]; + struct sk_filter *prog; }; /* Limit any path through the tree to 256KB worth of instructions. */ #define MAX_INSNS_PER_PATH ((1 << 18) / sizeof(struct sock_filter)) -/** - * get_u32 - returns a u32 offset into data - * @data: a unsigned 64 bit value - * @index: 0 or 1 to return the first or second 32-bits - * - * This inline exists to hide the length of unsigned long. If a 32-bit - * unsigned long is passed in, it will be extended and the top 32-bits will be - * 0. If it is a 64-bit unsigned long, then whatever data is resident will be - * properly returned. - * +/* * Endianness is explicitly ignored and left for BPF program authors to manage * as per the specific architecture. */ -static inline u32 get_u32(u64 data, int index) +static void populate_seccomp_data(struct seccomp_data *sd) { - return ((u32 *)&data)[index]; -} - -/* Helper for bpf_load below. */ -#define BPF_DATA(_name) offsetof(struct seccomp_data, _name) -/** - * bpf_load: checks and returns a pointer to the requested offset - * @off: offset into struct seccomp_data to load from - * - * Returns the requested 32-bits of data. - * seccomp_check_filter() should assure that @off is 32-bit aligned - * and not out of bounds. Failure to do so is a BUG. - */ -u32 seccomp_bpf_load(int off) -{ - struct pt_regs *regs = task_pt_regs(current); - if (off == BPF_DATA(nr)) - return syscall_get_nr(current, regs); - if (off == BPF_DATA(arch)) - return syscall_get_arch(current, regs); - if (off >= BPF_DATA(args[0]) && off < BPF_DATA(args[6])) { - unsigned long value; - int arg = (off - BPF_DATA(args[0])) / sizeof(u64); - int index = !!(off % sizeof(u64)); - syscall_get_arguments(current, regs, arg, 1, &value); - return get_u32(value, index); - } - if (off == BPF_DATA(instruction_pointer)) - return get_u32(KSTK_EIP(current), 0); - if (off == BPF_DATA(instruction_pointer) + sizeof(u32)) - return get_u32(KSTK_EIP(current), 1); - /* seccomp_check_filter should make this impossible. */ - BUG(); + struct task_struct *task = current; + struct pt_regs *regs = task_pt_regs(task); + unsigned long args[6]; + + sd->nr = syscall_get_nr(task, regs); + sd->arch = syscall_get_arch(); + syscall_get_arguments(task, regs, 0, 6, args); + sd->args[0] = args[0]; + sd->args[1] = args[1]; + sd->args[2] = args[2]; + sd->args[3] = args[3]; + sd->args[4] = args[4]; + sd->args[5] = args[5]; + sd->instruction_pointer = KSTK_EIP(task); } /** @@ -132,59 +103,59 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) u32 k = ftest->k; switch (code) { - case BPF_S_LD_W_ABS: - ftest->code = BPF_S_ANC_SECCOMP_LD_W; + case BPF_LD | BPF_W | BPF_ABS: + ftest->code = BPF_LDX | BPF_W | BPF_ABS; /* 32-bit aligned and not out of bounds. */ if (k >= sizeof(struct seccomp_data) || k & 3) return -EINVAL; continue; - case BPF_S_LD_W_LEN: - ftest->code = BPF_S_LD_IMM; + case BPF_LD | BPF_W | BPF_LEN: + ftest->code = BPF_LD | BPF_IMM; ftest->k = sizeof(struct seccomp_data); continue; - case BPF_S_LDX_W_LEN: - ftest->code = BPF_S_LDX_IMM; + case BPF_LDX | BPF_W | BPF_LEN: + ftest->code = BPF_LDX | BPF_IMM; ftest->k = sizeof(struct seccomp_data); continue; /* Explicitly include allowed calls. */ - case BPF_S_RET_K: - case BPF_S_RET_A: - case BPF_S_ALU_ADD_K: - case BPF_S_ALU_ADD_X: - case BPF_S_ALU_SUB_K: - case BPF_S_ALU_SUB_X: - case BPF_S_ALU_MUL_K: - case BPF_S_ALU_MUL_X: - case BPF_S_ALU_DIV_X: - case BPF_S_ALU_AND_K: - case BPF_S_ALU_AND_X: - case BPF_S_ALU_OR_K: - case BPF_S_ALU_OR_X: - case BPF_S_ALU_XOR_K: - case BPF_S_ALU_XOR_X: - case BPF_S_ALU_LSH_K: - case BPF_S_ALU_LSH_X: - case BPF_S_ALU_RSH_K: - case BPF_S_ALU_RSH_X: - case BPF_S_ALU_NEG: - case BPF_S_LD_IMM: - case BPF_S_LDX_IMM: - case BPF_S_MISC_TAX: - case BPF_S_MISC_TXA: - case BPF_S_ALU_DIV_K: - case BPF_S_LD_MEM: - case BPF_S_LDX_MEM: - case BPF_S_ST: - case BPF_S_STX: - case BPF_S_JMP_JA: - case BPF_S_JMP_JEQ_K: - case BPF_S_JMP_JEQ_X: - case BPF_S_JMP_JGE_K: - case BPF_S_JMP_JGE_X: - case BPF_S_JMP_JGT_K: - case BPF_S_JMP_JGT_X: - case BPF_S_JMP_JSET_K: - case BPF_S_JMP_JSET_X: + case BPF_RET | BPF_K: + case BPF_RET | BPF_A: + case BPF_ALU | BPF_ADD | BPF_K: + case BPF_ALU | BPF_ADD | BPF_X: + case BPF_ALU | BPF_SUB | BPF_K: + case BPF_ALU | BPF_SUB | BPF_X: + case BPF_ALU | BPF_MUL | BPF_K: + case BPF_ALU | BPF_MUL | BPF_X: + case BPF_ALU | BPF_DIV | BPF_K: + case BPF_ALU | BPF_DIV | BPF_X: + case BPF_ALU | BPF_AND | BPF_K: + case BPF_ALU | BPF_AND | BPF_X: + case BPF_ALU | BPF_OR | BPF_K: + case BPF_ALU | BPF_OR | BPF_X: + case BPF_ALU | BPF_XOR | BPF_K: + case BPF_ALU | BPF_XOR | BPF_X: + case BPF_ALU | BPF_LSH | BPF_K: + case BPF_ALU | BPF_LSH | BPF_X: + case BPF_ALU | BPF_RSH | BPF_K: + case BPF_ALU | BPF_RSH | BPF_X: + case BPF_ALU | BPF_NEG: + case BPF_LD | BPF_IMM: + case BPF_LDX | BPF_IMM: + case BPF_MISC | BPF_TAX: + case BPF_MISC | BPF_TXA: + case BPF_LD | BPF_MEM: + case BPF_LDX | BPF_MEM: + case BPF_ST: + case BPF_STX: + case BPF_JMP | BPF_JA: + case BPF_JMP | BPF_JEQ | BPF_K: + case BPF_JMP | BPF_JEQ | BPF_X: + case BPF_JMP | BPF_JGE | BPF_K: + case BPF_JMP | BPF_JGE | BPF_X: + case BPF_JMP | BPF_JGT | BPF_K: + case BPF_JMP | BPF_JGT | BPF_X: + case BPF_JMP | BPF_JSET | BPF_K: + case BPF_JMP | BPF_JSET | BPF_X: continue; default: return -EINVAL; @@ -202,18 +173,22 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) static u32 seccomp_run_filters(int syscall) { struct seccomp_filter *f; + struct seccomp_data sd; u32 ret = SECCOMP_RET_ALLOW; /* Ensure unexpected behavior doesn't result in failing open. */ if (WARN_ON(current->seccomp.filter == NULL)) return SECCOMP_RET_KILL; + populate_seccomp_data(&sd); + /* * All filters in the list are evaluated and the lowest BPF return * value always takes priority (ignoring the DATA). */ for (f = current->seccomp.filter; f; f = f->prev) { - u32 cur_ret = sk_run_filter(NULL, f->insns); + u32 cur_ret = SK_RUN_FILTER(f->prog, (void *)&sd); + if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION)) ret = cur_ret; } @@ -231,18 +206,20 @@ static long seccomp_attach_filter(struct sock_fprog *fprog) struct seccomp_filter *filter; unsigned long fp_size = fprog->len * sizeof(struct sock_filter); unsigned long total_insns = fprog->len; + struct sock_filter *fp; + int new_len; long ret; if (fprog->len == 0 || fprog->len > BPF_MAXINSNS) return -EINVAL; for (filter = current->seccomp.filter; filter; filter = filter->prev) - total_insns += filter->len + 4; /* include a 4 instr penalty */ + total_insns += filter->prog->len + 4; /* include a 4 instr penalty */ if (total_insns > MAX_INSNS_PER_PATH) return -ENOMEM; /* - * Installing a seccomp filter requires that the task have + * Installing a seccomp filter requires that the task has * CAP_SYS_ADMIN in its namespace or be running with no_new_privs. * This avoids scenarios where unprivileged tasks can affect the * behavior of privileged children. @@ -252,28 +229,51 @@ static long seccomp_attach_filter(struct sock_fprog *fprog) CAP_SYS_ADMIN) != 0) return -EACCES; - /* Allocate a new seccomp_filter */ - filter = kzalloc(sizeof(struct seccomp_filter) + fp_size, - GFP_KERNEL|__GFP_NOWARN); - if (!filter) + fp = kzalloc(fp_size, GFP_KERNEL|__GFP_NOWARN); + if (!fp) return -ENOMEM; - atomic_set(&filter->usage, 1); - filter->len = fprog->len; /* Copy the instructions from fprog. */ ret = -EFAULT; - if (copy_from_user(filter->insns, fprog->filter, fp_size)) - goto fail; + if (copy_from_user(fp, fprog->filter, fp_size)) + goto free_prog; /* Check and rewrite the fprog via the skb checker */ - ret = sk_chk_filter(filter->insns, filter->len); + ret = sk_chk_filter(fp, fprog->len); if (ret) - goto fail; + goto free_prog; /* Check and rewrite the fprog for seccomp use */ - ret = seccomp_check_filter(filter->insns, filter->len); + ret = seccomp_check_filter(fp, fprog->len); + if (ret) + goto free_prog; + + /* Convert 'sock_filter' insns to 'sock_filter_int' insns */ + ret = sk_convert_filter(fp, fprog->len, NULL, &new_len); if (ret) - goto fail; + goto free_prog; + + /* Allocate a new seccomp_filter */ + ret = -ENOMEM; + filter = kzalloc(sizeof(struct seccomp_filter), + GFP_KERNEL|__GFP_NOWARN); + if (!filter) + goto free_prog; + + filter->prog = kzalloc(sk_filter_size(new_len), + GFP_KERNEL|__GFP_NOWARN); + if (!filter->prog) + goto free_filter; + + ret = sk_convert_filter(fp, fprog->len, filter->prog->insnsi, &new_len); + if (ret) + goto free_filter_prog; + kfree(fp); + + atomic_set(&filter->usage, 1); + filter->prog->len = new_len; + + sk_filter_select_runtime(filter->prog); /* * If there is an existing filter, make it the prev and don't drop its @@ -282,8 +282,13 @@ static long seccomp_attach_filter(struct sock_fprog *fprog) filter->prev = current->seccomp.filter; current->seccomp.filter = filter; return 0; -fail: + +free_filter_prog: + kfree(filter->prog); +free_filter: kfree(filter); +free_prog: + kfree(fp); return ret; } @@ -293,7 +298,7 @@ fail: * * Returns 0 on success and non-zero otherwise. */ -long seccomp_attach_user_filter(char __user *user_filter) +static long seccomp_attach_user_filter(char __user *user_filter) { struct sock_fprog fprog; long ret = -EFAULT; @@ -332,6 +337,7 @@ void put_seccomp_filter(struct task_struct *tsk) while (orig && atomic_dec_and_test(&orig->usage)) { struct seccomp_filter *freeme = orig; orig = orig->prev; + sk_filter_free(freeme->prog); kfree(freeme); } } @@ -351,7 +357,7 @@ static void seccomp_send_sigsys(int syscall, int reason) info.si_code = SYS_SECCOMP; info.si_call_addr = (void __user *)KSTK_EIP(current); info.si_errno = reason; - info.si_arch = syscall_get_arch(current, task_pt_regs(current)); + info.si_arch = syscall_get_arch(); info.si_syscall = syscall; force_sig_info(SIGSYS, &info, current); } diff --git a/kernel/signal.c b/kernel/signal.c index 52f881db1ca..a4077e90f19 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -33,6 +33,8 @@ #include <linux/uprobes.h> #include <linux/compat.h> #include <linux/cn_proc.h> +#include <linux/compiler.h> + #define CREATE_TRACE_POINTS #include <trace/events/signal.h> @@ -275,6 +277,7 @@ void task_clear_jobctl_trapping(struct task_struct *task) { if (unlikely(task->jobctl & JOBCTL_TRAPPING)) { task->jobctl &= ~JOBCTL_TRAPPING; + smp_mb(); /* advised by wake_up_bit() */ wake_up_bit(&task->jobctl, JOBCTL_TRAPPING_BIT); } } @@ -703,11 +706,8 @@ void signal_wake_up_state(struct task_struct *t, unsigned int state) * Returns 1 if any signals were found. * * All callers must be holding the siglock. - * - * This version takes a sigset mask and looks at all signals, - * not just those in the first mask word. */ -static int rm_from_queue_full(sigset_t *mask, struct sigpending *s) +static int flush_sigqueue_mask(sigset_t *mask, struct sigpending *s) { struct sigqueue *q, *n; sigset_t m; @@ -725,29 +725,6 @@ static int rm_from_queue_full(sigset_t *mask, struct sigpending *s) } return 1; } -/* - * Remove signals in mask from the pending set and queue. - * Returns 1 if any signals were found. - * - * All callers must be holding the siglock. - */ -static int rm_from_queue(unsigned long mask, struct sigpending *s) -{ - struct sigqueue *q, *n; - - if (!sigtestsetmask(&s->signal, mask)) - return 0; - - sigdelsetmask(&s->signal, mask); - list_for_each_entry_safe(q, n, &s->list, list) { - if (q->info.si_signo < SIGRTMIN && - (mask & sigmask(q->info.si_signo))) { - list_del_init(&q->list); - __sigqueue_free(q); - } - } - return 1; -} static inline int is_si_special(const struct siginfo *info) { @@ -859,6 +836,7 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force) { struct signal_struct *signal = p->signal; struct task_struct *t; + sigset_t flush; if (signal->flags & (SIGNAL_GROUP_EXIT | SIGNAL_GROUP_COREDUMP)) { if (signal->flags & SIGNAL_GROUP_COREDUMP) @@ -870,26 +848,25 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force) /* * This is a stop signal. Remove SIGCONT from all queues. */ - rm_from_queue(sigmask(SIGCONT), &signal->shared_pending); - t = p; - do { - rm_from_queue(sigmask(SIGCONT), &t->pending); - } while_each_thread(p, t); + siginitset(&flush, sigmask(SIGCONT)); + flush_sigqueue_mask(&flush, &signal->shared_pending); + for_each_thread(p, t) + flush_sigqueue_mask(&flush, &t->pending); } else if (sig == SIGCONT) { unsigned int why; /* * Remove all stop signals from all queues, wake all threads. */ - rm_from_queue(SIG_KERNEL_STOP_MASK, &signal->shared_pending); - t = p; - do { + siginitset(&flush, SIG_KERNEL_STOP_MASK); + flush_sigqueue_mask(&flush, &signal->shared_pending); + for_each_thread(p, t) { + flush_sigqueue_mask(&flush, &t->pending); task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING); - rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending); if (likely(!(t->ptrace & PT_SEIZED))) wake_up_state(t, __TASK_STOPPED); else ptrace_trap_notify(t); - } while_each_thread(p, t); + } /* * Notify the parent with CLD_CONTINUED if we were stopped. @@ -2382,7 +2359,7 @@ relock: * @regs: user register state * @stepping: nonzero if debugger single-step or block-step in use * - * This function should be called when a signal has succesfully been + * This function should be called when a signal has successfully been * delivered. It updates the blocked signals accordingly (@ka->sa.sa_mask * is always blocked, and the signal itself is blocked unless %SA_NODEFER * is set in @ka->sa.sa_flags. Tracing is notified. @@ -2852,7 +2829,7 @@ int do_sigtimedwait(const sigset_t *which, siginfo_t *info, spin_lock_irq(&tsk->sighand->siglock); __set_task_blocked(tsk, &tsk->real_blocked); - siginitset(&tsk->real_blocked, 0); + sigemptyset(&tsk->real_blocked); sig = dequeue_signal(tsk, &mask, info); } spin_unlock_irq(&tsk->sighand->siglock); @@ -3089,18 +3066,39 @@ COMPAT_SYSCALL_DEFINE4(rt_tgsigqueueinfo, } #endif +/* + * For kthreads only, must not be used if cloned with CLONE_SIGHAND + */ +void kernel_sigaction(int sig, __sighandler_t action) +{ + spin_lock_irq(¤t->sighand->siglock); + current->sighand->action[sig - 1].sa.sa_handler = action; + if (action == SIG_IGN) { + sigset_t mask; + + sigemptyset(&mask); + sigaddset(&mask, sig); + + flush_sigqueue_mask(&mask, ¤t->signal->shared_pending); + flush_sigqueue_mask(&mask, ¤t->pending); + recalc_sigpending(); + } + spin_unlock_irq(¤t->sighand->siglock); +} +EXPORT_SYMBOL(kernel_sigaction); + int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) { - struct task_struct *t = current; + struct task_struct *p = current, *t; struct k_sigaction *k; sigset_t mask; if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig))) return -EINVAL; - k = &t->sighand->action[sig-1]; + k = &p->sighand->action[sig-1]; - spin_lock_irq(¤t->sighand->siglock); + spin_lock_irq(&p->sighand->siglock); if (oact) *oact = *k; @@ -3119,21 +3117,20 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) * (for example, SIGCHLD), shall cause the pending signal to * be discarded, whether or not it is blocked" */ - if (sig_handler_ignored(sig_handler(t, sig), sig)) { + if (sig_handler_ignored(sig_handler(p, sig), sig)) { sigemptyset(&mask); sigaddset(&mask, sig); - rm_from_queue_full(&mask, &t->signal->shared_pending); - do { - rm_from_queue_full(&mask, &t->pending); - } while_each_thread(current, t); + flush_sigqueue_mask(&mask, &p->signal->shared_pending); + for_each_thread(p, t) + flush_sigqueue_mask(&mask, &t->pending); } } - spin_unlock_irq(¤t->sighand->siglock); + spin_unlock_irq(&p->sighand->siglock); return 0; } -static int +static int do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long sp) { stack_t oss; @@ -3494,7 +3491,7 @@ COMPAT_SYSCALL_DEFINE3(sigaction, int, sig, } #endif -#ifdef __ARCH_WANT_SYS_SGETMASK +#ifdef CONFIG_SGETMASK_SYSCALL /* * For backwards compatibility. Functionality superseded by sigprocmask. @@ -3515,7 +3512,7 @@ SYSCALL_DEFINE1(ssetmask, int, newmask) return old; } -#endif /* __ARCH_WANT_SGETMASK */ +#endif /* CONFIG_SGETMASK_SYSCALL */ #ifdef __ARCH_WANT_SYS_SIGNAL /* @@ -3618,7 +3615,7 @@ SYSCALL_DEFINE3(sigsuspend, int, unused1, int, unused2, old_sigset_t, mask) } #endif -__attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma) +__weak const char *arch_vma_name(struct vm_area_struct *vma) { return NULL; } diff --git a/kernel/smp.c b/kernel/smp.c index ffee35bef17..80c33f8de14 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -29,6 +29,8 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data); static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue); +static void flush_smp_call_function_queue(bool warn_cpu_offline); + static int hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) { @@ -51,12 +53,27 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) #ifdef CONFIG_HOTPLUG_CPU case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: + /* Fall-through to the CPU_DEAD[_FROZEN] case. */ case CPU_DEAD: case CPU_DEAD_FROZEN: free_cpumask_var(cfd->cpumask); free_percpu(cfd->csd); break; + + case CPU_DYING: + case CPU_DYING_FROZEN: + /* + * The IPIs for the smp-call-function callbacks queued by other + * CPUs might arrive late, either due to hardware latencies or + * because this CPU disabled interrupts (inside stop-machine) + * before the IPIs were sent. So flush out any pending callbacks + * explicitly (without waiting for the IPIs to arrive), to + * ensure that the outgoing CPU doesn't go offline with work + * still pending. + */ + flush_smp_call_function_queue(false); + break; #endif }; @@ -117,13 +134,43 @@ static void csd_unlock(struct call_single_data *csd) csd->flags &= ~CSD_FLAG_LOCK; } +static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data); + /* * Insert a previously allocated call_single_data element * for execution on the given CPU. data must already have * ->func, ->info, and ->flags set. */ -static void generic_exec_single(int cpu, struct call_single_data *csd, int wait) +static int generic_exec_single(int cpu, struct call_single_data *csd, + smp_call_func_t func, void *info, int wait) { + struct call_single_data csd_stack = { .flags = 0 }; + unsigned long flags; + + + if (cpu == smp_processor_id()) { + local_irq_save(flags); + func(info); + local_irq_restore(flags); + return 0; + } + + + if ((unsigned)cpu >= nr_cpu_ids || !cpu_online(cpu)) + return -ENXIO; + + + if (!csd) { + csd = &csd_stack; + if (!wait) + csd = &__get_cpu_var(csd_data); + } + + csd_lock(csd); + + csd->func = func; + csd->info = info; + if (wait) csd->flags |= CSD_FLAG_WAIT; @@ -143,39 +190,69 @@ static void generic_exec_single(int cpu, struct call_single_data *csd, int wait) if (wait) csd_lock_wait(csd); + + return 0; } -/* - * Invoked by arch to handle an IPI for call function single. Must be - * called from the arch with interrupts disabled. +/** + * generic_smp_call_function_single_interrupt - Execute SMP IPI callbacks + * + * Invoked by arch to handle an IPI for call function single. + * Must be called with interrupts disabled. */ void generic_smp_call_function_single_interrupt(void) { - struct llist_node *entry, *next; + flush_smp_call_function_queue(true); +} - /* - * Shouldn't receive this interrupt on a cpu that is not yet online. - */ - WARN_ON_ONCE(!cpu_online(smp_processor_id())); +/** + * flush_smp_call_function_queue - Flush pending smp-call-function callbacks + * + * @warn_cpu_offline: If set to 'true', warn if callbacks were queued on an + * offline CPU. Skip this check if set to 'false'. + * + * Flush any pending smp-call-function callbacks queued on this CPU. This is + * invoked by the generic IPI handler, as well as by a CPU about to go offline, + * to ensure that all pending IPI callbacks are run before it goes completely + * offline. + * + * Loop through the call_single_queue and run all the queued callbacks. + * Must be called with interrupts disabled. + */ +static void flush_smp_call_function_queue(bool warn_cpu_offline) +{ + struct llist_head *head; + struct llist_node *entry; + struct call_single_data *csd, *csd_next; + static bool warned; - entry = llist_del_all(&__get_cpu_var(call_single_queue)); + WARN_ON(!irqs_disabled()); + + head = &__get_cpu_var(call_single_queue); + entry = llist_del_all(head); entry = llist_reverse_order(entry); - while (entry) { - struct call_single_data *csd; + /* There shouldn't be any pending callbacks on an offline CPU. */ + if (unlikely(warn_cpu_offline && !cpu_online(smp_processor_id()) && + !warned && !llist_empty(head))) { + warned = true; + WARN(1, "IPI on offline CPU %d\n", smp_processor_id()); - next = entry->next; + /* + * We don't have to use the _safe() variant here + * because we are not invoking the IPI handlers yet. + */ + llist_for_each_entry(csd, entry, llist) + pr_warn("IPI callback %pS sent to offline CPU\n", + csd->func); + } - csd = llist_entry(entry, struct call_single_data, llist); + llist_for_each_entry_safe(csd, csd_next, entry, llist) { csd->func(csd->info); csd_unlock(csd); - - entry = next; } } -static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data); - /* * smp_call_function_single - Run a function on a specific CPU * @func: The function to run. This must be fast and non-blocking. @@ -187,12 +264,8 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data); int smp_call_function_single(int cpu, smp_call_func_t func, void *info, int wait) { - struct call_single_data d = { - .flags = 0, - }; - unsigned long flags; int this_cpu; - int err = 0; + int err; /* * prevent preemption and reschedule on another processor, @@ -209,32 +282,41 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info, WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() && !oops_in_progress); - if (cpu == this_cpu) { - local_irq_save(flags); - func(info); - local_irq_restore(flags); - } else { - if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) { - struct call_single_data *csd = &d; + err = generic_exec_single(cpu, NULL, func, info, wait); - if (!wait) - csd = &__get_cpu_var(csd_data); + put_cpu(); - csd_lock(csd); + return err; +} +EXPORT_SYMBOL(smp_call_function_single); - csd->func = func; - csd->info = info; - generic_exec_single(cpu, csd, wait); - } else { - err = -ENXIO; /* CPU not online */ - } - } +/** + * smp_call_function_single_async(): Run an asynchronous function on a + * specific CPU. + * @cpu: The CPU to run on. + * @csd: Pre-allocated and setup data structure + * + * Like smp_call_function_single(), but the call is asynchonous and + * can thus be done from contexts with disabled interrupts. + * + * The caller passes his own pre-allocated data structure + * (ie: embedded in an object) and is responsible for synchronizing it + * such that the IPIs performed on the @csd are strictly serialized. + * + * NOTE: Be careful, there is unfortunately no current debugging facility to + * validate the correctness of this serialization. + */ +int smp_call_function_single_async(int cpu, struct call_single_data *csd) +{ + int err = 0; - put_cpu(); + preempt_disable(); + err = generic_exec_single(cpu, csd, csd->func, csd->info, 0); + preempt_enable(); return err; } -EXPORT_SYMBOL(smp_call_function_single); +EXPORT_SYMBOL_GPL(smp_call_function_single_async); /* * smp_call_function_any - Run a function on any of the given cpus @@ -280,44 +362,6 @@ call: EXPORT_SYMBOL_GPL(smp_call_function_any); /** - * __smp_call_function_single(): Run a function on a specific CPU - * @cpu: The CPU to run on. - * @data: Pre-allocated and setup data structure - * @wait: If true, wait until function has completed on specified CPU. - * - * Like smp_call_function_single(), but allow caller to pass in a - * pre-allocated data structure. Useful for embedding @data inside - * other structures, for instance. - */ -void __smp_call_function_single(int cpu, struct call_single_data *csd, - int wait) -{ - unsigned int this_cpu; - unsigned long flags; - - this_cpu = get_cpu(); - /* - * Can deadlock when called with interrupts disabled. - * We allow cpu's that are not yet online though, as no one else can - * send smp call function interrupt to this cpu and as such deadlocks - * can't happen. - */ - WARN_ON_ONCE(cpu_online(smp_processor_id()) && wait && irqs_disabled() - && !oops_in_progress); - - if (cpu == this_cpu) { - local_irq_save(flags); - csd->func(csd->info); - local_irq_restore(flags); - } else { - csd_lock(csd); - generic_exec_single(cpu, csd, wait); - } - put_cpu(); -} -EXPORT_SYMBOL_GPL(__smp_call_function_single); - -/** * smp_call_function_many(): Run a function on a set of other CPUs. * @mask: The set of cpus to run on (only runs on online subset). * @func: The function to run. This must be fast and non-blocking. diff --git a/kernel/softirq.c b/kernel/softirq.c index 490fcbb1dc5..5918d227730 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -25,6 +25,7 @@ #include <linux/smp.h> #include <linux/smpboot.h> #include <linux/tick.h> +#include <linux/irq.h> #define CREATE_TRACE_POINTS #include <trace/events/irq.h> @@ -222,7 +223,7 @@ static inline bool lockdep_softirq_start(void) { return false; } static inline void lockdep_softirq_end(bool in_hardirq) { } #endif -asmlinkage void __do_softirq(void) +asmlinkage __visible void __do_softirq(void) { unsigned long end = jiffies + MAX_SOFTIRQ_TIME; unsigned long old_flags = current->flags; @@ -231,7 +232,6 @@ asmlinkage void __do_softirq(void) bool in_hardirq; __u32 pending; int softirq_bit; - int cpu; /* * Mask out PF_MEMALLOC s current task context is borrowed for the @@ -246,7 +246,6 @@ asmlinkage void __do_softirq(void) __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET); in_hardirq = lockdep_softirq_start(); - cpu = smp_processor_id(); restart: /* Reset the pending bitmask before enabling irqs */ set_softirq_pending(0); @@ -275,11 +274,11 @@ restart: prev_count, preempt_count()); preempt_count_set(prev_count); } - rcu_bh_qs(cpu); h++; pending >>= softirq_bit; } + rcu_bh_qs(smp_processor_id()); local_irq_disable(); pending = local_softirq_pending(); @@ -298,7 +297,7 @@ restart: tsk_restore_flags(current, old_flags, PF_MEMALLOC); } -asmlinkage void do_softirq(void) +asmlinkage __visible void do_softirq(void) { __u32 pending; unsigned long flags; @@ -778,3 +777,8 @@ int __init __weak arch_early_irq_init(void) { return 0; } + +unsigned int __weak arch_dynirq_lower_bound(unsigned int from) +{ + return from; +} diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 84571e09c90..695f0c6cd16 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -293,7 +293,7 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void * */ smp_call_function_single(min(cpu1, cpu2), &irq_cpu_stop_queue_work, - &call_args, 0); + &call_args, 1); lg_local_unlock(&stop_cpus_lock); preempt_enable(); @@ -307,6 +307,7 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void * * @cpu: cpu to stop * @fn: function to execute * @arg: argument to @fn + * @work_buf: pointer to cpu_stop_work structure * * Similar to stop_one_cpu() but doesn't wait for completion. The * caller is responsible for ensuring @work_buf is currently unused diff --git a/kernel/sys.c b/kernel/sys.c index c0a58be780a..66a751ebf9d 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -174,10 +174,10 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval) /* normalize: avoid signed division (rounding problems) */ error = -ESRCH; - if (niceval < -20) - niceval = -20; - if (niceval > 19) - niceval = 19; + if (niceval < MIN_NICE) + niceval = MIN_NICE; + if (niceval > MAX_NICE) + niceval = MAX_NICE; rcu_read_lock(); read_lock(&tasklist_lock); @@ -250,7 +250,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) else p = current; if (p) { - niceval = 20 - task_nice(p); + niceval = nice_to_rlimit(task_nice(p)); if (niceval > retval) retval = niceval; } @@ -261,7 +261,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) else pgrp = task_pgrp(current); do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { - niceval = 20 - task_nice(p); + niceval = nice_to_rlimit(task_nice(p)); if (niceval > retval) retval = niceval; } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); @@ -277,7 +277,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) do_each_thread(g, p) { if (uid_eq(task_uid(p), uid)) { - niceval = 20 - task_nice(p); + niceval = nice_to_rlimit(task_nice(p)); if (niceval > retval) retval = niceval; } @@ -1996,6 +1996,21 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, if (arg2 || arg3 || arg4 || arg5) return -EINVAL; return current->no_new_privs ? 1 : 0; + case PR_GET_THP_DISABLE: + if (arg2 || arg3 || arg4 || arg5) + return -EINVAL; + error = !!(me->mm->def_flags & VM_NOHUGEPAGE); + break; + case PR_SET_THP_DISABLE: + if (arg3 || arg4 || arg5) + return -EINVAL; + down_write(&me->mm->mmap_sem); + if (arg2) + me->mm->def_flags |= VM_NOHUGEPAGE; + else + me->mm->def_flags &= ~VM_NOHUGEPAGE; + up_write(&me->mm->mmap_sem); + break; default: error = -EINVAL; break; diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 7078052284f..36441b51b5d 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -135,6 +135,8 @@ cond_syscall(sys_setresgid16); cond_syscall(sys_setresuid16); cond_syscall(sys_setreuid16); cond_syscall(sys_setuid16); +cond_syscall(sys_sgetmask); +cond_syscall(sys_ssetmask); cond_syscall(sys_vm86old); cond_syscall(sys_vm86); cond_syscall(sys_ipc); @@ -146,11 +148,13 @@ cond_syscall(sys_io_destroy); cond_syscall(sys_io_submit); cond_syscall(sys_io_cancel); cond_syscall(sys_io_getevents); +cond_syscall(sys_sysfs); cond_syscall(sys_syslog); cond_syscall(sys_process_vm_readv); cond_syscall(sys_process_vm_writev); cond_syscall(compat_sys_process_vm_readv); cond_syscall(compat_sys_process_vm_writev); +cond_syscall(sys_uselib); /* arch-specific weak syscall entries */ cond_syscall(sys_pciconfig_read); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 49e13e1f8fe..75b22e22a72 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -112,9 +112,6 @@ extern int sysctl_nr_open_min, sysctl_nr_open_max; #ifndef CONFIG_MMU extern int sysctl_nr_trim_pages; #endif -#ifdef CONFIG_BLOCK -extern int blk_iopoll_enabled; -#endif /* Constants used for minimum and maximum */ #ifdef CONFIG_LOCKUP_DETECTOR @@ -126,7 +123,7 @@ static int __maybe_unused neg_one = -1; static int zero; static int __maybe_unused one = 1; static int __maybe_unused two = 2; -static int __maybe_unused three = 3; +static int __maybe_unused four = 4; static unsigned long one_ul = 1; static int one_hundred = 100; #ifdef CONFIG_PRINTK @@ -139,21 +136,21 @@ static unsigned long dirty_bytes_min = 2 * PAGE_SIZE; /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ static int maxolduid = 65535; static int minolduid; -static int min_percpu_pagelist_fract = 8; static int ngroups_max = NGROUPS_MAX; static const int cap_last_cap = CAP_LAST_CAP; +/*this is needed for proc_doulongvec_minmax of sysctl_hung_task_timeout_secs */ +#ifdef CONFIG_DETECT_HUNG_TASK +static unsigned long hung_task_timeout_max = (LONG_MAX/HZ); +#endif + #ifdef CONFIG_INOTIFY_USER #include <linux/inotify.h> #endif #ifdef CONFIG_SPARC #endif -#ifdef CONFIG_SPARC64 -extern int sysctl_tsb_ratio; -#endif - #ifdef __hppa__ extern int pwrsw_enabled; #endif @@ -171,6 +168,13 @@ extern int no_unaligned_warning; #endif #ifdef CONFIG_PROC_SYSCTL + +#define SYSCTL_WRITES_LEGACY -1 +#define SYSCTL_WRITES_WARN 0 +#define SYSCTL_WRITES_STRICT 1 + +static int sysctl_writes_strict = SYSCTL_WRITES_WARN; + static int proc_do_cad_pid(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); static int proc_taint(struct ctl_table *table, int write, @@ -193,7 +197,7 @@ static int proc_dostring_coredump(struct ctl_table *table, int write, /* Note: sysrq code uses it's own private copy */ static int __sysrq_enabled = CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE; -static int sysrq_sysctl_handler(ctl_table *table, int write, +static int sysrq_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -386,13 +390,6 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, { - .procname = "numa_balancing_migrate_deferred", - .data = &sysctl_numa_balancing_migrate_deferred, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { .procname = "numa_balancing", .data = NULL, /* filled in by handler */ .maxlen = sizeof(unsigned int), @@ -500,6 +497,15 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_taint, }, + { + .procname = "sysctl_writes_strict", + .data = &sysctl_writes_strict, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &neg_one, + .extra2 = &one, + }, #endif #ifdef CONFIG_LATENCYTOP { @@ -648,7 +654,7 @@ static struct ctl_table kern_table[] = { .extra2 = &one, }, #endif - +#ifdef CONFIG_UEVENT_HELPER { .procname = "hotplug", .data = &uevent_helper, @@ -656,7 +662,7 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dostring, }, - +#endif #ifdef CONFIG_CHR_DEV_SG { .procname = "sg-big-buff", @@ -854,6 +860,17 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one, }, +#ifdef CONFIG_SMP + { + .procname = "softlockup_all_cpu_backtrace", + .data = &sysctl_softlockup_all_cpu_backtrace, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, +#endif /* CONFIG_SMP */ { .procname = "nmi_watchdog", .data = &watchdog_user_enabled, @@ -995,6 +1012,7 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_dohung_task_timeout_secs, + .extra2 = &hung_task_timeout_max, }, { .procname = "hung_task_warnings", @@ -1094,15 +1112,6 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif -#ifdef CONFIG_BLOCK - { - .procname = "blk_iopoll", - .data = &blk_iopoll_enabled, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif { } }; @@ -1283,7 +1292,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = drop_caches_sysctl_handler, .extra1 = &one, - .extra2 = &three, + .extra2 = &four, }, #ifdef CONFIG_COMPACTION { @@ -1318,7 +1327,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(percpu_pagelist_fraction), .mode = 0644, .proc_handler = percpu_pagelist_fraction_sysctl_handler, - .extra1 = &min_percpu_pagelist_fract, + .extra1 = &zero, }, #ifdef CONFIG_MMU { @@ -1431,8 +1440,13 @@ static struct ctl_table vm_table[] = { (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL)) { .procname = "vdso_enabled", +#ifdef CONFIG_X86_32 + .data = &vdso32_enabled, + .maxlen = sizeof(vdso32_enabled), +#else .data = &vdso_enabled, .maxlen = sizeof(vdso_enabled), +#endif .mode = 0644, .proc_handler = proc_dointvec, .extra1 = &zero, @@ -1711,8 +1725,8 @@ int __init sysctl_init(void) #ifdef CONFIG_PROC_SYSCTL -static int _proc_do_string(void* data, int maxlen, int write, - void __user *buffer, +static int _proc_do_string(char *data, int maxlen, int write, + char __user *buffer, size_t *lenp, loff_t *ppos) { size_t len; @@ -1725,21 +1739,30 @@ static int _proc_do_string(void* data, int maxlen, int write, } if (write) { - len = 0; + if (sysctl_writes_strict == SYSCTL_WRITES_STRICT) { + /* Only continue writes not past the end of buffer. */ + len = strlen(data); + if (len > maxlen - 1) + len = maxlen - 1; + + if (*ppos > len) + return 0; + len = *ppos; + } else { + /* Start writing from beginning of buffer. */ + len = 0; + } + + *ppos += *lenp; p = buffer; - while (len < *lenp) { + while ((p - buffer) < *lenp && len < maxlen - 1) { if (get_user(c, p++)) return -EFAULT; if (c == 0 || c == '\n') break; - len++; + data[len++] = c; } - if (len >= maxlen) - len = maxlen-1; - if(copy_from_user(data, buffer, len)) - return -EFAULT; - ((char *) data)[len] = 0; - *ppos += *lenp; + data[len] = 0; } else { len = strlen(data); if (len > maxlen) @@ -1756,10 +1779,10 @@ static int _proc_do_string(void* data, int maxlen, int write, if (len > *lenp) len = *lenp; if (len) - if(copy_to_user(buffer, data, len)) + if (copy_to_user(buffer, data, len)) return -EFAULT; if (len < *lenp) { - if(put_user('\n', ((char __user *) buffer) + len)) + if (put_user('\n', buffer + len)) return -EFAULT; len++; } @@ -1769,6 +1792,14 @@ static int _proc_do_string(void* data, int maxlen, int write, return 0; } +static void warn_sysctl_write(struct ctl_table *table) +{ + pr_warn_once("%s wrote to %s when file position was not 0!\n" + "This will not be supported in the future. To silence this\n" + "warning, set kernel.sysctl_writes_strict = -1\n", + current->comm, table->procname); +} + /** * proc_dostring - read a string sysctl * @table: the sysctl table @@ -1789,8 +1820,11 @@ static int _proc_do_string(void* data, int maxlen, int write, int proc_dostring(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - return _proc_do_string(table->data, table->maxlen, write, - buffer, lenp, ppos); + if (write && *ppos && sysctl_writes_strict == SYSCTL_WRITES_WARN) + warn_sysctl_write(table); + + return _proc_do_string((char *)(table->data), table->maxlen, write, + (char __user *)buffer, lenp, ppos); } static size_t proc_skip_spaces(char **buf) @@ -1964,6 +1998,18 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, conv = do_proc_dointvec_conv; if (write) { + if (*ppos) { + switch (sysctl_writes_strict) { + case SYSCTL_WRITES_STRICT: + goto out; + case SYSCTL_WRITES_WARN: + warn_sysctl_write(table); + break; + default: + break; + } + } + if (left > PAGE_SIZE - 1) left = PAGE_SIZE - 1; page = __get_free_page(GFP_TEMPORARY); @@ -2021,6 +2067,7 @@ free: return err ? : -EINVAL; } *lenp -= left; +out: *ppos += *lenp; return err; } @@ -2213,6 +2260,18 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int left = *lenp; if (write) { + if (*ppos) { + switch (sysctl_writes_strict) { + case SYSCTL_WRITES_STRICT: + goto out; + case SYSCTL_WRITES_WARN: + warn_sysctl_write(table); + break; + default: + break; + } + } + if (left > PAGE_SIZE - 1) left = PAGE_SIZE - 1; page = __get_free_page(GFP_TEMPORARY); @@ -2268,6 +2327,7 @@ free: return err ? : -EINVAL; } *lenp -= left; +out: *ppos += *lenp; return err; } @@ -2514,11 +2574,11 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, bool first = 1; size_t left = *lenp; unsigned long bitmap_len = table->maxlen; - unsigned long *bitmap = (unsigned long *) table->data; + unsigned long *bitmap = *(unsigned long **) table->data; unsigned long *tmp_bitmap = NULL; char tr_a[] = { '-', ',', '\n' }, tr_b[] = { ',', '\n', 0 }, c; - if (!bitmap_len || !left || (*ppos && !write)) { + if (!bitmap || !bitmap_len || !left || (*ppos && !write)) { *lenp = 0; return 0; } diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 3ce6e8c5f3f..f448513a45e 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -124,7 +124,7 @@ config NO_HZ_FULL endchoice config NO_HZ_FULL_ALL - bool "Full dynticks system on all CPUs by default" + bool "Full dynticks system on all CPUs by default (except CPU 0)" depends on NO_HZ_FULL help If the user doesn't pass the nohz_full boot option to diff --git a/kernel/time/Makefile b/kernel/time/Makefile index 9250130646f..57a413fd0eb 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -3,7 +3,10 @@ obj-y += timeconv.o posix-clock.o alarmtimer.o obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o -obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += tick-broadcast.o +ifeq ($(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST),y) + obj-y += tick-broadcast.o + obj-$(CONFIG_TICK_ONESHOT) += tick-broadcast-hrtimer.o +endif obj-$(CONFIG_GENERIC_SCHED_CLOCK) += sched_clock.o obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 88c9c65a430..fe75444ae7e 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -585,9 +585,14 @@ static int alarm_timer_set(struct k_itimer *timr, int flags, struct itimerspec *new_setting, struct itimerspec *old_setting) { + ktime_t exp; + if (!rtcdev) return -ENOTSUPP; + if (flags & ~TIMER_ABSTIME) + return -EINVAL; + if (old_setting) alarm_timer_get(timr, old_setting); @@ -597,8 +602,16 @@ static int alarm_timer_set(struct k_itimer *timr, int flags, /* start the timer */ timr->it.alarm.interval = timespec_to_ktime(new_setting->it_interval); - alarm_start(&timr->it.alarm.alarmtimer, - timespec_to_ktime(new_setting->it_value)); + exp = timespec_to_ktime(new_setting->it_value); + /* Convert (if necessary) to absolute time */ + if (flags != TIMER_ABSTIME) { + ktime_t now; + + now = alarm_bases[timr->it.alarm.alarmtimer.type].gettime(); + exp = ktime_add(now, exp); + } + + alarm_start(&timr->it.alarm.alarmtimer, exp); return 0; } @@ -730,6 +743,9 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags, if (!alarmtimer_get_rtcdev()) return -ENOTSUPP; + if (flags & ~TIMER_ABSTIME) + return -EINVAL; + if (!capable(CAP_WAKE_ALARM)) return -EPERM; diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 086ad6043bc..9c94c19f130 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -146,7 +146,8 @@ static int clockevents_increase_min_delta(struct clock_event_device *dev) { /* Nothing to do if we already reached the limit */ if (dev->min_delta_ns >= MIN_DELTA_LIMIT) { - printk(KERN_WARNING "CE: Reprogramming failure. Giving up\n"); + printk_deferred(KERN_WARNING + "CE: Reprogramming failure. Giving up\n"); dev->next_event.tv64 = KTIME_MAX; return -ETIME; } @@ -159,9 +160,10 @@ static int clockevents_increase_min_delta(struct clock_event_device *dev) if (dev->min_delta_ns > MIN_DELTA_LIMIT) dev->min_delta_ns = MIN_DELTA_LIMIT; - printk(KERN_WARNING "CE: %s increased min_delta_ns to %llu nsec\n", - dev->name ? dev->name : "?", - (unsigned long long) dev->min_delta_ns); + printk_deferred(KERN_WARNING + "CE: %s increased min_delta_ns to %llu nsec\n", + dev->name ? dev->name : "?", + (unsigned long long) dev->min_delta_ns); return 0; } @@ -439,6 +441,19 @@ void clockevents_config_and_register(struct clock_event_device *dev, } EXPORT_SYMBOL_GPL(clockevents_config_and_register); +int __clockevents_update_freq(struct clock_event_device *dev, u32 freq) +{ + clockevents_config(dev, freq); + + if (dev->mode == CLOCK_EVT_MODE_ONESHOT) + return clockevents_program_event(dev, dev->next_event, false); + + if (dev->mode == CLOCK_EVT_MODE_PERIODIC) + dev->set_mode(CLOCK_EVT_MODE_PERIODIC, dev); + + return 0; +} + /** * clockevents_update_freq - Update frequency and reprogram a clock event device. * @dev: device to modify @@ -446,17 +461,22 @@ EXPORT_SYMBOL_GPL(clockevents_config_and_register); * * Reconfigure and reprogram a clock event device in oneshot * mode. Must be called on the cpu for which the device delivers per - * cpu timer events with interrupts disabled! Returns 0 on success, - * -ETIME when the event is in the past. + * cpu timer events. If called for the broadcast device the core takes + * care of serialization. + * + * Returns 0 on success, -ETIME when the event is in the past. */ int clockevents_update_freq(struct clock_event_device *dev, u32 freq) { - clockevents_config(dev, freq); - - if (dev->mode != CLOCK_EVT_MODE_ONESHOT) - return 0; + unsigned long flags; + int ret; - return clockevents_program_event(dev, dev->next_event, false); + local_irq_save(flags); + ret = tick_broadcast_update_freq(dev, freq); + if (ret == -ENODEV) + ret = __clockevents_update_freq(dev, freq); + local_irq_restore(flags); + return ret; } /* @@ -524,12 +544,13 @@ void clockevents_resume(void) #ifdef CONFIG_GENERIC_CLOCKEVENTS /** * clockevents_notify - notification about relevant events + * Returns 0 on success, any other value on error */ -void clockevents_notify(unsigned long reason, void *arg) +int clockevents_notify(unsigned long reason, void *arg) { struct clock_event_device *dev, *tmp; unsigned long flags; - int cpu; + int cpu, ret = 0; raw_spin_lock_irqsave(&clockevents_lock, flags); @@ -542,7 +563,7 @@ void clockevents_notify(unsigned long reason, void *arg) case CLOCK_EVT_NOTIFY_BROADCAST_ENTER: case CLOCK_EVT_NOTIFY_BROADCAST_EXIT: - tick_broadcast_oneshot_control(reason); + ret = tick_broadcast_oneshot_control(reason); break; case CLOCK_EVT_NOTIFY_CPU_DYING: @@ -585,6 +606,7 @@ void clockevents_notify(unsigned long reason, void *arg) break; } raw_spin_unlock_irqrestore(&clockevents_lock, flags); + return ret; } EXPORT_SYMBOL_GPL(clockevents_notify); diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 7a925ba456f..a6a5bf53e86 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -51,7 +51,13 @@ * HZ shrinks, so values greater than 8 overflow 32bits when * HZ=100. */ +#if HZ < 34 +#define JIFFIES_SHIFT 6 +#elif HZ < 67 +#define JIFFIES_SHIFT 7 +#else #define JIFFIES_SHIFT 8 +#endif static cycle_t jiffies_read(struct clocksource *cs) { diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index af8d1d4f3d5..33db43a3951 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -165,21 +165,21 @@ static inline void pps_set_freq(s64 freq) static inline int is_error_status(int status) { - return (time_status & (STA_UNSYNC|STA_CLOCKERR)) + return (status & (STA_UNSYNC|STA_CLOCKERR)) /* PPS signal lost when either PPS time or * PPS frequency synchronization requested */ - || ((time_status & (STA_PPSFREQ|STA_PPSTIME)) - && !(time_status & STA_PPSSIGNAL)) + || ((status & (STA_PPSFREQ|STA_PPSTIME)) + && !(status & STA_PPSSIGNAL)) /* PPS jitter exceeded when * PPS time synchronization requested */ - || ((time_status & (STA_PPSTIME|STA_PPSJITTER)) + || ((status & (STA_PPSTIME|STA_PPSJITTER)) == (STA_PPSTIME|STA_PPSJITTER)) /* PPS wander exceeded or calibration error when * PPS frequency synchronization requested */ - || ((time_status & STA_PPSFREQ) - && (time_status & (STA_PPSWANDER|STA_PPSERROR))); + || ((status & STA_PPSFREQ) + && (status & (STA_PPSWANDER|STA_PPSERROR))); } static inline void pps_fill_timex(struct timex *txc) @@ -514,12 +514,13 @@ static void sync_cmos_clock(struct work_struct *work) next.tv_sec++; next.tv_nsec -= NSEC_PER_SEC; } - schedule_delayed_work(&sync_cmos_work, timespec_to_jiffies(&next)); + queue_delayed_work(system_power_efficient_wq, + &sync_cmos_work, timespec_to_jiffies(&next)); } void ntp_notify_cmos_timer(void) { - schedule_delayed_work(&sync_cmos_work, 0); + queue_delayed_work(system_power_efficient_wq, &sync_cmos_work, 0); } #else @@ -785,8 +786,9 @@ static long hardpps_update_freq(struct pps_normtime freq_norm) time_status |= STA_PPSERROR; pps_errcnt++; pps_dec_freq_interval(); - pr_err("hardpps: PPSERROR: interval too long - %ld s\n", - freq_norm.sec); + printk_deferred(KERN_ERR + "hardpps: PPSERROR: interval too long - %ld s\n", + freq_norm.sec); return 0; } @@ -799,7 +801,8 @@ static long hardpps_update_freq(struct pps_normtime freq_norm) delta = shift_right(ftemp - pps_freq, NTP_SCALE_SHIFT); pps_freq = ftemp; if (delta > PPS_MAXWANDER || delta < -PPS_MAXWANDER) { - pr_warning("hardpps: PPSWANDER: change=%ld\n", delta); + printk_deferred(KERN_WARNING + "hardpps: PPSWANDER: change=%ld\n", delta); time_status |= STA_PPSWANDER; pps_stbcnt++; pps_dec_freq_interval(); @@ -843,8 +846,9 @@ static void hardpps_update_phase(long error) * the time offset is updated. */ if (jitter > (pps_jitter << PPS_POPCORN)) { - pr_warning("hardpps: PPSJITTER: jitter=%ld, limit=%ld\n", - jitter, (pps_jitter << PPS_POPCORN)); + printk_deferred(KERN_WARNING + "hardpps: PPSJITTER: jitter=%ld, limit=%ld\n", + jitter, (pps_jitter << PPS_POPCORN)); time_status |= STA_PPSJITTER; pps_jitcnt++; } else if (time_status & STA_PPSTIME) { @@ -901,7 +905,7 @@ void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) time_status |= STA_PPSJITTER; /* restart the frequency calibration interval */ pps_fbase = *raw_ts; - pr_err("hardpps: PPSJITTER: bad pulse\n"); + printk_deferred(KERN_ERR "hardpps: PPSJITTER: bad pulse\n"); return; } @@ -922,7 +926,10 @@ void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) static int __init ntp_tick_adj_setup(char *str) { - ntp_tick_adj = simple_strtol(str, NULL, 0); + int rc = kstrtol(str, 0, (long *)&ntp_tick_adj); + + if (rc) + return rc; ntp_tick_adj <<= NTP_SCALE_SHIFT; return 1; diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 0abb3646428..01d2d15aa66 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -49,13 +49,6 @@ static u64 notrace jiffy_sched_clock_read(void) return (u64)(jiffies - INITIAL_JIFFIES); } -static u32 __read_mostly (*read_sched_clock_32)(void); - -static u64 notrace read_sched_clock_32_wrapper(void) -{ - return read_sched_clock_32(); -} - static u64 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read; static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) @@ -116,20 +109,42 @@ static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt) void __init sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) { + u64 res, wrap, new_mask, new_epoch, cyc, ns; + u32 new_mult, new_shift; + ktime_t new_wrap_kt; unsigned long r; - u64 res, wrap; char r_unit; if (cd.rate > rate) return; WARN_ON(!irqs_disabled()); - read_sched_clock = read; - sched_clock_mask = CLOCKSOURCE_MASK(bits); - cd.rate = rate; /* calculate the mult/shift to convert counter ticks to ns. */ - clocks_calc_mult_shift(&cd.mult, &cd.shift, rate, NSEC_PER_SEC, 3600); + clocks_calc_mult_shift(&new_mult, &new_shift, rate, NSEC_PER_SEC, 3600); + + new_mask = CLOCKSOURCE_MASK(bits); + + /* calculate how many ns until we wrap */ + wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask); + new_wrap_kt = ns_to_ktime(wrap - (wrap >> 3)); + + /* update epoch for new counter and update epoch_ns from old counter*/ + new_epoch = read(); + cyc = read_sched_clock(); + ns = cd.epoch_ns + cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask, + cd.mult, cd.shift); + + raw_write_seqcount_begin(&cd.seq); + read_sched_clock = read; + sched_clock_mask = new_mask; + cd.rate = rate; + cd.wrap_kt = new_wrap_kt; + cd.mult = new_mult; + cd.shift = new_shift; + cd.epoch_cyc = new_epoch; + cd.epoch_ns = ns; + raw_write_seqcount_end(&cd.seq); r = rate; if (r >= 4000000) { @@ -141,22 +156,12 @@ void __init sched_clock_register(u64 (*read)(void), int bits, } else r_unit = ' '; - /* calculate how many ns until we wrap */ - wrap = clocks_calc_max_nsecs(cd.mult, cd.shift, 0, sched_clock_mask); - cd.wrap_kt = ns_to_ktime(wrap - (wrap >> 3)); - /* calculate the ns resolution of this counter */ - res = cyc_to_ns(1ULL, cd.mult, cd.shift); + res = cyc_to_ns(1ULL, new_mult, new_shift); + pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lluns\n", bits, r, r_unit, res, wrap); - update_sched_clock(); - - /* - * Ensure that sched_clock() starts off at 0ns - */ - cd.epoch_ns = 0; - /* Enable IRQ time accounting if we have a fast enough sched_clock */ if (irqtime > 0 || (irqtime == -1 && rate >= 1000000)) enable_sched_clock_irqtime(); @@ -164,12 +169,6 @@ void __init sched_clock_register(u64 (*read)(void), int bits, pr_debug("Registered %pF as sched_clock source\n", read); } -void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate) -{ - read_sched_clock_32 = read; - sched_clock_register(read_sched_clock_32_wrapper, bits, rate); -} - void __init sched_clock_postinit(void) { /* @@ -192,7 +191,8 @@ void __init sched_clock_postinit(void) static int sched_clock_suspend(void) { - sched_clock_poll(&sched_clock_timer); + update_sched_clock(); + hrtimer_cancel(&sched_clock_timer); cd.suspended = true; return 0; } @@ -200,6 +200,7 @@ static int sched_clock_suspend(void) static void sched_clock_resume(void) { cd.epoch_cyc = read_sched_clock(); + hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); cd.suspended = false; } diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c new file mode 100644 index 00000000000..eb682d5c697 --- /dev/null +++ b/kernel/time/tick-broadcast-hrtimer.c @@ -0,0 +1,106 @@ +/* + * linux/kernel/time/tick-broadcast-hrtimer.c + * This file emulates a local clock event device + * via a pseudo clock device. + */ +#include <linux/cpu.h> +#include <linux/err.h> +#include <linux/hrtimer.h> +#include <linux/interrupt.h> +#include <linux/percpu.h> +#include <linux/profile.h> +#include <linux/clockchips.h> +#include <linux/sched.h> +#include <linux/smp.h> +#include <linux/module.h> + +#include "tick-internal.h" + +static struct hrtimer bctimer; + +static void bc_set_mode(enum clock_event_mode mode, + struct clock_event_device *bc) +{ + switch (mode) { + case CLOCK_EVT_MODE_SHUTDOWN: + /* + * Note, we cannot cancel the timer here as we might + * run into the following live lock scenario: + * + * cpu 0 cpu1 + * lock(broadcast_lock); + * hrtimer_interrupt() + * bc_handler() + * tick_handle_oneshot_broadcast(); + * lock(broadcast_lock); + * hrtimer_cancel() + * wait_for_callback() + */ + hrtimer_try_to_cancel(&bctimer); + break; + default: + break; + } +} + +/* + * This is called from the guts of the broadcast code when the cpu + * which is about to enter idle has the earliest broadcast timer event. + */ +static int bc_set_next(ktime_t expires, struct clock_event_device *bc) +{ + /* + * We try to cancel the timer first. If the callback is on + * flight on some other cpu then we let it handle it. If we + * were able to cancel the timer nothing can rearm it as we + * own broadcast_lock. + * + * However we can also be called from the event handler of + * ce_broadcast_hrtimer itself when it expires. We cannot + * restart the timer because we are in the callback, but we + * can set the expiry time and let the callback return + * HRTIMER_RESTART. + */ + if (hrtimer_try_to_cancel(&bctimer) >= 0) { + hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED); + /* Bind the "device" to the cpu */ + bc->bound_on = smp_processor_id(); + } else if (bc->bound_on == smp_processor_id()) { + hrtimer_set_expires(&bctimer, expires); + } + return 0; +} + +static struct clock_event_device ce_broadcast_hrtimer = { + .set_mode = bc_set_mode, + .set_next_ktime = bc_set_next, + .features = CLOCK_EVT_FEAT_ONESHOT | + CLOCK_EVT_FEAT_KTIME | + CLOCK_EVT_FEAT_HRTIMER, + .rating = 0, + .bound_on = -1, + .min_delta_ns = 1, + .max_delta_ns = KTIME_MAX, + .min_delta_ticks = 1, + .max_delta_ticks = ULONG_MAX, + .mult = 1, + .shift = 0, + .cpumask = cpu_all_mask, +}; + +static enum hrtimer_restart bc_handler(struct hrtimer *t) +{ + ce_broadcast_hrtimer.event_handler(&ce_broadcast_hrtimer); + + if (ce_broadcast_hrtimer.next_event.tv64 == KTIME_MAX) + return HRTIMER_NORESTART; + + return HRTIMER_RESTART; +} + +void tick_setup_hrtimer_broadcast(void) +{ + hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + bctimer.function = bc_handler; + clockevents_register_device(&ce_broadcast_hrtimer); +} diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 43780ab5e27..64c5990fd50 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -120,6 +120,19 @@ int tick_is_broadcast_device(struct clock_event_device *dev) return (dev && tick_broadcast_device.evtdev == dev); } +int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq) +{ + int ret = -ENODEV; + + if (tick_is_broadcast_device(dev)) { + raw_spin_lock(&tick_broadcast_lock); + ret = __clockevents_update_freq(dev, freq); + raw_spin_unlock(&tick_broadcast_lock); + } + return ret; +} + + static void err_broadcast(const struct cpumask *mask) { pr_crit_once("Failed to broadcast timer tick. Some CPUs may be unresponsive.\n"); @@ -272,12 +285,8 @@ static void tick_do_broadcast(struct cpumask *mask) */ static void tick_do_periodic_broadcast(void) { - raw_spin_lock(&tick_broadcast_lock); - cpumask_and(tmpmask, cpu_online_mask, tick_broadcast_mask); tick_do_broadcast(tmpmask); - - raw_spin_unlock(&tick_broadcast_lock); } /* @@ -287,13 +296,15 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev) { ktime_t next; + raw_spin_lock(&tick_broadcast_lock); + tick_do_periodic_broadcast(); /* * The device is in periodic mode. No reprogramming necessary: */ if (dev->mode == CLOCK_EVT_MODE_PERIODIC) - return; + goto unlock; /* * Setup the next period for devices, which do not have @@ -306,9 +317,11 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev) next = ktime_add(next, tick_period); if (!clockevents_program_event(dev, next, false)) - return; + goto unlock; tick_do_periodic_broadcast(); } +unlock: + raw_spin_unlock(&tick_broadcast_lock); } /* @@ -630,24 +643,61 @@ again: raw_spin_unlock(&tick_broadcast_lock); } +static int broadcast_needs_cpu(struct clock_event_device *bc, int cpu) +{ + if (!(bc->features & CLOCK_EVT_FEAT_HRTIMER)) + return 0; + if (bc->next_event.tv64 == KTIME_MAX) + return 0; + return bc->bound_on == cpu ? -EBUSY : 0; +} + +static void broadcast_shutdown_local(struct clock_event_device *bc, + struct clock_event_device *dev) +{ + /* + * For hrtimer based broadcasting we cannot shutdown the cpu + * local device if our own event is the first one to expire or + * if we own the broadcast timer. + */ + if (bc->features & CLOCK_EVT_FEAT_HRTIMER) { + if (broadcast_needs_cpu(bc, smp_processor_id())) + return; + if (dev->next_event.tv64 < bc->next_event.tv64) + return; + } + clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); +} + +static void broadcast_move_bc(int deadcpu) +{ + struct clock_event_device *bc = tick_broadcast_device.evtdev; + + if (!bc || !broadcast_needs_cpu(bc, deadcpu)) + return; + /* This moves the broadcast assignment to this cpu */ + clockevents_program_event(bc, bc->next_event, 1); +} + /* * Powerstate information: The system enters/leaves a state, where * affected devices might stop + * Returns 0 on success, -EBUSY if the cpu is used to broadcast wakeups. */ -void tick_broadcast_oneshot_control(unsigned long reason) +int tick_broadcast_oneshot_control(unsigned long reason) { struct clock_event_device *bc, *dev; struct tick_device *td; unsigned long flags; ktime_t now; - int cpu; + int cpu, ret = 0; /* * Periodic mode does not care about the enter/exit of power * states */ if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) - return; + return 0; /* * We are called with preemtion disabled from the depth of the @@ -658,7 +708,7 @@ void tick_broadcast_oneshot_control(unsigned long reason) dev = td->evtdev; if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) - return; + return 0; bc = tick_broadcast_device.evtdev; @@ -666,7 +716,7 @@ void tick_broadcast_oneshot_control(unsigned long reason) if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) { if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) { WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask)); - clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); + broadcast_shutdown_local(bc, dev); /* * We only reprogram the broadcast timer if we * did not mark ourself in the force mask and @@ -679,6 +729,16 @@ void tick_broadcast_oneshot_control(unsigned long reason) dev->next_event.tv64 < bc->next_event.tv64) tick_broadcast_set_event(bc, cpu, dev->next_event, 1); } + /* + * If the current CPU owns the hrtimer broadcast + * mechanism, it cannot go deep idle and we remove the + * CPU from the broadcast mask. We don't have to go + * through the EXIT path as the local timer is not + * shutdown. + */ + ret = broadcast_needs_cpu(bc, cpu); + if (ret) + cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); } else { if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) { clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); @@ -746,6 +806,7 @@ void tick_broadcast_oneshot_control(unsigned long reason) } out: raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); + return ret; } /* @@ -756,6 +817,7 @@ out: static void tick_broadcast_clear_oneshot(int cpu) { cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); + cpumask_clear_cpu(cpu, tick_broadcast_pending_mask); } static void tick_broadcast_init_next_event(struct cpumask *mask, @@ -851,6 +913,8 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup) cpumask_clear_cpu(cpu, tick_broadcast_pending_mask); cpumask_clear_cpu(cpu, tick_broadcast_force_mask); + broadcast_move_bc(cpu); + raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 20b2fe37d10..0a0608edeb2 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -98,18 +98,19 @@ static void tick_periodic(int cpu) void tick_handle_periodic(struct clock_event_device *dev) { int cpu = smp_processor_id(); - ktime_t next; + ktime_t next = dev->next_event; tick_periodic(cpu); if (dev->mode != CLOCK_EVT_MODE_ONESHOT) return; - /* - * Setup the next period for devices, which do not have - * periodic mode: - */ - next = ktime_add(dev->next_event, tick_period); for (;;) { + /* + * Setup the next period for devices, which do not have + * periodic mode: + */ + next = ktime_add(next, tick_period); + if (!clockevents_program_event(dev, next, false)) return; /* @@ -118,12 +119,11 @@ void tick_handle_periodic(struct clock_event_device *dev) * to be sure we're using a real hardware clocksource. * Otherwise we could get trapped in an infinite * loop, as the tick_periodic() increments jiffies, - * when then will increment time, posibly causing + * which then will increment time, possibly causing * the loop to trigger again and again. */ if (timekeeping_valid_for_hres()) tick_periodic(cpu); - next = ktime_add(next, tick_period); } } @@ -276,7 +276,7 @@ static bool tick_check_preferred(struct clock_event_device *curdev, bool tick_check_replacement(struct clock_event_device *curdev, struct clock_event_device *newdev) { - if (tick_check_percpu(curdev, newdev, smp_processor_id())) + if (!tick_check_percpu(curdev, newdev, smp_processor_id())) return false; return tick_check_preferred(curdev, newdev); diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 8329669b51e..7ab92b19965 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -46,7 +46,7 @@ extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)); extern void tick_resume_oneshot(void); # ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST extern void tick_broadcast_setup_oneshot(struct clock_event_device *bc); -extern void tick_broadcast_oneshot_control(unsigned long reason); +extern int tick_broadcast_oneshot_control(unsigned long reason); extern void tick_broadcast_switch_to_oneshot(void); extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup); extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc); @@ -58,7 +58,7 @@ static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { BUG(); } -static inline void tick_broadcast_oneshot_control(unsigned long reason) { } +static inline int tick_broadcast_oneshot_control(unsigned long reason) { return 0; } static inline void tick_broadcast_switch_to_oneshot(void) { } static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } static inline int tick_broadcast_oneshot_active(void) { return 0; } @@ -87,7 +87,7 @@ static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { BUG(); } -static inline void tick_broadcast_oneshot_control(unsigned long reason) { } +static inline int tick_broadcast_oneshot_control(unsigned long reason) { return 0; } static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc) { @@ -111,6 +111,7 @@ extern int tick_resume_broadcast(void); extern void tick_broadcast_init(void); extern void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast); +int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq); #else /* !BROADCAST */ @@ -133,6 +134,8 @@ static inline void tick_shutdown_broadcast(unsigned int *cpup) { } static inline void tick_suspend_broadcast(void) { } static inline int tick_resume_broadcast(void) { return 0; } static inline void tick_broadcast_init(void) { } +static inline int tick_broadcast_update_freq(struct clock_event_device *dev, + u32 freq) { return -ENODEV; } /* * Set the periodic handler in non broadcast mode @@ -152,6 +155,8 @@ static inline int tick_device_is_functional(struct clock_event_device *dev) return !(dev->features & CLOCK_EVT_FEAT_DUMMY); } +int __clockevents_update_freq(struct clock_event_device *dev, u32 freq); + #endif extern void do_timer(unsigned long ticks); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 9f8af69c67e..6558b7ac112 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -84,6 +84,9 @@ static void tick_do_update_jiffies64(ktime_t now) /* Keep the tick_next_period variable up to date */ tick_next_period = ktime_add(last_jiffies_update, tick_period); + } else { + write_sequnlock(&jiffies_lock); + return; } write_sequnlock(&jiffies_lock); update_wall_time(); @@ -967,7 +970,7 @@ static void tick_nohz_switch_to_nohz(void) struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); ktime_t next; - if (!tick_nohz_active) + if (!tick_nohz_enabled) return; local_irq_disable(); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 0aa4ce81bc1..32d8d6aaedb 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -22,6 +22,7 @@ #include <linux/tick.h> #include <linux/stop_machine.h> #include <linux/pvclock_gtod.h> +#include <linux/compiler.h> #include "tick-internal.h" #include "ntp_internal.h" @@ -760,7 +761,7 @@ u64 timekeeping_max_deferment(void) * * XXX - Do be sure to remove it once all arches implement it. */ -void __attribute__((weak)) read_persistent_clock(struct timespec *ts) +void __weak read_persistent_clock(struct timespec *ts) { ts->tv_sec = 0; ts->tv_nsec = 0; @@ -775,7 +776,7 @@ void __attribute__((weak)) read_persistent_clock(struct timespec *ts) * * XXX - Do be sure to remove it once all arches implement it. */ -void __attribute__((weak)) read_boot_clock(struct timespec *ts) +void __weak read_boot_clock(struct timespec *ts) { ts->tv_sec = 0; ts->tv_nsec = 0; @@ -851,8 +852,9 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk, struct timespec *delta) { if (!timespec_valid_strict(delta)) { - printk(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid " - "sleep delta value!\n"); + printk_deferred(KERN_WARNING + "__timekeeping_inject_sleeptime: Invalid " + "sleep delta value!\n"); return; } tk_xtime_add(tk, delta); @@ -1156,7 +1158,7 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) if (unlikely(tk->clock->maxadj && (tk->mult + adj > tk->clock->mult + tk->clock->maxadj))) { - printk_once(KERN_WARNING + printk_deferred_once(KERN_WARNING "Adjusting %s more than 11%% (%ld vs %ld)\n", tk->clock->name, (long)tk->mult + adj, (long)tk->clock->mult + tk->clock->maxadj); @@ -1435,7 +1437,8 @@ void update_wall_time(void) out: raw_spin_unlock_irqrestore(&timekeeper_lock, flags); if (clock_set) - clock_was_set(); + /* Have to call _delayed version, since in irq context*/ + clock_was_set_delayed(); } /** diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c index 802433a4f5e..4d54f97558d 100644 --- a/kernel/time/timekeeping_debug.c +++ b/kernel/time/timekeeping_debug.c @@ -21,6 +21,8 @@ #include <linux/seq_file.h> #include <linux/time.h> +#include "timekeeping_internal.h" + static unsigned int sleep_time_bin[32] = {0}; static int tk_debug_show_sleep_time(struct seq_file *s, void *data) diff --git a/kernel/timer.c b/kernel/timer.c index accfd241b9e..3bb01a323b2 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -52,7 +52,7 @@ #define CREATE_TRACE_POINTS #include <trace/events/timer.h> -u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; +__visible u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; EXPORT_SYMBOL(jiffies_64); @@ -81,6 +81,7 @@ struct tvec_base { unsigned long timer_jiffies; unsigned long next_timer; unsigned long active_timers; + unsigned long all_timers; struct tvec_root tv1; struct tvec tv2; struct tvec tv3; @@ -337,6 +338,20 @@ void set_timer_slack(struct timer_list *timer, int slack_hz) } EXPORT_SYMBOL_GPL(set_timer_slack); +/* + * If the list is empty, catch up ->timer_jiffies to the current time. + * The caller must hold the tvec_base lock. Returns true if the list + * was empty and therefore ->timer_jiffies was updated. + */ +static bool catchup_timer_jiffies(struct tvec_base *base) +{ + if (!base->all_timers) { + base->timer_jiffies = jiffies; + return true; + } + return false; +} + static void __internal_add_timer(struct tvec_base *base, struct timer_list *timer) { @@ -383,15 +398,17 @@ __internal_add_timer(struct tvec_base *base, struct timer_list *timer) static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) { + (void)catchup_timer_jiffies(base); __internal_add_timer(base, timer); /* * Update base->active_timers and base->next_timer */ if (!tbase_get_deferrable(timer->base)) { - if (time_before(timer->expires, base->next_timer)) + if (!base->active_timers++ || + time_before(timer->expires, base->next_timer)) base->next_timer = timer->expires; - base->active_timers++; } + base->all_timers++; } #ifdef CONFIG_TIMER_STATS @@ -671,6 +688,8 @@ detach_expired_timer(struct timer_list *timer, struct tvec_base *base) detach_timer(timer, true); if (!tbase_get_deferrable(timer->base)) base->active_timers--; + base->all_timers--; + (void)catchup_timer_jiffies(base); } static int detach_if_pending(struct timer_list *timer, struct tvec_base *base, @@ -685,6 +704,8 @@ static int detach_if_pending(struct timer_list *timer, struct tvec_base *base, if (timer->expires == base->next_timer) base->next_timer = base->timer_jiffies; } + base->all_timers--; + (void)catchup_timer_jiffies(base); return 1; } @@ -739,12 +760,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, debug_activate(timer, expires); - cpu = smp_processor_id(); - -#if defined(CONFIG_NO_HZ_COMMON) && defined(CONFIG_SMP) - if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) - cpu = get_nohz_timer_target(); -#endif + cpu = get_nohz_timer_target(pinned); new_base = per_cpu(tvec_bases, cpu); if (base != new_base) { @@ -822,7 +838,7 @@ unsigned long apply_slack(struct timer_list *timer, unsigned long expires) bit = find_last_bit(&mask, BITS_PER_LONG); - mask = (1 << bit) - 1; + mask = (1UL << bit) - 1; expires_limit = expires_limit & ~(mask); @@ -939,8 +955,15 @@ void add_timer_on(struct timer_list *timer, int cpu) * with the timer by holding the timer base lock. This also * makes sure that a CPU on the way to stop its tick can not * evaluate the timer wheel. + * + * Spare the IPI for deferrable timers on idle targets though. + * The next busy ticks will take care of it. Except full dynticks + * require special care against races with idle_cpu(), lets deal + * with that later. */ - wake_up_nohz_cpu(cpu); + if (!tbase_get_deferrable(timer->base) || tick_nohz_full_cpu(cpu)) + wake_up_nohz_cpu(cpu); + spin_unlock_irqrestore(&base->lock, flags); } EXPORT_SYMBOL_GPL(add_timer_on); @@ -1146,6 +1169,10 @@ static inline void __run_timers(struct tvec_base *base) struct timer_list *timer; spin_lock_irq(&base->lock); + if (catchup_timer_jiffies(base)) { + spin_unlock_irq(&base->lock); + return; + } while (time_after_eq(jiffies, base->timer_jiffies)) { struct list_head work_list; struct list_head *head = &work_list; @@ -1160,7 +1187,7 @@ static inline void __run_timers(struct tvec_base *base) !cascade(base, &base->tv4, INDEX(2))) cascade(base, &base->tv5, INDEX(3)); ++base->timer_jiffies; - list_replace_init(base->tv1.vec + index, &work_list); + list_replace_init(base->tv1.vec + index, head); while (!list_empty(head)) { void (*fn)(unsigned long); unsigned long data; @@ -1523,9 +1550,8 @@ static int init_timers_cpu(int cpu) if (!base) return -ENOMEM; - /* Make sure that tvec_base is 2 byte aligned */ - if (tbase_get_deferrable(base)) { - WARN_ON(1); + /* Make sure tvec_base has TIMER_FLAG_MASK bits free */ + if (WARN_ON(base != tbase_get_base(base))) { kfree(base); return -ENOMEM; } @@ -1559,6 +1585,7 @@ static int init_timers_cpu(int cpu) base->timer_jiffies = jiffies; base->next_timer = base->timer_jiffies; base->active_timers = 0; + base->all_timers = 0; return 0; } @@ -1648,9 +1675,9 @@ void __init init_timers(void) err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE, (void *)(long)smp_processor_id()); - init_timer_stats(); - BUG_ON(err != NOTIFY_OK); + + init_timer_stats(); register_cpu_notifier(&timers_nb); open_softirq(TIMER_SOFTIRQ, run_timer_softirq); } diff --git a/kernel/torture.c b/kernel/torture.c new file mode 100644 index 00000000000..40bb511cca4 --- /dev/null +++ b/kernel/torture.c @@ -0,0 +1,733 @@ +/* + * Common functions for in-kernel torture tests. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright (C) IBM Corporation, 2014 + * + * Author: Paul E. McKenney <paulmck@us.ibm.com> + * Based on kernel/rcu/torture.c. + */ +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/kthread.h> +#include <linux/err.h> +#include <linux/spinlock.h> +#include <linux/smp.h> +#include <linux/interrupt.h> +#include <linux/sched.h> +#include <linux/atomic.h> +#include <linux/bitops.h> +#include <linux/completion.h> +#include <linux/moduleparam.h> +#include <linux/percpu.h> +#include <linux/notifier.h> +#include <linux/reboot.h> +#include <linux/freezer.h> +#include <linux/cpu.h> +#include <linux/delay.h> +#include <linux/stat.h> +#include <linux/slab.h> +#include <linux/trace_clock.h> +#include <asm/byteorder.h> +#include <linux/torture.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com>"); + +static char *torture_type; +static bool verbose; + +/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ +#define FULLSTOP_DONTSTOP 0 /* Normal operation. */ +#define FULLSTOP_SHUTDOWN 1 /* System shutdown with torture running. */ +#define FULLSTOP_RMMOD 2 /* Normal rmmod of torture. */ +static int fullstop = FULLSTOP_RMMOD; +static DEFINE_MUTEX(fullstop_mutex); +static int *torture_runnable; + +#ifdef CONFIG_HOTPLUG_CPU + +/* + * Variables for online-offline handling. Only present if CPU hotplug + * is enabled, otherwise does nothing. + */ + +static struct task_struct *onoff_task; +static long onoff_holdoff; +static long onoff_interval; +static long n_offline_attempts; +static long n_offline_successes; +static unsigned long sum_offline; +static int min_offline = -1; +static int max_offline; +static long n_online_attempts; +static long n_online_successes; +static unsigned long sum_online; +static int min_online = -1; +static int max_online; + +/* + * Execute random CPU-hotplug operations at the interval specified + * by the onoff_interval. + */ +static int +torture_onoff(void *arg) +{ + int cpu; + unsigned long delta; + int maxcpu = -1; + DEFINE_TORTURE_RANDOM(rand); + int ret; + unsigned long starttime; + + VERBOSE_TOROUT_STRING("torture_onoff task started"); + for_each_online_cpu(cpu) + maxcpu = cpu; + WARN_ON(maxcpu < 0); + if (onoff_holdoff > 0) { + VERBOSE_TOROUT_STRING("torture_onoff begin holdoff"); + schedule_timeout_interruptible(onoff_holdoff); + VERBOSE_TOROUT_STRING("torture_onoff end holdoff"); + } + while (!torture_must_stop()) { + cpu = (torture_random(&rand) >> 4) % (maxcpu + 1); + if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) { + if (verbose) + pr_alert("%s" TORTURE_FLAG + "torture_onoff task: offlining %d\n", + torture_type, cpu); + starttime = jiffies; + n_offline_attempts++; + ret = cpu_down(cpu); + if (ret) { + if (verbose) + pr_alert("%s" TORTURE_FLAG + "torture_onoff task: offline %d failed: errno %d\n", + torture_type, cpu, ret); + } else { + if (verbose) + pr_alert("%s" TORTURE_FLAG + "torture_onoff task: offlined %d\n", + torture_type, cpu); + n_offline_successes++; + delta = jiffies - starttime; + sum_offline += delta; + if (min_offline < 0) { + min_offline = delta; + max_offline = delta; + } + if (min_offline > delta) + min_offline = delta; + if (max_offline < delta) + max_offline = delta; + } + } else if (cpu_is_hotpluggable(cpu)) { + if (verbose) + pr_alert("%s" TORTURE_FLAG + "torture_onoff task: onlining %d\n", + torture_type, cpu); + starttime = jiffies; + n_online_attempts++; + ret = cpu_up(cpu); + if (ret) { + if (verbose) + pr_alert("%s" TORTURE_FLAG + "torture_onoff task: online %d failed: errno %d\n", + torture_type, cpu, ret); + } else { + if (verbose) + pr_alert("%s" TORTURE_FLAG + "torture_onoff task: onlined %d\n", + torture_type, cpu); + n_online_successes++; + delta = jiffies - starttime; + sum_online += delta; + if (min_online < 0) { + min_online = delta; + max_online = delta; + } + if (min_online > delta) + min_online = delta; + if (max_online < delta) + max_online = delta; + } + } + schedule_timeout_interruptible(onoff_interval); + } + torture_kthread_stopping("torture_onoff"); + return 0; +} + +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ + +/* + * Initiate online-offline handling. + */ +int torture_onoff_init(long ooholdoff, long oointerval) +{ + int ret = 0; + +#ifdef CONFIG_HOTPLUG_CPU + onoff_holdoff = ooholdoff; + onoff_interval = oointerval; + if (onoff_interval <= 0) + return 0; + ret = torture_create_kthread(torture_onoff, NULL, onoff_task); +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ + return ret; +} +EXPORT_SYMBOL_GPL(torture_onoff_init); + +/* + * Clean up after online/offline testing. + */ +static void torture_onoff_cleanup(void) +{ +#ifdef CONFIG_HOTPLUG_CPU + if (onoff_task == NULL) + return; + VERBOSE_TOROUT_STRING("Stopping torture_onoff task"); + kthread_stop(onoff_task); + onoff_task = NULL; +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ +} +EXPORT_SYMBOL_GPL(torture_onoff_cleanup); + +/* + * Print online/offline testing statistics. + */ +char *torture_onoff_stats(char *page) +{ +#ifdef CONFIG_HOTPLUG_CPU + page += sprintf(page, + "onoff: %ld/%ld:%ld/%ld %d,%d:%d,%d %lu:%lu (HZ=%d) ", + n_online_successes, n_online_attempts, + n_offline_successes, n_offline_attempts, + min_online, max_online, + min_offline, max_offline, + sum_online, sum_offline, HZ); +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ + return page; +} +EXPORT_SYMBOL_GPL(torture_onoff_stats); + +/* + * Were all the online/offline operations successful? + */ +bool torture_onoff_failures(void) +{ +#ifdef CONFIG_HOTPLUG_CPU + return n_online_successes != n_online_attempts || + n_offline_successes != n_offline_attempts; +#else /* #ifdef CONFIG_HOTPLUG_CPU */ + return false; +#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */ +} +EXPORT_SYMBOL_GPL(torture_onoff_failures); + +#define TORTURE_RANDOM_MULT 39916801 /* prime */ +#define TORTURE_RANDOM_ADD 479001701 /* prime */ +#define TORTURE_RANDOM_REFRESH 10000 + +/* + * Crude but fast random-number generator. Uses a linear congruential + * generator, with occasional help from cpu_clock(). + */ +unsigned long +torture_random(struct torture_random_state *trsp) +{ + if (--trsp->trs_count < 0) { + trsp->trs_state += (unsigned long)local_clock(); + trsp->trs_count = TORTURE_RANDOM_REFRESH; + } + trsp->trs_state = trsp->trs_state * TORTURE_RANDOM_MULT + + TORTURE_RANDOM_ADD; + return swahw32(trsp->trs_state); +} +EXPORT_SYMBOL_GPL(torture_random); + +/* + * Variables for shuffling. The idea is to ensure that each CPU stays + * idle for an extended period to test interactions with dyntick idle, + * as well as interactions with any per-CPU varibles. + */ +struct shuffle_task { + struct list_head st_l; + struct task_struct *st_t; +}; + +static long shuffle_interval; /* In jiffies. */ +static struct task_struct *shuffler_task; +static cpumask_var_t shuffle_tmp_mask; +static int shuffle_idle_cpu; /* Force all torture tasks off this CPU */ +static struct list_head shuffle_task_list = LIST_HEAD_INIT(shuffle_task_list); +static DEFINE_MUTEX(shuffle_task_mutex); + +/* + * Register a task to be shuffled. If there is no memory, just splat + * and don't bother registering. + */ +void torture_shuffle_task_register(struct task_struct *tp) +{ + struct shuffle_task *stp; + + if (WARN_ON_ONCE(tp == NULL)) + return; + stp = kmalloc(sizeof(*stp), GFP_KERNEL); + if (WARN_ON_ONCE(stp == NULL)) + return; + stp->st_t = tp; + mutex_lock(&shuffle_task_mutex); + list_add(&stp->st_l, &shuffle_task_list); + mutex_unlock(&shuffle_task_mutex); +} +EXPORT_SYMBOL_GPL(torture_shuffle_task_register); + +/* + * Unregister all tasks, for example, at the end of the torture run. + */ +static void torture_shuffle_task_unregister_all(void) +{ + struct shuffle_task *stp; + struct shuffle_task *p; + + mutex_lock(&shuffle_task_mutex); + list_for_each_entry_safe(stp, p, &shuffle_task_list, st_l) { + list_del(&stp->st_l); + kfree(stp); + } + mutex_unlock(&shuffle_task_mutex); +} + +/* Shuffle tasks such that we allow shuffle_idle_cpu to become idle. + * A special case is when shuffle_idle_cpu = -1, in which case we allow + * the tasks to run on all CPUs. + */ +static void torture_shuffle_tasks(void) +{ + struct shuffle_task *stp; + + cpumask_setall(shuffle_tmp_mask); + get_online_cpus(); + + /* No point in shuffling if there is only one online CPU (ex: UP) */ + if (num_online_cpus() == 1) { + put_online_cpus(); + return; + } + + /* Advance to the next CPU. Upon overflow, don't idle any CPUs. */ + shuffle_idle_cpu = cpumask_next(shuffle_idle_cpu, shuffle_tmp_mask); + if (shuffle_idle_cpu >= nr_cpu_ids) + shuffle_idle_cpu = -1; + else + cpumask_clear_cpu(shuffle_idle_cpu, shuffle_tmp_mask); + + mutex_lock(&shuffle_task_mutex); + list_for_each_entry(stp, &shuffle_task_list, st_l) + set_cpus_allowed_ptr(stp->st_t, shuffle_tmp_mask); + mutex_unlock(&shuffle_task_mutex); + + put_online_cpus(); +} + +/* Shuffle tasks across CPUs, with the intent of allowing each CPU in the + * system to become idle at a time and cut off its timer ticks. This is meant + * to test the support for such tickless idle CPU in RCU. + */ +static int torture_shuffle(void *arg) +{ + VERBOSE_TOROUT_STRING("torture_shuffle task started"); + do { + schedule_timeout_interruptible(shuffle_interval); + torture_shuffle_tasks(); + torture_shutdown_absorb("torture_shuffle"); + } while (!torture_must_stop()); + torture_kthread_stopping("torture_shuffle"); + return 0; +} + +/* + * Start the shuffler, with shuffint in jiffies. + */ +int torture_shuffle_init(long shuffint) +{ + shuffle_interval = shuffint; + + shuffle_idle_cpu = -1; + + if (!alloc_cpumask_var(&shuffle_tmp_mask, GFP_KERNEL)) { + VERBOSE_TOROUT_ERRSTRING("Failed to alloc mask"); + return -ENOMEM; + } + + /* Create the shuffler thread */ + return torture_create_kthread(torture_shuffle, NULL, shuffler_task); +} +EXPORT_SYMBOL_GPL(torture_shuffle_init); + +/* + * Stop the shuffling. + */ +static void torture_shuffle_cleanup(void) +{ + torture_shuffle_task_unregister_all(); + if (shuffler_task) { + VERBOSE_TOROUT_STRING("Stopping torture_shuffle task"); + kthread_stop(shuffler_task); + free_cpumask_var(shuffle_tmp_mask); + } + shuffler_task = NULL; +} +EXPORT_SYMBOL_GPL(torture_shuffle_cleanup); + +/* + * Variables for auto-shutdown. This allows "lights out" torture runs + * to be fully scripted. + */ +static int shutdown_secs; /* desired test duration in seconds. */ +static struct task_struct *shutdown_task; +static unsigned long shutdown_time; /* jiffies to system shutdown. */ +static void (*torture_shutdown_hook)(void); + +/* + * Absorb kthreads into a kernel function that won't return, so that + * they won't ever access module text or data again. + */ +void torture_shutdown_absorb(const char *title) +{ + while (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { + pr_notice("torture thread %s parking due to system shutdown\n", + title); + schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT); + } +} +EXPORT_SYMBOL_GPL(torture_shutdown_absorb); + +/* + * Cause the torture test to shutdown the system after the test has + * run for the time specified by the shutdown_secs parameter. + */ +static int torture_shutdown(void *arg) +{ + long delta; + unsigned long jiffies_snap; + + VERBOSE_TOROUT_STRING("torture_shutdown task started"); + jiffies_snap = jiffies; + while (ULONG_CMP_LT(jiffies_snap, shutdown_time) && + !torture_must_stop()) { + delta = shutdown_time - jiffies_snap; + if (verbose) + pr_alert("%s" TORTURE_FLAG + "torture_shutdown task: %lu jiffies remaining\n", + torture_type, delta); + schedule_timeout_interruptible(delta); + jiffies_snap = jiffies; + } + if (torture_must_stop()) { + torture_kthread_stopping("torture_shutdown"); + return 0; + } + + /* OK, shut down the system. */ + + VERBOSE_TOROUT_STRING("torture_shutdown task shutting down system"); + shutdown_task = NULL; /* Avoid self-kill deadlock. */ + if (torture_shutdown_hook) + torture_shutdown_hook(); + else + VERBOSE_TOROUT_STRING("No torture_shutdown_hook(), skipping."); + kernel_power_off(); /* Shut down the system. */ + return 0; +} + +/* + * Start up the shutdown task. + */ +int torture_shutdown_init(int ssecs, void (*cleanup)(void)) +{ + int ret = 0; + + shutdown_secs = ssecs; + torture_shutdown_hook = cleanup; + if (shutdown_secs > 0) { + shutdown_time = jiffies + shutdown_secs * HZ; + ret = torture_create_kthread(torture_shutdown, NULL, + shutdown_task); + } + return ret; +} +EXPORT_SYMBOL_GPL(torture_shutdown_init); + +/* + * Detect and respond to a system shutdown. + */ +static int torture_shutdown_notify(struct notifier_block *unused1, + unsigned long unused2, void *unused3) +{ + mutex_lock(&fullstop_mutex); + if (ACCESS_ONCE(fullstop) == FULLSTOP_DONTSTOP) { + VERBOSE_TOROUT_STRING("Unscheduled system shutdown detected"); + ACCESS_ONCE(fullstop) = FULLSTOP_SHUTDOWN; + } else { + pr_warn("Concurrent rmmod and shutdown illegal!\n"); + } + mutex_unlock(&fullstop_mutex); + return NOTIFY_DONE; +} + +static struct notifier_block torture_shutdown_nb = { + .notifier_call = torture_shutdown_notify, +}; + +/* + * Shut down the shutdown task. Say what??? Heh! This can happen if + * the torture module gets an rmmod before the shutdown time arrives. ;-) + */ +static void torture_shutdown_cleanup(void) +{ + unregister_reboot_notifier(&torture_shutdown_nb); + if (shutdown_task != NULL) { + VERBOSE_TOROUT_STRING("Stopping torture_shutdown task"); + kthread_stop(shutdown_task); + } + shutdown_task = NULL; +} + +/* + * Variables for stuttering, which means to periodically pause and + * restart testing in order to catch bugs that appear when load is + * suddenly applied to or removed from the system. + */ +static struct task_struct *stutter_task; +static int stutter_pause_test; +static int stutter; + +/* + * Block until the stutter interval ends. This must be called periodically + * by all running kthreads that need to be subject to stuttering. + */ +void stutter_wait(const char *title) +{ + while (ACCESS_ONCE(stutter_pause_test) || + (torture_runnable && !ACCESS_ONCE(*torture_runnable))) { + if (stutter_pause_test) + if (ACCESS_ONCE(stutter_pause_test) == 1) + schedule_timeout_interruptible(1); + else + while (ACCESS_ONCE(stutter_pause_test)) + cond_resched(); + else + schedule_timeout_interruptible(round_jiffies_relative(HZ)); + torture_shutdown_absorb(title); + } +} +EXPORT_SYMBOL_GPL(stutter_wait); + +/* + * Cause the torture test to "stutter", starting and stopping all + * threads periodically. + */ +static int torture_stutter(void *arg) +{ + VERBOSE_TOROUT_STRING("torture_stutter task started"); + do { + if (!torture_must_stop()) { + if (stutter > 1) { + schedule_timeout_interruptible(stutter - 1); + ACCESS_ONCE(stutter_pause_test) = 2; + } + schedule_timeout_interruptible(1); + ACCESS_ONCE(stutter_pause_test) = 1; + } + if (!torture_must_stop()) + schedule_timeout_interruptible(stutter); + ACCESS_ONCE(stutter_pause_test) = 0; + torture_shutdown_absorb("torture_stutter"); + } while (!torture_must_stop()); + torture_kthread_stopping("torture_stutter"); + return 0; +} + +/* + * Initialize and kick off the torture_stutter kthread. + */ +int torture_stutter_init(int s) +{ + int ret; + + stutter = s; + ret = torture_create_kthread(torture_stutter, NULL, stutter_task); + return ret; +} +EXPORT_SYMBOL_GPL(torture_stutter_init); + +/* + * Cleanup after the torture_stutter kthread. + */ +static void torture_stutter_cleanup(void) +{ + if (!stutter_task) + return; + VERBOSE_TOROUT_STRING("Stopping torture_stutter task"); + kthread_stop(stutter_task); + stutter_task = NULL; +} + +/* + * Initialize torture module. Please note that this is -not- invoked via + * the usual module_init() mechanism, but rather by an explicit call from + * the client torture module. This call must be paired with a later + * torture_init_end(). + * + * The runnable parameter points to a flag that controls whether or not + * the test is currently runnable. If there is no such flag, pass in NULL. + */ +bool torture_init_begin(char *ttype, bool v, int *runnable) +{ + mutex_lock(&fullstop_mutex); + if (torture_type != NULL) { + pr_alert("torture_init_begin: refusing %s init: %s running", + ttype, torture_type); + mutex_unlock(&fullstop_mutex); + return false; + } + torture_type = ttype; + verbose = v; + torture_runnable = runnable; + fullstop = FULLSTOP_DONTSTOP; + return true; +} +EXPORT_SYMBOL_GPL(torture_init_begin); + +/* + * Tell the torture module that initialization is complete. + */ +void torture_init_end(void) +{ + mutex_unlock(&fullstop_mutex); + register_reboot_notifier(&torture_shutdown_nb); +} +EXPORT_SYMBOL_GPL(torture_init_end); + +/* + * Clean up torture module. Please note that this is -not- invoked via + * the usual module_exit() mechanism, but rather by an explicit call from + * the client torture module. Returns true if a race with system shutdown + * is detected, otherwise, all kthreads started by functions in this file + * will be shut down. + * + * This must be called before the caller starts shutting down its own + * kthreads. + */ +bool torture_cleanup(void) +{ + mutex_lock(&fullstop_mutex); + if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { + pr_warn("Concurrent rmmod and shutdown illegal!\n"); + mutex_unlock(&fullstop_mutex); + schedule_timeout_uninterruptible(10); + return true; + } + ACCESS_ONCE(fullstop) = FULLSTOP_RMMOD; + mutex_unlock(&fullstop_mutex); + torture_shutdown_cleanup(); + torture_shuffle_cleanup(); + torture_stutter_cleanup(); + torture_onoff_cleanup(); + mutex_lock(&fullstop_mutex); + torture_type = NULL; + mutex_unlock(&fullstop_mutex); + return false; +} +EXPORT_SYMBOL_GPL(torture_cleanup); + +/* + * Is it time for the current torture test to stop? + */ +bool torture_must_stop(void) +{ + return torture_must_stop_irq() || kthread_should_stop(); +} +EXPORT_SYMBOL_GPL(torture_must_stop); + +/* + * Is it time for the current torture test to stop? This is the irq-safe + * version, hence no check for kthread_should_stop(). + */ +bool torture_must_stop_irq(void) +{ + return ACCESS_ONCE(fullstop) != FULLSTOP_DONTSTOP; +} +EXPORT_SYMBOL_GPL(torture_must_stop_irq); + +/* + * Each kthread must wait for kthread_should_stop() before returning from + * its top-level function, otherwise segfaults ensue. This function + * prints a "stopping" message and waits for kthread_should_stop(), and + * should be called from all torture kthreads immediately prior to + * returning. + */ +void torture_kthread_stopping(char *title) +{ + char buf[128]; + + snprintf(buf, sizeof(buf), "Stopping %s", title); + VERBOSE_TOROUT_STRING(buf); + while (!kthread_should_stop()) { + torture_shutdown_absorb(title); + schedule_timeout_uninterruptible(1); + } +} +EXPORT_SYMBOL_GPL(torture_kthread_stopping); + +/* + * Create a generic torture kthread that is immediately runnable. If you + * need the kthread to be stopped so that you can do something to it before + * it starts, you will need to open-code your own. + */ +int _torture_create_kthread(int (*fn)(void *arg), void *arg, char *s, char *m, + char *f, struct task_struct **tp) +{ + int ret = 0; + + VERBOSE_TOROUT_STRING(m); + *tp = kthread_run(fn, arg, s); + if (IS_ERR(*tp)) { + ret = PTR_ERR(*tp); + VERBOSE_TOROUT_ERRSTRING(f); + *tp = NULL; + } + torture_shuffle_task_register(*tp); + return ret; +} +EXPORT_SYMBOL_GPL(_torture_create_kthread); + +/* + * Stop a generic kthread, emitting a message. + */ +void _torture_stop_kthread(char *m, struct task_struct **tp) +{ + if (*tp == NULL) + return; + VERBOSE_TOROUT_STRING(m); + kthread_stop(*tp); + *tp = NULL; +} +EXPORT_SYMBOL_GPL(_torture_stop_kthread); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 015f85aaca0..d4409356f40 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -424,6 +424,7 @@ config UPROBE_EVENT bool "Enable uprobes-based dynamic events" depends on ARCH_SUPPORTS_UPROBES depends on MMU + depends on PERF_EVENTS select UPROBES select PROBE_EVENTS select TRACING @@ -534,6 +535,36 @@ config MMIOTRACE_TEST Say N, unless you absolutely know what you are doing. +config TRACEPOINT_BENCHMARK + bool "Add tracepoint that benchmarks tracepoints" + help + This option creates the tracepoint "benchmark:benchmark_event". + When the tracepoint is enabled, it kicks off a kernel thread that + goes into an infinite loop (calling cond_sched() to let other tasks + run), and calls the tracepoint. Each iteration will record the time + it took to write to the tracepoint and the next iteration that + data will be passed to the tracepoint itself. That is, the tracepoint + will report the time it took to do the previous tracepoint. + The string written to the tracepoint is a static string of 128 bytes + to keep the time the same. The initial string is simply a write of + "START". The second string records the cold cache time of the first + write which is not added to the rest of the calculations. + + As it is a tight loop, it benchmarks as hot cache. That's fine because + we care most about hot paths that are probably in cache already. + + An example of the output: + + START + first=3672 [COLD CACHED] + last=632 first=3672 max=632 min=632 avg=316 std=446 std^2=199712 + last=278 first=3672 max=632 min=278 avg=303 std=316 std^2=100337 + last=277 first=3672 max=632 min=277 avg=296 std=258 std^2=67064 + last=273 first=3672 max=632 min=273 avg=292 std=224 std^2=50411 + last=273 first=3672 max=632 min=273 avg=288 std=200 std^2=40389 + last=281 first=3672 max=632 min=273 avg=287 std=183 std^2=33666 + + config RING_BUFFER_BENCHMARK tristate "Ring buffer benchmark stress tester" depends on RING_BUFFER diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 1378e84fbe3..2611613f14f 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -17,6 +17,7 @@ ifdef CONFIG_TRACING_BRANCHES KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING endif +CFLAGS_trace_benchmark.o := -I$(src) CFLAGS_trace_events_filter.o := -I$(src) obj-$(CONFIG_TRACE_CLOCK) += trace_clock.o @@ -62,4 +63,6 @@ endif obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o obj-$(CONFIG_UPROBE_EVENT) += trace_uprobe.o +obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o + libftrace-y := ftrace.o diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index b418cb0d724..c1bd4ada2a0 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -702,6 +702,7 @@ void blk_trace_shutdown(struct request_queue *q) * blk_add_trace_rq - Add a trace for a request oriented action * @q: queue the io is for * @rq: the source request + * @nr_bytes: number of completed bytes * @what: the action * * Description: @@ -709,7 +710,7 @@ void blk_trace_shutdown(struct request_queue *q) * **/ static void blk_add_trace_rq(struct request_queue *q, struct request *rq, - u32 what) + unsigned int nr_bytes, u32 what) { struct blk_trace *bt = q->blk_trace; @@ -718,11 +719,11 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq, if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { what |= BLK_TC_ACT(BLK_TC_PC); - __blk_add_trace(bt, 0, blk_rq_bytes(rq), rq->cmd_flags, + __blk_add_trace(bt, 0, nr_bytes, rq->cmd_flags, what, rq->errors, rq->cmd_len, rq->cmd); } else { what |= BLK_TC_ACT(BLK_TC_FS); - __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), + __blk_add_trace(bt, blk_rq_pos(rq), nr_bytes, rq->cmd_flags, what, rq->errors, 0, NULL); } } @@ -730,33 +731,34 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq, static void blk_add_trace_rq_abort(void *ignore, struct request_queue *q, struct request *rq) { - blk_add_trace_rq(q, rq, BLK_TA_ABORT); + blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_ABORT); } static void blk_add_trace_rq_insert(void *ignore, struct request_queue *q, struct request *rq) { - blk_add_trace_rq(q, rq, BLK_TA_INSERT); + blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_INSERT); } static void blk_add_trace_rq_issue(void *ignore, struct request_queue *q, struct request *rq) { - blk_add_trace_rq(q, rq, BLK_TA_ISSUE); + blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_ISSUE); } static void blk_add_trace_rq_requeue(void *ignore, struct request_queue *q, struct request *rq) { - blk_add_trace_rq(q, rq, BLK_TA_REQUEUE); + blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_REQUEUE); } static void blk_add_trace_rq_complete(void *ignore, struct request_queue *q, - struct request *rq) + struct request *rq, + unsigned int nr_bytes) { - blk_add_trace_rq(q, rq, BLK_TA_COMPLETE); + blk_add_trace_rq(q, rq, nr_bytes, BLK_TA_COMPLETE); } /** @@ -1427,7 +1429,8 @@ static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter) return print_one_line(iter, true); } -static int blk_tracer_set_flag(u32 old_flags, u32 bit, int set) +static int +blk_tracer_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) { /* don't output context-info for blk_classic output */ if (bit == TRACE_BLK_OPT_CLASSIC) { diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index cd7f76d1eb8..ac9d1dad630 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -62,7 +62,7 @@ #define FTRACE_HASH_DEFAULT_BITS 10 #define FTRACE_HASH_MAX_BITS 12 -#define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_CONTROL) +#define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_CONTROL) #ifdef CONFIG_DYNAMIC_FTRACE #define INIT_REGEX_LOCK(opsname) \ @@ -103,7 +103,6 @@ static int ftrace_disabled __read_mostly; static DEFINE_MUTEX(ftrace_lock); -static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end; static struct ftrace_ops *ftrace_control_list __read_mostly = &ftrace_list_end; static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; @@ -171,23 +170,6 @@ int ftrace_nr_registered_ops(void) return cnt; } -static void -ftrace_global_list_func(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *regs) -{ - int bit; - - bit = trace_test_and_set_recursion(TRACE_GLOBAL_START, TRACE_GLOBAL_MAX); - if (bit < 0) - return; - - do_for_each_ftrace_op(op, ftrace_global_list) { - op->func(ip, parent_ip, op, regs); - } while_for_each_ftrace_op(op); - - trace_clear_recursion(bit); -} - static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *regs) { @@ -237,49 +219,6 @@ static int control_ops_alloc(struct ftrace_ops *ops) return 0; } -static void control_ops_free(struct ftrace_ops *ops) -{ - free_percpu(ops->disabled); -} - -static void update_global_ops(void) -{ - ftrace_func_t func; - - /* - * If there's only one function registered, then call that - * function directly. Otherwise, we need to iterate over the - * registered callers. - */ - if (ftrace_global_list == &ftrace_list_end || - ftrace_global_list->next == &ftrace_list_end) { - func = ftrace_global_list->func; - /* - * As we are calling the function directly. - * If it does not have recursion protection, - * the function_trace_op needs to be updated - * accordingly. - */ - if (ftrace_global_list->flags & FTRACE_OPS_FL_RECURSION_SAFE) - global_ops.flags |= FTRACE_OPS_FL_RECURSION_SAFE; - else - global_ops.flags &= ~FTRACE_OPS_FL_RECURSION_SAFE; - } else { - func = ftrace_global_list_func; - /* The list has its own recursion protection. */ - global_ops.flags |= FTRACE_OPS_FL_RECURSION_SAFE; - } - - - /* If we filter on pids, update to use the pid function */ - if (!list_empty(&ftrace_pids)) { - set_ftrace_pid_function(func); - func = ftrace_pid_func; - } - - global_ops.func = func; -} - static void ftrace_sync(struct work_struct *work) { /* @@ -307,8 +246,6 @@ static void update_ftrace_function(void) { ftrace_func_t func; - update_global_ops(); - /* * If we are at the end of the list and this ops is * recursion safe and not dynamic and the arch supports passing ops, @@ -320,10 +257,7 @@ static void update_ftrace_function(void) (ftrace_ops_list->flags & FTRACE_OPS_FL_RECURSION_SAFE) && !FTRACE_FORCE_LIST_FUNC)) { /* Set the ftrace_ops that the arch callback uses */ - if (ftrace_ops_list == &global_ops) - set_function_trace_op = ftrace_global_list; - else - set_function_trace_op = ftrace_ops_list; + set_function_trace_op = ftrace_ops_list; func = ftrace_ops_list->func; } else { /* Just use the default ftrace_ops */ @@ -331,12 +265,12 @@ static void update_ftrace_function(void) func = ftrace_ops_list_func; } + update_function_graph_func(); + /* If there's no change, then do nothing more here */ if (ftrace_trace_function == func) return; - update_function_graph_func(); - /* * If we are using the list function, it doesn't care * about the function_trace_ops. @@ -379,6 +313,11 @@ static void update_ftrace_function(void) ftrace_trace_function = func; } +int using_ftrace_ops_list_func(void) +{ + return ftrace_trace_function == ftrace_ops_list_func; +} + static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) { ops->next = *list; @@ -437,16 +376,12 @@ static int remove_ftrace_list_ops(struct ftrace_ops **list, static int __register_ftrace_function(struct ftrace_ops *ops) { - if (FTRACE_WARN_ON(ops == &global_ops)) + if (ops->flags & FTRACE_OPS_FL_DELETED) return -EINVAL; if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED)) return -EBUSY; - /* We don't support both control and global flags set. */ - if ((ops->flags & FL_GLOBAL_CONTROL_MASK) == FL_GLOBAL_CONTROL_MASK) - return -EINVAL; - #ifndef CONFIG_DYNAMIC_FTRACE_WITH_REGS /* * If the ftrace_ops specifies SAVE_REGS, then it only can be used @@ -464,10 +399,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops) if (!core_kernel_data((unsigned long)ops)) ops->flags |= FTRACE_OPS_FL_DYNAMIC; - if (ops->flags & FTRACE_OPS_FL_GLOBAL) { - add_ftrace_list_ops(&ftrace_global_list, &global_ops, ops); - ops->flags |= FTRACE_OPS_FL_ENABLED; - } else if (ops->flags & FTRACE_OPS_FL_CONTROL) { + if (ops->flags & FTRACE_OPS_FL_CONTROL) { if (control_ops_alloc(ops)) return -ENOMEM; add_ftrace_list_ops(&ftrace_control_list, &control_ops, ops); @@ -487,15 +419,7 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED))) return -EBUSY; - if (FTRACE_WARN_ON(ops == &global_ops)) - return -EINVAL; - - if (ops->flags & FTRACE_OPS_FL_GLOBAL) { - ret = remove_ftrace_list_ops(&ftrace_global_list, - &global_ops, ops); - if (!ret) - ops->flags &= ~FTRACE_OPS_FL_ENABLED; - } else if (ops->flags & FTRACE_OPS_FL_CONTROL) { + if (ops->flags & FTRACE_OPS_FL_CONTROL) { ret = remove_ftrace_list_ops(&ftrace_control_list, &control_ops, ops); } else @@ -898,7 +822,7 @@ function_profile_call(unsigned long ip, unsigned long parent_ip, local_irq_save(flags); - stat = &__get_cpu_var(ftrace_profile_stats); + stat = this_cpu_ptr(&ftrace_profile_stats); if (!stat->hash || !ftrace_profile_enabled) goto out; @@ -929,7 +853,7 @@ static void profile_graph_return(struct ftrace_graph_ret *trace) unsigned long flags; local_irq_save(flags); - stat = &__get_cpu_var(ftrace_profile_stats); + stat = this_cpu_ptr(&ftrace_profile_stats); if (!stat->hash || !ftrace_profile_enabled) goto out; @@ -1172,8 +1096,6 @@ struct ftrace_page { int size; }; -static struct ftrace_page *ftrace_new_pgs; - #define ENTRY_SIZE sizeof(struct dyn_ftrace) #define ENTRIES_PER_PAGE (PAGE_SIZE / ENTRY_SIZE) @@ -1183,7 +1105,7 @@ static struct ftrace_page *ftrace_new_pgs; static struct ftrace_page *ftrace_pages_start; static struct ftrace_page *ftrace_pages; -static bool ftrace_hash_empty(struct ftrace_hash *hash) +static bool __always_inline ftrace_hash_empty(struct ftrace_hash *hash) { return !hash || !hash->count; } @@ -1560,7 +1482,7 @@ unsigned long ftrace_location(unsigned long ip) * the function tracer. It checks the ftrace internal tables to * determine if the address belongs or not. */ -int ftrace_text_reserved(void *start, void *end) +int ftrace_text_reserved(const void *start, const void *end) { unsigned long ret; @@ -1630,7 +1552,14 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, in_other_hash = !!ftrace_lookup_ip(other_hash, rec->ip); /* + * If filter_hash is set, we want to match all functions + * that are in the hash but not in the other hash. * + * If filter_hash is not set, then we are decrementing. + * That means we match anything that is in the hash + * and also in the other_hash. That is, we need to turn + * off functions in the other hash because they are disabled + * by this hash. */ if (filter_hash && in_hash && !in_other_hash) match = 1; @@ -1772,19 +1701,15 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) /* * If this record is being updated from a nop, then * return UPDATE_MAKE_CALL. - * Otherwise, if the EN flag is set, then return - * UPDATE_MODIFY_CALL_REGS to tell the caller to convert - * from the non-save regs, to a save regs function. * Otherwise, * return UPDATE_MODIFY_CALL to tell the caller to convert - * from the save regs, to a non-save regs function. + * from the save regs, to a non-save regs function or + * vice versa. */ if (flag & FTRACE_FL_ENABLED) return FTRACE_UPDATE_MAKE_CALL; - else if (rec->flags & FTRACE_FL_REGS_EN) - return FTRACE_UPDATE_MODIFY_CALL_REGS; - else - return FTRACE_UPDATE_MODIFY_CALL; + + return FTRACE_UPDATE_MODIFY_CALL; } if (update) { @@ -1826,6 +1751,42 @@ int ftrace_test_record(struct dyn_ftrace *rec, int enable) return ftrace_check_record(rec, enable, 0); } +/** + * ftrace_get_addr_new - Get the call address to set to + * @rec: The ftrace record descriptor + * + * If the record has the FTRACE_FL_REGS set, that means that it + * wants to convert to a callback that saves all regs. If FTRACE_FL_REGS + * is not not set, then it wants to convert to the normal callback. + * + * Returns the address of the trampoline to set to + */ +unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec) +{ + if (rec->flags & FTRACE_FL_REGS) + return (unsigned long)FTRACE_REGS_ADDR; + else + return (unsigned long)FTRACE_ADDR; +} + +/** + * ftrace_get_addr_curr - Get the call address that is already there + * @rec: The ftrace record descriptor + * + * The FTRACE_FL_REGS_EN is set when the record already points to + * a function that saves all the regs. Basically the '_EN' version + * represents the current state of the function. + * + * Returns the address of the trampoline that is currently being called + */ +unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec) +{ + if (rec->flags & FTRACE_FL_REGS_EN) + return (unsigned long)FTRACE_REGS_ADDR; + else + return (unsigned long)FTRACE_ADDR; +} + static int __ftrace_replace_code(struct dyn_ftrace *rec, int enable) { @@ -1833,12 +1794,12 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) unsigned long ftrace_addr; int ret; - ret = ftrace_update_record(rec, enable); + ftrace_addr = ftrace_get_addr_new(rec); - if (rec->flags & FTRACE_FL_REGS) - ftrace_addr = (unsigned long)FTRACE_REGS_ADDR; - else - ftrace_addr = (unsigned long)FTRACE_ADDR; + /* This needs to be done before we call ftrace_update_record */ + ftrace_old_addr = ftrace_get_addr_curr(rec); + + ret = ftrace_update_record(rec, enable); switch (ret) { case FTRACE_UPDATE_IGNORE: @@ -1850,13 +1811,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) case FTRACE_UPDATE_MAKE_NOP: return ftrace_make_nop(NULL, rec, ftrace_addr); - case FTRACE_UPDATE_MODIFY_CALL_REGS: case FTRACE_UPDATE_MODIFY_CALL: - if (rec->flags & FTRACE_FL_REGS) - ftrace_old_addr = (unsigned long)FTRACE_ADDR; - else - ftrace_old_addr = (unsigned long)FTRACE_REGS_ADDR; - return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr); } @@ -1994,6 +1949,7 @@ int __weak ftrace_arch_code_modify_post_process(void) void ftrace_modify_all_code(int command) { int update = command & FTRACE_UPDATE_TRACE_FUNC; + int err = 0; /* * If the ftrace_caller calls a ftrace_ops func directly, @@ -2005,8 +1961,11 @@ void ftrace_modify_all_code(int command) * to make sure the ops are having the right functions * traced. */ - if (update) - ftrace_update_ftrace_func(ftrace_ops_list_func); + if (update) { + err = ftrace_update_ftrace_func(ftrace_ops_list_func); + if (FTRACE_WARN_ON(err)) + return; + } if (command & FTRACE_UPDATE_CALLS) ftrace_replace_code(1); @@ -2019,13 +1978,16 @@ void ftrace_modify_all_code(int command) /* If irqs are disabled, we are in stop machine */ if (!irqs_disabled()) smp_call_function(ftrace_sync_ipi, NULL, 1); - ftrace_update_ftrace_func(ftrace_trace_function); + err = ftrace_update_ftrace_func(ftrace_trace_function); + if (FTRACE_WARN_ON(err)) + return; } if (command & FTRACE_START_FUNC_RET) - ftrace_enable_ftrace_graph_caller(); + err = ftrace_enable_ftrace_graph_caller(); else if (command & FTRACE_STOP_FUNC_RET) - ftrace_disable_ftrace_graph_caller(); + err = ftrace_disable_ftrace_graph_caller(); + FTRACE_WARN_ON(err); } static int __ftrace_modify_code(void *data) @@ -2093,6 +2055,11 @@ static ftrace_func_t saved_ftrace_func; static int ftrace_start_up; static int global_start_up; +static void control_ops_free(struct ftrace_ops *ops) +{ + free_percpu(ops->disabled); +} + static void ftrace_startup_enable(int command) { if (saved_ftrace_func != ftrace_trace_function) { @@ -2108,7 +2075,6 @@ static void ftrace_startup_enable(int command) static int ftrace_startup(struct ftrace_ops *ops, int command) { - bool hash_enable = true; int ret; if (unlikely(ftrace_disabled)) @@ -2121,18 +2087,9 @@ static int ftrace_startup(struct ftrace_ops *ops, int command) ftrace_start_up++; command |= FTRACE_UPDATE_CALLS; - /* ops marked global share the filter hashes */ - if (ops->flags & FTRACE_OPS_FL_GLOBAL) { - ops = &global_ops; - /* Don't update hash if global is already set */ - if (global_start_up) - hash_enable = false; - global_start_up++; - } - ops->flags |= FTRACE_OPS_FL_ENABLED; - if (hash_enable) - ftrace_hash_rec_enable(ops, 1); + + ftrace_hash_rec_enable(ops, 1); ftrace_startup_enable(command); @@ -2141,7 +2098,6 @@ static int ftrace_startup(struct ftrace_ops *ops, int command) static int ftrace_shutdown(struct ftrace_ops *ops, int command) { - bool hash_disable = true; int ret; if (unlikely(ftrace_disabled)) @@ -2159,21 +2115,9 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) */ WARN_ON_ONCE(ftrace_start_up < 0); - if (ops->flags & FTRACE_OPS_FL_GLOBAL) { - ops = &global_ops; - global_start_up--; - WARN_ON_ONCE(global_start_up < 0); - /* Don't update hash if global still has users */ - if (global_start_up) { - WARN_ON_ONCE(!ftrace_start_up); - hash_disable = false; - } - } - - if (hash_disable) - ftrace_hash_rec_disable(ops, 1); + ftrace_hash_rec_disable(ops, 1); - if (ops != &global_ops || !global_start_up) + if (!global_start_up) ops->flags &= ~FTRACE_OPS_FL_ENABLED; command |= FTRACE_UPDATE_CALLS; @@ -2244,7 +2188,6 @@ static void ftrace_shutdown_sysctl(void) } static cycle_t ftrace_update_time; -static unsigned long ftrace_update_cnt; unsigned long ftrace_update_tot_cnt; static inline int ops_traces_mod(struct ftrace_ops *ops) @@ -2300,11 +2243,12 @@ static int referenced_filters(struct dyn_ftrace *rec) return cnt; } -static int ftrace_update_code(struct module *mod) +static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs) { struct ftrace_page *pg; struct dyn_ftrace *p; cycle_t start, stop; + unsigned long update_cnt = 0; unsigned long ref = 0; bool test = false; int i; @@ -2330,9 +2274,8 @@ static int ftrace_update_code(struct module *mod) } start = ftrace_now(raw_smp_processor_id()); - ftrace_update_cnt = 0; - for (pg = ftrace_new_pgs; pg; pg = pg->next) { + for (pg = new_pgs; pg; pg = pg->next) { for (i = 0; i < pg->index; i++) { int cnt = ref; @@ -2353,7 +2296,7 @@ static int ftrace_update_code(struct module *mod) if (!ftrace_code_disable(mod, p)) break; - ftrace_update_cnt++; + update_cnt++; /* * If the tracing is enabled, go ahead and enable the record. @@ -2372,11 +2315,9 @@ static int ftrace_update_code(struct module *mod) } } - ftrace_new_pgs = NULL; - stop = ftrace_now(raw_smp_processor_id()); ftrace_update_time = stop - start; - ftrace_update_tot_cnt += ftrace_update_cnt; + ftrace_update_tot_cnt += update_cnt; return 0; } @@ -2468,22 +2409,6 @@ ftrace_allocate_pages(unsigned long num_to_init) return NULL; } -static int __init ftrace_dyn_table_alloc(unsigned long num_to_init) -{ - int cnt; - - if (!num_to_init) { - pr_info("ftrace: No functions to be traced?\n"); - return -1; - } - - cnt = num_to_init / ENTRIES_PER_PAGE; - pr_info("ftrace: allocating %ld entries in %d pages\n", - num_to_init, cnt + 1); - - return 0; -} - #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ struct ftrace_iterator { @@ -2871,7 +2796,9 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, static int ftrace_filter_open(struct inode *inode, struct file *file) { - return ftrace_regex_open(&global_ops, + struct ftrace_ops *ops = inode->i_private; + + return ftrace_regex_open(ops, FTRACE_ITER_FILTER | FTRACE_ITER_DO_HASH, inode, file); } @@ -2879,7 +2806,9 @@ ftrace_filter_open(struct inode *inode, struct file *file) static int ftrace_notrace_open(struct inode *inode, struct file *file) { - return ftrace_regex_open(&global_ops, FTRACE_ITER_NOTRACE, + struct ftrace_ops *ops = inode->i_private; + + return ftrace_regex_open(ops, FTRACE_ITER_NOTRACE, inode, file); } @@ -3532,10 +3461,6 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, struct ftrace_hash *hash; int ret; - /* All global ops uses the global ops filters */ - if (ops->flags & FTRACE_OPS_FL_GLOBAL) - ops = &global_ops; - if (unlikely(ftrace_disabled)) return -ENODEV; @@ -3647,8 +3572,7 @@ int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf, } EXPORT_SYMBOL_GPL(ftrace_set_notrace); /** - * ftrace_set_filter - set a function to filter on in ftrace - * @ops - the ops to set the filter with + * ftrace_set_global_filter - set a function to filter on with global tracers * @buf - the string that holds the function filter text. * @len - the length of the string. * @reset - non zero to reset all filters before applying this filter. @@ -3663,8 +3587,7 @@ void ftrace_set_global_filter(unsigned char *buf, int len, int reset) EXPORT_SYMBOL_GPL(ftrace_set_global_filter); /** - * ftrace_set_notrace - set a function to not trace in ftrace - * @ops - the ops to set the notrace filter with + * ftrace_set_global_notrace - set a function to not trace with global tracers * @buf - the string that holds the function notrace text. * @len - the length of the string. * @reset - non zero to reset all filters before applying this filter. @@ -4109,6 +4032,36 @@ static const struct file_operations ftrace_graph_notrace_fops = { }; #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ +void ftrace_create_filter_files(struct ftrace_ops *ops, + struct dentry *parent) +{ + + trace_create_file("set_ftrace_filter", 0644, parent, + ops, &ftrace_filter_fops); + + trace_create_file("set_ftrace_notrace", 0644, parent, + ops, &ftrace_notrace_fops); +} + +/* + * The name "destroy_filter_files" is really a misnomer. Although + * in the future, it may actualy delete the files, but this is + * really intended to make sure the ops passed in are disabled + * and that when this function returns, the caller is free to + * free the ops. + * + * The "destroy" name is only to match the "create" name that this + * should be paired with. + */ +void ftrace_destroy_filter_files(struct ftrace_ops *ops) +{ + mutex_lock(&ftrace_lock); + if (ops->flags & FTRACE_OPS_FL_ENABLED) + ftrace_shutdown(ops, 0); + ops->flags |= FTRACE_OPS_FL_DELETED; + mutex_unlock(&ftrace_lock); +} + static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { @@ -4118,11 +4071,7 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) trace_create_file("enabled_functions", 0444, d_tracer, NULL, &ftrace_enabled_fops); - trace_create_file("set_ftrace_filter", 0644, d_tracer, - NULL, &ftrace_filter_fops); - - trace_create_file("set_ftrace_notrace", 0644, d_tracer, - NULL, &ftrace_notrace_fops); + ftrace_create_filter_files(&global_ops, d_tracer); #ifdef CONFIG_FUNCTION_GRAPH_TRACER trace_create_file("set_graph_function", 0444, d_tracer, @@ -4238,9 +4187,6 @@ static int ftrace_process_locs(struct module *mod, /* Assign the last page to ftrace_pages */ ftrace_pages = pg; - /* These new locations need to be initialized */ - ftrace_new_pgs = start_pg; - /* * We only need to disable interrupts on start up * because we are modifying code that an interrupt @@ -4251,7 +4197,7 @@ static int ftrace_process_locs(struct module *mod, */ if (!mod) local_irq_save(flags); - ftrace_update_code(mod); + ftrace_update_code(mod, start_pg); if (!mod) local_irq_restore(flags); ret = 0; @@ -4315,16 +4261,11 @@ static void ftrace_init_module(struct module *mod, ftrace_process_locs(mod, start, end); } -static int ftrace_module_notify_enter(struct notifier_block *self, - unsigned long val, void *data) +void ftrace_module_init(struct module *mod) { - struct module *mod = data; - - if (val == MODULE_STATE_COMING) - ftrace_init_module(mod, mod->ftrace_callsites, - mod->ftrace_callsites + - mod->num_ftrace_callsites); - return 0; + ftrace_init_module(mod, mod->ftrace_callsites, + mod->ftrace_callsites + + mod->num_ftrace_callsites); } static int ftrace_module_notify_exit(struct notifier_block *self, @@ -4338,11 +4279,6 @@ static int ftrace_module_notify_exit(struct notifier_block *self, return 0; } #else -static int ftrace_module_notify_enter(struct notifier_block *self, - unsigned long val, void *data) -{ - return 0; -} static int ftrace_module_notify_exit(struct notifier_block *self, unsigned long val, void *data) { @@ -4350,40 +4286,32 @@ static int ftrace_module_notify_exit(struct notifier_block *self, } #endif /* CONFIG_MODULES */ -struct notifier_block ftrace_module_enter_nb = { - .notifier_call = ftrace_module_notify_enter, - .priority = INT_MAX, /* Run before anything that can use kprobes */ -}; - struct notifier_block ftrace_module_exit_nb = { .notifier_call = ftrace_module_notify_exit, .priority = INT_MIN, /* Run after anything that can remove kprobes */ }; -extern unsigned long __start_mcount_loc[]; -extern unsigned long __stop_mcount_loc[]; - void __init ftrace_init(void) { - unsigned long count, addr, flags; + extern unsigned long __start_mcount_loc[]; + extern unsigned long __stop_mcount_loc[]; + unsigned long count, flags; int ret; - /* Keep the ftrace pointer to the stub */ - addr = (unsigned long)ftrace_stub; - local_irq_save(flags); - ftrace_dyn_arch_init(&addr); + ret = ftrace_dyn_arch_init(); local_irq_restore(flags); - - /* ftrace_dyn_arch_init places the return code in addr */ - if (addr) + if (ret) goto failed; count = __stop_mcount_loc - __start_mcount_loc; - - ret = ftrace_dyn_table_alloc(count); - if (ret) + if (!count) { + pr_info("ftrace: No functions to be traced?\n"); goto failed; + } + + pr_info("ftrace: allocating %ld entries in %ld pages\n", + count, count / ENTRIES_PER_PAGE + 1); last_ftrace_enabled = ftrace_enabled = 1; @@ -4391,10 +4319,6 @@ void __init ftrace_init(void) __start_mcount_loc, __stop_mcount_loc); - ret = register_module_notifier(&ftrace_module_enter_nb); - if (ret) - pr_warning("Failed to register trace ftrace module enter notifier\n"); - ret = register_module_notifier(&ftrace_module_exit_nb); if (ret) pr_warning("Failed to register trace ftrace module exit notifier\n"); @@ -4431,7 +4355,13 @@ static inline void ftrace_startup_enable(int command) { } (ops)->flags |= FTRACE_OPS_FL_ENABLED; \ ___ret; \ }) -# define ftrace_shutdown(ops, command) __unregister_ftrace_function(ops) +# define ftrace_shutdown(ops, command) \ + ({ \ + int ___ret = __unregister_ftrace_function(ops); \ + if (!___ret) \ + (ops)->flags &= ~FTRACE_OPS_FL_ENABLED; \ + ___ret; \ + }) # define ftrace_startup_sysctl() do { } while (0) # define ftrace_shutdown_sysctl() do { } while (0) @@ -4444,6 +4374,34 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs) #endif /* CONFIG_DYNAMIC_FTRACE */ +__init void ftrace_init_global_array_ops(struct trace_array *tr) +{ + tr->ops = &global_ops; + tr->ops->private = tr; +} + +void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func) +{ + /* If we filter on pids, update to use the pid function */ + if (tr->flags & TRACE_ARRAY_FL_GLOBAL) { + if (WARN_ON(tr->ops->func != ftrace_stub)) + printk("ftrace ops had %pS for function\n", + tr->ops->func); + /* Only the top level instance does pid tracing */ + if (!list_empty(&ftrace_pids)) { + set_ftrace_pid_function(func); + func = ftrace_pid_func; + } + } + tr->ops->func = func; + tr->ops->private = tr; +} + +void ftrace_reset_array_ops(struct trace_array *tr) +{ + tr->ops->func = ftrace_stub; +} + static void ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *regs) @@ -4502,9 +4460,16 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, */ preempt_disable_notrace(); do_for_each_ftrace_op(op, ftrace_ops_list) { - if (ftrace_ops_test(op, ip, regs)) + if (ftrace_ops_test(op, ip, regs)) { + if (WARN_ON(!op->func)) { + function_trace_stop = 1; + printk("op=%p %pS\n", op, op); + goto out; + } op->func(ip, parent_ip, op, regs); + } } while_for_each_ftrace_op(op); +out: preempt_enable_notrace(); trace_clear_recursion(bit); } @@ -4909,7 +4874,6 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, #ifdef CONFIG_FUNCTION_GRAPH_TRACER static int ftrace_graph_active; -static struct notifier_block ftrace_suspend_notifier; int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) { @@ -5055,13 +5019,6 @@ ftrace_suspend_notifier_call(struct notifier_block *bl, unsigned long state, return NOTIFY_DONE; } -/* Just a place holder for function graph */ -static struct ftrace_ops fgraph_ops __read_mostly = { - .func = ftrace_stub, - .flags = FTRACE_OPS_FL_STUB | FTRACE_OPS_FL_GLOBAL | - FTRACE_OPS_FL_RECURSION_SAFE, -}; - static int ftrace_graph_entry_test(struct ftrace_graph_ent *trace) { if (!ftrace_ops_test(&global_ops, trace->func, NULL)) @@ -5086,6 +5043,10 @@ static void update_function_graph_func(void) ftrace_graph_entry = ftrace_graph_entry_test; } +static struct notifier_block ftrace_suspend_notifier = { + .notifier_call = ftrace_suspend_notifier_call, +}; + int register_ftrace_graph(trace_func_graph_ret_t retfunc, trace_func_graph_ent_t entryfunc) { @@ -5099,7 +5060,6 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc, goto out; } - ftrace_suspend_notifier.notifier_call = ftrace_suspend_notifier_call; register_pm_notifier(&ftrace_suspend_notifier); ftrace_graph_active++; @@ -5121,7 +5081,10 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc, ftrace_graph_entry = ftrace_graph_entry_test; update_function_graph_func(); - ret = ftrace_startup(&fgraph_ops, FTRACE_START_FUNC_RET); + /* Function graph doesn't use the .func field of global_ops */ + global_ops.flags |= FTRACE_OPS_FL_STUB; + + ret = ftrace_startup(&global_ops, FTRACE_START_FUNC_RET); out: mutex_unlock(&ftrace_lock); @@ -5139,7 +5102,8 @@ void unregister_ftrace_graph(void) ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; ftrace_graph_entry = ftrace_graph_entry_stub; __ftrace_graph_entry = ftrace_graph_entry_stub; - ftrace_shutdown(&fgraph_ops, FTRACE_STOP_FUNC_RET); + ftrace_shutdown(&global_ops, FTRACE_STOP_FUNC_RET); + global_ops.flags &= ~FTRACE_OPS_FL_STUB; unregister_pm_notifier(&ftrace_suspend_notifier); unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 294b8a271a0..ff7027199a9 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -543,7 +543,7 @@ static void rb_wake_up_waiters(struct irq_work *work) * as data is added to any of the @buffer's cpu buffers. Otherwise * it will wait for data to be added to a specific cpu buffer. */ -void ring_buffer_wait(struct ring_buffer *buffer, int cpu) +int ring_buffer_wait(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; DEFINE_WAIT(wait); @@ -557,6 +557,8 @@ void ring_buffer_wait(struct ring_buffer *buffer, int cpu) if (cpu == RING_BUFFER_ALL_CPUS) work = &buffer->irq_work; else { + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return -ENODEV; cpu_buffer = buffer->buffers[cpu]; work = &cpu_buffer->irq_work; } @@ -591,6 +593,7 @@ void ring_buffer_wait(struct ring_buffer *buffer, int cpu) schedule(); finish_wait(&work->waiters, &wait); + return 0; } /** @@ -613,10 +616,6 @@ int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu, struct ring_buffer_per_cpu *cpu_buffer; struct rb_irq_work *work; - if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) || - (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu))) - return POLLIN | POLLRDNORM; - if (cpu == RING_BUFFER_ALL_CPUS) work = &buffer->irq_work; else { @@ -1301,7 +1300,7 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, * In that off case, we need to allocate for all possible cpus. */ #ifdef CONFIG_HOTPLUG_CPU - get_online_cpus(); + cpu_notifier_register_begin(); cpumask_copy(buffer->cpumask, cpu_online_mask); #else cpumask_copy(buffer->cpumask, cpu_possible_mask); @@ -1324,10 +1323,10 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, #ifdef CONFIG_HOTPLUG_CPU buffer->cpu_notify.notifier_call = rb_cpu_notify; buffer->cpu_notify.priority = 0; - register_cpu_notifier(&buffer->cpu_notify); + __register_cpu_notifier(&buffer->cpu_notify); + cpu_notifier_register_done(); #endif - put_online_cpus(); mutex_init(&buffer->mutex); return buffer; @@ -1341,7 +1340,9 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, fail_free_cpumask: free_cpumask_var(buffer->cpumask); - put_online_cpus(); +#ifdef CONFIG_HOTPLUG_CPU + cpu_notifier_register_done(); +#endif fail_free_buffer: kfree(buffer); @@ -1358,16 +1359,17 @@ ring_buffer_free(struct ring_buffer *buffer) { int cpu; - get_online_cpus(); - #ifdef CONFIG_HOTPLUG_CPU - unregister_cpu_notifier(&buffer->cpu_notify); + cpu_notifier_register_begin(); + __unregister_cpu_notifier(&buffer->cpu_notify); #endif for_each_buffer_cpu(buffer, cpu) rb_free_cpu_buffer(buffer->buffers[cpu]); - put_online_cpus(); +#ifdef CONFIG_HOTPLUG_CPU + cpu_notifier_register_done(); +#endif kfree(buffer->buffers); free_cpumask_var(buffer->cpumask); @@ -2397,6 +2399,13 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, write &= RB_WRITE_MASK; tail = write - length; + /* + * If this is the first commit on the page, then it has the same + * timestamp as the page itself. + */ + if (!tail) + delta = 0; + /* See if we shot pass the end of this buffer page */ if (unlikely(write > BUF_PAGE_SIZE)) return rb_move_tail(cpu_buffer, length, tail, diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index a5457d577b9..0434ff1b808 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -40,8 +40,8 @@ static int write_iteration = 50; module_param(write_iteration, uint, 0644); MODULE_PARM_DESC(write_iteration, "# of writes between timestamp readings"); -static int producer_nice = 19; -static int consumer_nice = 19; +static int producer_nice = MAX_NICE; +static int consumer_nice = MAX_NICE; static int producer_fifo = -1; static int consumer_fifo = -1; @@ -308,7 +308,7 @@ static void ring_buffer_producer(void) /* Let the user know that the test is running at low priority */ if (producer_fifo < 0 && consumer_fifo < 0 && - producer_nice == 19 && consumer_nice == 19) + producer_nice == MAX_NICE && consumer_nice == MAX_NICE) trace_printk("WARNING!!! This test is running at lowest priority.\n"); trace_printk("Time: %lld (usecs)\n", time); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 815c878f409..291397e6666 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -73,7 +73,8 @@ static struct tracer_flags dummy_tracer_flags = { .opts = dummy_tracer_opt }; -static int dummy_set_flag(u32 old_flags, u32 bit, int set) +static int +dummy_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) { return 0; } @@ -118,7 +119,7 @@ enum ftrace_dump_mode ftrace_dump_on_oops; /* When set, tracing will stop when a WARN*() is hit */ int __disable_trace_on_warning; -static int tracing_set_tracer(const char *buf); +static int tracing_set_tracer(struct trace_array *tr, const char *buf); #define MAX_TRACER_SIZE 100 static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata; @@ -180,6 +181,17 @@ static int __init set_trace_boot_options(char *str) } __setup("trace_options=", set_trace_boot_options); +static char trace_boot_clock_buf[MAX_TRACER_SIZE] __initdata; +static char *trace_boot_clock __initdata; + +static int __init set_trace_boot_clock(char *str) +{ + strlcpy(trace_boot_clock_buf, str, MAX_TRACER_SIZE); + trace_boot_clock = trace_boot_clock_buf; + return 0; +} +__setup("trace_clock=", set_trace_boot_clock); + unsigned long long ns2usecs(cycle_t nsec) { @@ -263,7 +275,7 @@ int call_filter_check_discard(struct ftrace_event_call *call, void *rec, } EXPORT_SYMBOL_GPL(call_filter_check_discard); -cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu) +static cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu) { u64 ts; @@ -454,6 +466,12 @@ int __trace_puts(unsigned long ip, const char *str, int size) struct print_entry *entry; unsigned long irq_flags; int alloc; + int pc; + + if (!(trace_flags & TRACE_ITER_PRINTK)) + return 0; + + pc = preempt_count(); if (unlikely(tracing_selftest_running || tracing_disabled)) return 0; @@ -463,7 +481,7 @@ int __trace_puts(unsigned long ip, const char *str, int size) local_save_flags(irq_flags); buffer = global_trace.trace_buffer.buffer; event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc, - irq_flags, preempt_count()); + irq_flags, pc); if (!event) return 0; @@ -480,6 +498,7 @@ int __trace_puts(unsigned long ip, const char *str, int size) entry->buf[size] = '\0'; __buffer_unlock_commit(buffer, event); + ftrace_trace_stack(buffer, irq_flags, 4, pc); return size; } @@ -497,6 +516,12 @@ int __trace_bputs(unsigned long ip, const char *str) struct bputs_entry *entry; unsigned long irq_flags; int size = sizeof(struct bputs_entry); + int pc; + + if (!(trace_flags & TRACE_ITER_PRINTK)) + return 0; + + pc = preempt_count(); if (unlikely(tracing_selftest_running || tracing_disabled)) return 0; @@ -504,7 +529,7 @@ int __trace_bputs(unsigned long ip, const char *str) local_save_flags(irq_flags); buffer = global_trace.trace_buffer.buffer; event = trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size, - irq_flags, preempt_count()); + irq_flags, pc); if (!event) return 0; @@ -513,6 +538,7 @@ int __trace_bputs(unsigned long ip, const char *str) entry->str = str; __buffer_unlock_commit(buffer, event); + ftrace_trace_stack(buffer, irq_flags, 4, pc); return 1; } @@ -587,7 +613,7 @@ static int alloc_snapshot(struct trace_array *tr) return 0; } -void free_snapshot(struct trace_array *tr) +static void free_snapshot(struct trace_array *tr) { /* * We don't free the ring buffer. instead, resize it because @@ -797,7 +823,7 @@ static struct { { trace_clock_local, "local", 1 }, { trace_clock_global, "global", 1 }, { trace_clock_counter, "counter", 0 }, - { trace_clock_jiffies, "uptime", 1 }, + { trace_clock_jiffies, "uptime", 0 }, { trace_clock, "perf", 1 }, ARCH_TRACE_CLOCKS }; @@ -951,27 +977,9 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) return cnt; } -/* - * ftrace_max_lock is used to protect the swapping of buffers - * when taking a max snapshot. The buffers themselves are - * protected by per_cpu spinlocks. But the action of the swap - * needs its own lock. - * - * This is defined as a arch_spinlock_t in order to help - * with performance when lockdep debugging is enabled. - * - * It is also used in other places outside the update_max_tr - * so it needs to be defined outside of the - * CONFIG_TRACER_MAX_TRACE. - */ -static arch_spinlock_t ftrace_max_lock = - (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; - unsigned long __read_mostly tracing_thresh; #ifdef CONFIG_TRACER_MAX_TRACE -unsigned long __read_mostly tracing_max_latency; - /* * Copy the new maximum trace into the separate maximum-trace * structure. (this way the maximum trace is permanently saved, @@ -988,7 +996,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) max_buf->cpu = cpu; max_buf->time_start = data->preempt_timestamp; - max_data->saved_latency = tracing_max_latency; + max_data->saved_latency = tr->max_latency; max_data->critical_start = data->critical_start; max_data->critical_end = data->critical_end; @@ -1036,14 +1044,14 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) return; } - arch_spin_lock(&ftrace_max_lock); + arch_spin_lock(&tr->max_lock); buf = tr->trace_buffer.buffer; tr->trace_buffer.buffer = tr->max_buffer.buffer; tr->max_buffer.buffer = buf; __update_max_tr(tr, tsk, cpu); - arch_spin_unlock(&ftrace_max_lock); + arch_spin_unlock(&tr->max_lock); } /** @@ -1069,7 +1077,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) return; } - arch_spin_lock(&ftrace_max_lock); + arch_spin_lock(&tr->max_lock); ret = ring_buffer_swap_cpu(tr->max_buffer.buffer, tr->trace_buffer.buffer, cpu); @@ -1087,17 +1095,17 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY); __update_max_tr(tr, tsk, cpu); - arch_spin_unlock(&ftrace_max_lock); + arch_spin_unlock(&tr->max_lock); } #endif /* CONFIG_TRACER_MAX_TRACE */ -static void default_wait_pipe(struct trace_iterator *iter) +static int wait_on_pipe(struct trace_iterator *iter) { /* Iterators are static, they should be filled or empty */ if (trace_buffer_iter(iter, iter->cpu_file)) - return; + return 0; - ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file); + return ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file); } #ifdef CONFIG_FTRACE_STARTUP_TEST @@ -1208,8 +1216,6 @@ int register_tracer(struct tracer *type) else if (!type->flags->opts) type->flags->opts = dummy_tracer_opt; - if (!type->wait_pipe) - type->wait_pipe = default_wait_pipe; ret = run_tracer_selftest(type); if (ret < 0) @@ -1230,7 +1236,7 @@ int register_tracer(struct tracer *type) printk(KERN_INFO "Starting tracer '%s'\n", type->name); /* Do we want this tracer to start on bootup? */ - tracing_set_tracer(type->name); + tracing_set_tracer(&global_trace, type->name); default_bootup_tracer = NULL; /* disable other selftests, since this will break it. */ tracing_selftest_disabled = true; @@ -1293,22 +1299,71 @@ void tracing_reset_all_online_cpus(void) } } -#define SAVED_CMDLINES 128 +#define SAVED_CMDLINES_DEFAULT 128 #define NO_CMDLINE_MAP UINT_MAX -static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; -static unsigned map_cmdline_to_pid[SAVED_CMDLINES]; -static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN]; -static int cmdline_idx; static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED; +struct saved_cmdlines_buffer { + unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; + unsigned *map_cmdline_to_pid; + unsigned cmdline_num; + int cmdline_idx; + char *saved_cmdlines; +}; +static struct saved_cmdlines_buffer *savedcmd; /* temporary disable recording */ static atomic_t trace_record_cmdline_disabled __read_mostly; -static void trace_init_cmdlines(void) +static inline char *get_saved_cmdlines(int idx) +{ + return &savedcmd->saved_cmdlines[idx * TASK_COMM_LEN]; +} + +static inline void set_cmdline(int idx, const char *cmdline) +{ + memcpy(get_saved_cmdlines(idx), cmdline, TASK_COMM_LEN); +} + +static int allocate_cmdlines_buffer(unsigned int val, + struct saved_cmdlines_buffer *s) +{ + s->map_cmdline_to_pid = kmalloc(val * sizeof(*s->map_cmdline_to_pid), + GFP_KERNEL); + if (!s->map_cmdline_to_pid) + return -ENOMEM; + + s->saved_cmdlines = kmalloc(val * TASK_COMM_LEN, GFP_KERNEL); + if (!s->saved_cmdlines) { + kfree(s->map_cmdline_to_pid); + return -ENOMEM; + } + + s->cmdline_idx = 0; + s->cmdline_num = val; + memset(&s->map_pid_to_cmdline, NO_CMDLINE_MAP, + sizeof(s->map_pid_to_cmdline)); + memset(s->map_cmdline_to_pid, NO_CMDLINE_MAP, + val * sizeof(*s->map_cmdline_to_pid)); + + return 0; +} + +static int trace_create_savedcmd(void) { - memset(&map_pid_to_cmdline, NO_CMDLINE_MAP, sizeof(map_pid_to_cmdline)); - memset(&map_cmdline_to_pid, NO_CMDLINE_MAP, sizeof(map_cmdline_to_pid)); - cmdline_idx = 0; + int ret; + + savedcmd = kmalloc(sizeof(*savedcmd), GFP_KERNEL); + if (!savedcmd) + return -ENOMEM; + + ret = allocate_cmdlines_buffer(SAVED_CMDLINES_DEFAULT, savedcmd); + if (ret < 0) { + kfree(savedcmd); + savedcmd = NULL; + return -ENOMEM; + } + + return 0; } int is_tracing_stopped(void) @@ -1341,7 +1396,7 @@ void tracing_start(void) } /* Prevent the buffers from switching */ - arch_spin_lock(&ftrace_max_lock); + arch_spin_lock(&global_trace.max_lock); buffer = global_trace.trace_buffer.buffer; if (buffer) @@ -1353,9 +1408,8 @@ void tracing_start(void) ring_buffer_record_enable(buffer); #endif - arch_spin_unlock(&ftrace_max_lock); + arch_spin_unlock(&global_trace.max_lock); - ftrace_start(); out: raw_spin_unlock_irqrestore(&global_trace.start_lock, flags); } @@ -1402,13 +1456,12 @@ void tracing_stop(void) struct ring_buffer *buffer; unsigned long flags; - ftrace_stop(); raw_spin_lock_irqsave(&global_trace.start_lock, flags); if (global_trace.stop_count++) goto out; /* Prevent the buffers from switching */ - arch_spin_lock(&ftrace_max_lock); + arch_spin_lock(&global_trace.max_lock); buffer = global_trace.trace_buffer.buffer; if (buffer) @@ -1420,7 +1473,7 @@ void tracing_stop(void) ring_buffer_record_disable(buffer); #endif - arch_spin_unlock(&ftrace_max_lock); + arch_spin_unlock(&global_trace.max_lock); out: raw_spin_unlock_irqrestore(&global_trace.start_lock, flags); @@ -1449,12 +1502,12 @@ static void tracing_stop_tr(struct trace_array *tr) void trace_stop_cmdline_recording(void); -static void trace_save_cmdline(struct task_struct *tsk) +static int trace_save_cmdline(struct task_struct *tsk) { unsigned pid, idx; if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT)) - return; + return 0; /* * It's not the end of the world if we don't get @@ -1463,11 +1516,11 @@ static void trace_save_cmdline(struct task_struct *tsk) * so if we miss here, then better luck next time. */ if (!arch_spin_trylock(&trace_cmdline_lock)) - return; + return 0; - idx = map_pid_to_cmdline[tsk->pid]; + idx = savedcmd->map_pid_to_cmdline[tsk->pid]; if (idx == NO_CMDLINE_MAP) { - idx = (cmdline_idx + 1) % SAVED_CMDLINES; + idx = (savedcmd->cmdline_idx + 1) % savedcmd->cmdline_num; /* * Check whether the cmdline buffer at idx has a pid @@ -1475,22 +1528,24 @@ static void trace_save_cmdline(struct task_struct *tsk) * need to clear the map_pid_to_cmdline. Otherwise we * would read the new comm for the old pid. */ - pid = map_cmdline_to_pid[idx]; + pid = savedcmd->map_cmdline_to_pid[idx]; if (pid != NO_CMDLINE_MAP) - map_pid_to_cmdline[pid] = NO_CMDLINE_MAP; + savedcmd->map_pid_to_cmdline[pid] = NO_CMDLINE_MAP; - map_cmdline_to_pid[idx] = tsk->pid; - map_pid_to_cmdline[tsk->pid] = idx; + savedcmd->map_cmdline_to_pid[idx] = tsk->pid; + savedcmd->map_pid_to_cmdline[tsk->pid] = idx; - cmdline_idx = idx; + savedcmd->cmdline_idx = idx; } - memcpy(&saved_cmdlines[idx], tsk->comm, TASK_COMM_LEN); + set_cmdline(idx, tsk->comm); arch_spin_unlock(&trace_cmdline_lock); + + return 1; } -void trace_find_cmdline(int pid, char comm[]) +static void __trace_find_cmdline(int pid, char comm[]) { unsigned map; @@ -1509,13 +1564,19 @@ void trace_find_cmdline(int pid, char comm[]) return; } - preempt_disable(); - arch_spin_lock(&trace_cmdline_lock); - map = map_pid_to_cmdline[pid]; + map = savedcmd->map_pid_to_cmdline[pid]; if (map != NO_CMDLINE_MAP) - strcpy(comm, saved_cmdlines[map]); + strcpy(comm, get_saved_cmdlines(map)); else strcpy(comm, "<...>"); +} + +void trace_find_cmdline(int pid, char comm[]) +{ + preempt_disable(); + arch_spin_lock(&trace_cmdline_lock); + + __trace_find_cmdline(pid, comm); arch_spin_unlock(&trace_cmdline_lock); preempt_enable(); @@ -1529,9 +1590,8 @@ void tracing_record_cmdline(struct task_struct *tsk) if (!__this_cpu_read(trace_cmdline_save)) return; - __this_cpu_write(trace_cmdline_save, false); - - trace_save_cmdline(tsk); + if (trace_save_cmdline(tsk)) + __this_cpu_write(trace_cmdline_save, false); } void @@ -1600,15 +1660,31 @@ void trace_buffer_unlock_commit(struct ring_buffer *buffer, } EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit); +static struct ring_buffer *temp_buffer; + struct ring_buffer_event * trace_event_buffer_lock_reserve(struct ring_buffer **current_rb, struct ftrace_event_file *ftrace_file, int type, unsigned long len, unsigned long flags, int pc) { + struct ring_buffer_event *entry; + *current_rb = ftrace_file->tr->trace_buffer.buffer; - return trace_buffer_lock_reserve(*current_rb, + entry = trace_buffer_lock_reserve(*current_rb, type, len, flags, pc); + /* + * If tracing is off, but we have triggers enabled + * we still need to look at the event data. Use the temp_buffer + * to store the trace event for the tigger to use. It's recusive + * safe and will not be recorded anywhere. + */ + if (!entry && ftrace_file->flags & FTRACE_EVENT_FL_TRIGGER_COND) { + *current_rb = temp_buffer; + entry = trace_buffer_lock_reserve(*current_rb, + type, len, flags, pc); + } + return entry; } EXPORT_SYMBOL_GPL(trace_event_buffer_lock_reserve); @@ -1718,7 +1794,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, */ barrier(); if (use_stack == 1) { - trace.entries = &__get_cpu_var(ftrace_stack).calls[0]; + trace.entries = this_cpu_ptr(ftrace_stack.calls); trace.max_entries = FTRACE_STACK_MAX_ENTRIES; if (regs) @@ -1967,7 +2043,21 @@ void trace_printk_init_buffers(void) if (alloc_percpu_trace_buffer()) return; - pr_info("ftrace: Allocated trace_printk buffers\n"); + /* trace_printk() is for debug use only. Don't use it in production. */ + + pr_warning("\n**********************************************************\n"); + pr_warning("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); + pr_warning("** **\n"); + pr_warning("** trace_printk() being used. Allocating extra memory. **\n"); + pr_warning("** **\n"); + pr_warning("** This means that this is a DEBUG kernel and it is **\n"); + pr_warning("** unsafe for produciton use. **\n"); + pr_warning("** **\n"); + pr_warning("** If you see this message and you are not debugging **\n"); + pr_warning("** the kernel, report this immediately to your vendor! **\n"); + pr_warning("** **\n"); + pr_warning("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); + pr_warning("**********************************************************\n"); /* Expand the buffers to set size */ tracing_update_buffers(); @@ -3121,27 +3211,52 @@ static int tracing_open(struct inode *inode, struct file *file) return ret; } +/* + * Some tracers are not suitable for instance buffers. + * A tracer is always available for the global array (toplevel) + * or if it explicitly states that it is. + */ +static bool +trace_ok_for_array(struct tracer *t, struct trace_array *tr) +{ + return (tr->flags & TRACE_ARRAY_FL_GLOBAL) || t->allow_instances; +} + +/* Find the next tracer that this trace array may use */ +static struct tracer * +get_tracer_for_array(struct trace_array *tr, struct tracer *t) +{ + while (t && !trace_ok_for_array(t, tr)) + t = t->next; + + return t; +} + static void * t_next(struct seq_file *m, void *v, loff_t *pos) { + struct trace_array *tr = m->private; struct tracer *t = v; (*pos)++; if (t) - t = t->next; + t = get_tracer_for_array(tr, t->next); return t; } static void *t_start(struct seq_file *m, loff_t *pos) { + struct trace_array *tr = m->private; struct tracer *t; loff_t l = 0; mutex_lock(&trace_types_lock); - for (t = trace_types; t && l < *pos; t = t_next(m, t, &l)) - ; + + t = get_tracer_for_array(tr, trace_types); + for (; t && l < *pos; t = t_next(m, t, &l)) + ; return t; } @@ -3176,10 +3291,21 @@ static const struct seq_operations show_traces_seq_ops = { static int show_traces_open(struct inode *inode, struct file *file) { + struct trace_array *tr = inode->i_private; + struct seq_file *m; + int ret; + if (tracing_disabled) return -ENODEV; - return seq_open(file, &show_traces_seq_ops); + ret = seq_open(file, &show_traces_seq_ops); + if (ret) + return ret; + + m = file->private_data; + m->private = tr; + + return 0; } static ssize_t @@ -3269,7 +3395,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, mutex_lock(&tracing_cpumask_update_lock); local_irq_disable(); - arch_spin_lock(&ftrace_max_lock); + arch_spin_lock(&tr->max_lock); for_each_tracing_cpu(cpu) { /* * Increase/decrease the disabled counter if we are @@ -3286,7 +3412,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, ring_buffer_record_enable_cpu(tr->trace_buffer.buffer, cpu); } } - arch_spin_unlock(&ftrace_max_lock); + arch_spin_unlock(&tr->max_lock); local_irq_enable(); cpumask_copy(tr->tracing_cpumask, tracing_cpumask_new); @@ -3339,13 +3465,14 @@ static int tracing_trace_options_show(struct seq_file *m, void *v) return 0; } -static int __set_tracer_option(struct tracer *trace, +static int __set_tracer_option(struct trace_array *tr, struct tracer_flags *tracer_flags, struct tracer_opt *opts, int neg) { + struct tracer *trace = tr->current_trace; int ret; - ret = trace->set_flag(tracer_flags->val, opts->bit, !neg); + ret = trace->set_flag(tr, tracer_flags->val, opts->bit, !neg); if (ret) return ret; @@ -3357,8 +3484,9 @@ static int __set_tracer_option(struct tracer *trace, } /* Try to assign a tracer specific option */ -static int set_tracer_option(struct tracer *trace, char *cmp, int neg) +static int set_tracer_option(struct trace_array *tr, char *cmp, int neg) { + struct tracer *trace = tr->current_trace; struct tracer_flags *tracer_flags = trace->flags; struct tracer_opt *opts = NULL; int i; @@ -3367,8 +3495,7 @@ static int set_tracer_option(struct tracer *trace, char *cmp, int neg) opts = &tracer_flags->opts[i]; if (strcmp(cmp, opts->name) == 0) - return __set_tracer_option(trace, trace->flags, - opts, neg); + return __set_tracer_option(tr, trace->flags, opts, neg); } return -EINVAL; @@ -3391,7 +3518,7 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) /* Give the tracer a chance to approve the change */ if (tr->current_trace->flag_changed) - if (tr->current_trace->flag_changed(tr->current_trace, mask, !!enabled)) + if (tr->current_trace->flag_changed(tr, mask, !!enabled)) return -EINVAL; if (enabled) @@ -3440,7 +3567,7 @@ static int trace_set_options(struct trace_array *tr, char *option) /* If no option could be set, test the specific tracer options */ if (!trace_options[i]) - ret = set_tracer_option(tr->current_trace, cmp, neg); + ret = set_tracer_option(tr, cmp, neg); mutex_unlock(&trace_types_lock); @@ -3527,6 +3654,7 @@ static const char readme_msg[] = " trace_options\t\t- Set format or modify how tracing happens\n" "\t\t\t Disable an option by adding a suffix 'no' to the\n" "\t\t\t option name\n" + " saved_cmdlines_size\t- echo command number in here to store comm-pid list\n" #ifdef CONFIG_DYNAMIC_FTRACE "\n available_filter_functions - list of functions that can be filtered on\n" " set_ftrace_filter\t- echo function name in here to only trace these\n" @@ -3546,6 +3674,8 @@ static const char readme_msg[] = #ifdef CONFIG_TRACER_SNAPSHOT "\t\t snapshot\n" #endif + "\t\t dump\n" + "\t\t cpudump\n" "\t example: echo do_fault:traceoff > set_ftrace_filter\n" "\t echo do_trap:traceoff:3 > set_ftrace_filter\n" "\t The first one will disable tracing every time do_fault is hit\n" @@ -3638,55 +3768,153 @@ static const struct file_operations tracing_readme_fops = { .llseek = generic_file_llseek, }; +static void *saved_cmdlines_next(struct seq_file *m, void *v, loff_t *pos) +{ + unsigned int *ptr = v; + + if (*pos || m->count) + ptr++; + + (*pos)++; + + for (; ptr < &savedcmd->map_cmdline_to_pid[savedcmd->cmdline_num]; + ptr++) { + if (*ptr == -1 || *ptr == NO_CMDLINE_MAP) + continue; + + return ptr; + } + + return NULL; +} + +static void *saved_cmdlines_start(struct seq_file *m, loff_t *pos) +{ + void *v; + loff_t l = 0; + + preempt_disable(); + arch_spin_lock(&trace_cmdline_lock); + + v = &savedcmd->map_cmdline_to_pid[0]; + while (l <= *pos) { + v = saved_cmdlines_next(m, v, &l); + if (!v) + return NULL; + } + + return v; +} + +static void saved_cmdlines_stop(struct seq_file *m, void *v) +{ + arch_spin_unlock(&trace_cmdline_lock); + preempt_enable(); +} + +static int saved_cmdlines_show(struct seq_file *m, void *v) +{ + char buf[TASK_COMM_LEN]; + unsigned int *pid = v; + + __trace_find_cmdline(*pid, buf); + seq_printf(m, "%d %s\n", *pid, buf); + return 0; +} + +static const struct seq_operations tracing_saved_cmdlines_seq_ops = { + .start = saved_cmdlines_start, + .next = saved_cmdlines_next, + .stop = saved_cmdlines_stop, + .show = saved_cmdlines_show, +}; + +static int tracing_saved_cmdlines_open(struct inode *inode, struct file *filp) +{ + if (tracing_disabled) + return -ENODEV; + + return seq_open(filp, &tracing_saved_cmdlines_seq_ops); +} + +static const struct file_operations tracing_saved_cmdlines_fops = { + .open = tracing_saved_cmdlines_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + static ssize_t -tracing_saved_cmdlines_read(struct file *file, char __user *ubuf, - size_t cnt, loff_t *ppos) +tracing_saved_cmdlines_size_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) { - char *buf_comm; - char *file_buf; - char *buf; - int len = 0; - int pid; - int i; + char buf[64]; + int r; + + arch_spin_lock(&trace_cmdline_lock); + r = scnprintf(buf, sizeof(buf), "%u\n", savedcmd->cmdline_num); + arch_spin_unlock(&trace_cmdline_lock); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s) +{ + kfree(s->saved_cmdlines); + kfree(s->map_cmdline_to_pid); + kfree(s); +} - file_buf = kmalloc(SAVED_CMDLINES*(16+TASK_COMM_LEN), GFP_KERNEL); - if (!file_buf) +static int tracing_resize_saved_cmdlines(unsigned int val) +{ + struct saved_cmdlines_buffer *s, *savedcmd_temp; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) return -ENOMEM; - buf_comm = kmalloc(TASK_COMM_LEN, GFP_KERNEL); - if (!buf_comm) { - kfree(file_buf); + if (allocate_cmdlines_buffer(val, s) < 0) { + kfree(s); return -ENOMEM; } - buf = file_buf; + arch_spin_lock(&trace_cmdline_lock); + savedcmd_temp = savedcmd; + savedcmd = s; + arch_spin_unlock(&trace_cmdline_lock); + free_saved_cmdlines_buffer(savedcmd_temp); - for (i = 0; i < SAVED_CMDLINES; i++) { - int r; + return 0; +} - pid = map_cmdline_to_pid[i]; - if (pid == -1 || pid == NO_CMDLINE_MAP) - continue; +static ssize_t +tracing_saved_cmdlines_size_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + unsigned long val; + int ret; - trace_find_cmdline(pid, buf_comm); - r = sprintf(buf, "%d %s\n", pid, buf_comm); - buf += r; - len += r; - } + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; - len = simple_read_from_buffer(ubuf, cnt, ppos, - file_buf, len); + /* must have at least 1 entry or less than PID_MAX_DEFAULT */ + if (!val || val > PID_MAX_DEFAULT) + return -EINVAL; - kfree(file_buf); - kfree(buf_comm); + ret = tracing_resize_saved_cmdlines((unsigned int)val); + if (ret < 0) + return ret; - return len; + *ppos += cnt; + + return cnt; } -static const struct file_operations tracing_saved_cmdlines_fops = { - .open = tracing_open_generic, - .read = tracing_saved_cmdlines_read, - .llseek = generic_file_llseek, +static const struct file_operations tracing_saved_cmdlines_size_fops = { + .open = tracing_open_generic, + .read = tracing_saved_cmdlines_size_read, + .write = tracing_saved_cmdlines_size_write, }; static ssize_t @@ -3869,10 +4097,26 @@ create_trace_option_files(struct trace_array *tr, struct tracer *tracer); static void destroy_trace_option_files(struct trace_option_dentry *topts); -static int tracing_set_tracer(const char *buf) +/* + * Used to clear out the tracer before deletion of an instance. + * Must have trace_types_lock held. + */ +static void tracing_set_nop(struct trace_array *tr) +{ + if (tr->current_trace == &nop_trace) + return; + + tr->current_trace->enabled--; + + if (tr->current_trace->reset) + tr->current_trace->reset(tr); + + tr->current_trace = &nop_trace; +} + +static int tracing_set_tracer(struct trace_array *tr, const char *buf) { static struct trace_option_dentry *topts; - struct trace_array *tr = &global_trace; struct tracer *t; #ifdef CONFIG_TRACER_MAX_TRACE bool had_max_tr; @@ -3900,9 +4144,15 @@ static int tracing_set_tracer(const char *buf) if (t == tr->current_trace) goto out; + /* Some tracers are only allowed for the top level buffer */ + if (!trace_ok_for_array(t, tr)) { + ret = -EINVAL; + goto out; + } + trace_branch_disable(); - tr->current_trace->enabled = false; + tr->current_trace->enabled--; if (tr->current_trace->reset) tr->current_trace->reset(tr); @@ -3925,9 +4175,11 @@ static int tracing_set_tracer(const char *buf) free_snapshot(tr); } #endif - destroy_trace_option_files(topts); - - topts = create_trace_option_files(tr, t); + /* Currently, only the top instance has options */ + if (tr->flags & TRACE_ARRAY_FL_GLOBAL) { + destroy_trace_option_files(topts); + topts = create_trace_option_files(tr, t); + } #ifdef CONFIG_TRACER_MAX_TRACE if (t->use_max_tr && !had_max_tr) { @@ -3944,7 +4196,7 @@ static int tracing_set_tracer(const char *buf) } tr->current_trace = t; - tr->current_trace->enabled = true; + tr->current_trace->enabled++; trace_branch_enable(tr); out: mutex_unlock(&trace_types_lock); @@ -3956,6 +4208,7 @@ static ssize_t tracing_set_trace_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { + struct trace_array *tr = filp->private_data; char buf[MAX_TRACER_SIZE+1]; int i; size_t ret; @@ -3975,7 +4228,7 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf, for (i = cnt - 1; i > 0 && isspace(buf[i]); i--) buf[i] = 0; - err = tracing_set_tracer(buf); + err = tracing_set_tracer(tr, buf); if (err) return err; @@ -4133,29 +4386,11 @@ tracing_poll_pipe(struct file *filp, poll_table *poll_table) return trace_poll(iter, filp, poll_table); } -/* - * This is a make-shift waitqueue. - * A tracer might use this callback on some rare cases: - * - * 1) the current tracer might hold the runqueue lock when it wakes up - * a reader, hence a deadlock (sched, function, and function graph tracers) - * 2) the function tracers, trace all functions, we don't want - * the overhead of calling wake_up and friends - * (and tracing them too) - * - * Anyway, this is really very primitive wakeup. - */ -void poll_wait_pipe(struct trace_iterator *iter) -{ - set_current_state(TASK_INTERRUPTIBLE); - /* sleep for 100 msecs, and try again. */ - schedule_timeout(HZ / 10); -} - /* Must be called with trace_types_lock mutex held. */ static int tracing_wait_pipe(struct file *filp) { struct trace_iterator *iter = filp->private_data; + int ret; while (trace_empty(iter)) { @@ -4163,15 +4398,6 @@ static int tracing_wait_pipe(struct file *filp) return -EAGAIN; } - mutex_unlock(&iter->mutex); - - iter->trace->wait_pipe(iter); - - mutex_lock(&iter->mutex); - - if (signal_pending(current)) - return -EINTR; - /* * We block until we read something and tracing is disabled. * We still block if tracing is disabled, but we have never @@ -4183,6 +4409,18 @@ static int tracing_wait_pipe(struct file *filp) */ if (!tracing_is_on() && iter->pos) break; + + mutex_unlock(&iter->mutex); + + ret = wait_on_pipe(iter); + + mutex_lock(&iter->mutex); + + if (ret) + return ret; + + if (signal_pending(current)) + return -EINTR; } return 1; @@ -4300,8 +4538,6 @@ static void tracing_spd_release_pipe(struct splice_pipe_desc *spd, static const struct pipe_buf_operations tracing_pipe_buf_ops = { .can_merge = 0, - .map = generic_pipe_buf_map, - .unmap = generic_pipe_buf_unmap, .confirm = generic_pipe_buf_confirm, .release = generic_pipe_buf_release, .steal = generic_pipe_buf_steal, @@ -4396,7 +4632,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, trace_access_lock(iter->cpu_file); /* Fill as many pages as possible. */ - for (i = 0, rem = len; i < pipe->buffers && rem; i++) { + for (i = 0, rem = len; i < spd.nr_pages_max && rem; i++) { spd.pages[i] = alloc_page(GFP_KERNEL); if (!spd.pages[i]) break; @@ -4683,25 +4919,10 @@ static int tracing_clock_show(struct seq_file *m, void *v) return 0; } -static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *fpos) +static int tracing_set_clock(struct trace_array *tr, const char *clockstr) { - struct seq_file *m = filp->private_data; - struct trace_array *tr = m->private; - char buf[64]; - const char *clockstr; int i; - if (cnt >= sizeof(buf)) - return -EINVAL; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - - clockstr = strstrip(buf); - for (i = 0; i < ARRAY_SIZE(trace_clocks); i++) { if (strcmp(trace_clocks[i].name, clockstr) == 0) break; @@ -4729,6 +4950,32 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, mutex_unlock(&trace_types_lock); + return 0; +} + +static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *fpos) +{ + struct seq_file *m = filp->private_data; + struct trace_array *tr = m->private; + char buf[64]; + const char *clockstr; + int ret; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + clockstr = strstrip(buf); + + ret = tracing_set_clock(tr, clockstr); + if (ret) + return ret; + *fpos += cnt; return cnt; @@ -5096,8 +5343,12 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, goto out_unlock; } mutex_unlock(&trace_types_lock); - iter->trace->wait_pipe(iter); + ret = wait_on_pipe(iter); mutex_lock(&trace_types_lock); + if (ret) { + size = ret; + goto out_unlock; + } if (signal_pending(current)) { size = -EINTR; goto out_unlock; @@ -5178,8 +5429,6 @@ static void buffer_pipe_buf_get(struct pipe_inode_info *pipe, /* Pipe buffer operations for a buffer. */ static const struct pipe_buf_operations buffer_pipe_buf_ops = { .can_merge = 0, - .map = generic_pipe_buf_map, - .unmap = generic_pipe_buf_unmap, .confirm = generic_pipe_buf_confirm, .release = buffer_pipe_buf_release, .steal = generic_pipe_buf_steal, @@ -5255,7 +5504,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, trace_access_lock(iter->cpu_file); entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file); - for (i = 0; i < pipe->buffers && len && entries; i++, len -= PAGE_SIZE) { + for (i = 0; i < spd.nr_pages_max && len && entries; i++, len -= PAGE_SIZE) { struct page *page; int r; @@ -5309,8 +5558,10 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, goto out; } mutex_unlock(&trace_types_lock); - iter->trace->wait_pipe(iter); + ret = wait_on_pipe(iter); mutex_lock(&trace_types_lock); + if (ret) + goto out; if (signal_pending(current)) { ret = -EINTR; goto out; @@ -5689,7 +5940,7 @@ trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt, if (!!(topt->flags->val & topt->opt->bit) != val) { mutex_lock(&trace_types_lock); - ret = __set_tracer_option(topt->tr->current_trace, topt->flags, + ret = __set_tracer_option(topt->tr, topt->flags, topt->opt, !val); mutex_unlock(&trace_types_lock); if (ret) @@ -6003,6 +6254,28 @@ static int allocate_trace_buffers(struct trace_array *tr, int size) return 0; } +static void free_trace_buffer(struct trace_buffer *buf) +{ + if (buf->buffer) { + ring_buffer_free(buf->buffer); + buf->buffer = NULL; + free_percpu(buf->data); + buf->data = NULL; + } +} + +static void free_trace_buffers(struct trace_array *tr) +{ + if (!tr) + return; + + free_trace_buffer(&tr->trace_buffer); + +#ifdef CONFIG_TRACER_MAX_TRACE + free_trace_buffer(&tr->max_buffer); +#endif +} + static int new_instance_create(const char *name) { struct trace_array *tr; @@ -6032,6 +6305,8 @@ static int new_instance_create(const char *name) raw_spin_lock_init(&tr->start_lock); + tr->max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; + tr->current_trace = &nop_trace; INIT_LIST_HEAD(&tr->systems); @@ -6059,8 +6334,7 @@ static int new_instance_create(const char *name) return 0; out_free_tr: - if (tr->trace_buffer.buffer) - ring_buffer_free(tr->trace_buffer.buffer); + free_trace_buffers(tr); free_cpumask_var(tr->tracing_cpumask); kfree(tr->name); kfree(tr); @@ -6096,10 +6370,11 @@ static int instance_delete(const char *name) list_del(&tr->list); + tracing_set_nop(tr); event_trace_del_tracer(tr); + ftrace_destroy_function_files(tr); debugfs_remove_recursive(tr->dir); - free_percpu(tr->trace_buffer.data); - ring_buffer_free(tr->trace_buffer.buffer); + free_trace_buffers(tr); kfree(tr->name); kfree(tr); @@ -6191,6 +6466,12 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer) { int cpu; + trace_create_file("available_tracers", 0444, d_tracer, + tr, &show_traces_fops); + + trace_create_file("current_tracer", 0644, d_tracer, + tr, &set_tracer_fops); + trace_create_file("tracing_cpumask", 0644, d_tracer, tr, &tracing_cpumask_fops); @@ -6221,6 +6502,14 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer) trace_create_file("tracing_on", 0644, d_tracer, tr, &rb_simple_fops); +#ifdef CONFIG_TRACER_MAX_TRACE + trace_create_file("tracing_max_latency", 0644, d_tracer, + &tr->max_latency, &tracing_max_lat_fops); +#endif + + if (ftrace_create_function_files(tr, d_tracer)) + WARN(1, "Could not allocate function filter files"); + #ifdef CONFIG_TRACER_SNAPSHOT trace_create_file("snapshot", 0644, d_tracer, tr, &snapshot_fops); @@ -6243,17 +6532,6 @@ static __init int tracer_init_debugfs(void) init_tracer_debugfs(&global_trace, d_tracer); - trace_create_file("available_tracers", 0444, d_tracer, - &global_trace, &show_traces_fops); - - trace_create_file("current_tracer", 0644, d_tracer, - &global_trace, &set_tracer_fops); - -#ifdef CONFIG_TRACER_MAX_TRACE - trace_create_file("tracing_max_latency", 0644, d_tracer, - &tracing_max_latency, &tracing_max_lat_fops); -#endif - trace_create_file("tracing_thresh", 0644, d_tracer, &tracing_thresh, &tracing_max_lat_fops); @@ -6263,6 +6541,9 @@ static __init int tracer_init_debugfs(void) trace_create_file("saved_cmdlines", 0444, d_tracer, NULL, &tracing_saved_cmdlines_fops); + trace_create_file("saved_cmdlines_size", 0644, d_tracer, + NULL, &tracing_saved_cmdlines_size_fops); + #ifdef CONFIG_DYNAMIC_FTRACE trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, &ftrace_update_tot_cnt, &tracing_dyn_info_fops); @@ -6494,17 +6775,30 @@ __init static int tracer_alloc_buffers(void) raw_spin_lock_init(&global_trace.start_lock); + /* Used for event triggers */ + temp_buffer = ring_buffer_alloc(PAGE_SIZE, RB_FL_OVERWRITE); + if (!temp_buffer) + goto out_free_cpumask; + + if (trace_create_savedcmd() < 0) + goto out_free_temp_buffer; + /* TODO: make the number of buffers hot pluggable with CPUS */ if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) { printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); WARN_ON(1); - goto out_free_cpumask; + goto out_free_savedcmd; } if (global_trace.buffer_disabled) tracing_off(); - trace_init_cmdlines(); + if (trace_boot_clock) { + ret = tracing_set_clock(&global_trace, trace_boot_clock); + if (ret < 0) + pr_warning("Trace clock %s not defined, going back to default\n", + trace_boot_clock); + } /* * register_tracer() might reference current_trace, so it @@ -6513,6 +6807,10 @@ __init static int tracer_alloc_buffers(void) */ global_trace.current_trace = &nop_trace; + global_trace.max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; + + ftrace_init_global_array_ops(&global_trace); + register_tracer(&nop_trace); /* All seems OK, enable tracing */ @@ -6540,11 +6838,11 @@ __init static int tracer_alloc_buffers(void) return 0; +out_free_savedcmd: + free_saved_cmdlines_buffer(savedcmd); +out_free_temp_buffer: + ring_buffer_free(temp_buffer); out_free_cpumask: - free_percpu(global_trace.trace_buffer.data); -#ifdef CONFIG_TRACER_MAX_TRACE - free_percpu(global_trace.max_buffer.data); -#endif free_cpumask_var(global_trace.tracing_cpumask); out_free_buffer_mask: free_cpumask_var(tracing_buffer_mask); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 02b592f2d4b..9258f5a815d 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -13,6 +13,7 @@ #include <linux/hw_breakpoint.h> #include <linux/trace_seq.h> #include <linux/ftrace_event.h> +#include <linux/compiler.h> #ifdef CONFIG_FTRACE_SYSCALLS #include <asm/unistd.h> /* For NR_SYSCALLS */ @@ -189,7 +190,22 @@ struct trace_array { */ struct trace_buffer max_buffer; bool allocated_snapshot; + unsigned long max_latency; #endif + /* + * max_lock is used to protect the swapping of buffers + * when taking a max snapshot. The buffers themselves are + * protected by per_cpu spinlocks. But the action of the swap + * needs its own lock. + * + * This is defined as a arch_spinlock_t in order to help + * with performance when lockdep debugging is enabled. + * + * It is also used in other places outside the update_max_tr + * so it needs to be defined outside of the + * CONFIG_TRACER_MAX_TRACE. + */ + arch_spinlock_t max_lock; int buffer_disabled; #ifdef CONFIG_FTRACE_SYSCALLS int sys_refcount_enter; @@ -210,6 +226,11 @@ struct trace_array { struct list_head events; cpumask_var_t tracing_cpumask; /* only trace on set CPUs */ int ref; +#ifdef CONFIG_FUNCTION_TRACER + struct ftrace_ops *ops; + /* function tracing enabled */ + int function_enabled; +#endif }; enum { @@ -231,6 +252,9 @@ static inline struct trace_array *top_trace_array(void) { struct trace_array *tr; + if (list_empty(&ftrace_trace_arrays)) + return NULL; + tr = list_entry(ftrace_trace_arrays.prev, typeof(*tr), list); WARN_ON(!(tr->flags & TRACE_ARRAY_FL_GLOBAL)); @@ -317,7 +341,6 @@ struct tracer_flags { * @stop: called when tracing is paused (echo 0 > tracing_enabled) * @open: called when the trace file is opened * @pipe_open: called when the trace_pipe file is opened - * @wait_pipe: override how the user waits for traces on trace_pipe * @close: called when the trace file is released * @pipe_close: called when the trace_pipe file is released * @read: override the default read callback on trace_pipe @@ -336,7 +359,6 @@ struct tracer { void (*stop)(struct trace_array *tr); void (*open)(struct trace_iterator *iter); void (*pipe_open)(struct trace_iterator *iter); - void (*wait_pipe)(struct trace_iterator *iter); void (*close)(struct trace_iterator *iter); void (*pipe_close)(struct trace_iterator *iter); ssize_t (*read)(struct trace_iterator *iter, @@ -355,14 +377,16 @@ struct tracer { void (*print_header)(struct seq_file *m); enum print_line_t (*print_line)(struct trace_iterator *iter); /* If you handled the flag setting, return 0 */ - int (*set_flag)(u32 old_flags, u32 bit, int set); + int (*set_flag)(struct trace_array *tr, + u32 old_flags, u32 bit, int set); /* Return 0 if OK with change, else return non-zero */ - int (*flag_changed)(struct tracer *tracer, + int (*flag_changed)(struct trace_array *tr, u32 mask, int set); struct tracer *next; struct tracer_flags *flags; + int enabled; bool print_max; - bool enabled; + bool allow_instances; #ifdef CONFIG_TRACER_MAX_TRACE bool use_max_tr; #endif @@ -408,13 +432,7 @@ enum { TRACE_FTRACE_IRQ_BIT, TRACE_FTRACE_SIRQ_BIT, - /* GLOBAL_BITs must be greater than FTRACE_BITs */ - TRACE_GLOBAL_BIT, - TRACE_GLOBAL_NMI_BIT, - TRACE_GLOBAL_IRQ_BIT, - TRACE_GLOBAL_SIRQ_BIT, - - /* INTERNAL_BITs must be greater than GLOBAL_BITs */ + /* INTERNAL_BITs must be greater than FTRACE_BITs */ TRACE_INTERNAL_BIT, TRACE_INTERNAL_NMI_BIT, TRACE_INTERNAL_IRQ_BIT, @@ -441,9 +459,6 @@ enum { #define TRACE_FTRACE_START TRACE_FTRACE_BIT #define TRACE_FTRACE_MAX ((1 << (TRACE_FTRACE_START + TRACE_CONTEXT_BITS)) - 1) -#define TRACE_GLOBAL_START TRACE_GLOBAL_BIT -#define TRACE_GLOBAL_MAX ((1 << (TRACE_GLOBAL_START + TRACE_CONTEXT_BITS)) - 1) - #define TRACE_LIST_START TRACE_INTERNAL_BIT #define TRACE_LIST_MAX ((1 << (TRACE_LIST_START + TRACE_CONTEXT_BITS)) - 1) @@ -552,8 +567,6 @@ void trace_init_global_iter(struct trace_iterator *iter); void tracing_iter_reset(struct trace_iterator *iter, int cpu); -void poll_wait_pipe(struct trace_iterator *iter); - void tracing_sched_switch_trace(struct trace_array *tr, struct task_struct *prev, struct task_struct *next, @@ -600,8 +613,6 @@ extern unsigned long nsecs_to_usecs(unsigned long nsecs); extern unsigned long tracing_thresh; #ifdef CONFIG_TRACER_MAX_TRACE -extern unsigned long tracing_max_latency; - void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); void update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu); @@ -716,6 +727,8 @@ extern unsigned long trace_flags; #define TRACE_GRAPH_PRINT_PROC 0x8 #define TRACE_GRAPH_PRINT_DURATION 0x10 #define TRACE_GRAPH_PRINT_ABS_TIME 0x20 +#define TRACE_GRAPH_PRINT_IRQS 0x40 +#define TRACE_GRAPH_PRINT_TAIL 0x80 #define TRACE_GRAPH_PRINT_FILL_SHIFT 28 #define TRACE_GRAPH_PRINT_FILL_MASK (0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT) @@ -812,13 +825,45 @@ static inline int ftrace_trace_task(struct task_struct *task) return test_tsk_trace_trace(task); } extern int ftrace_is_dead(void); +int ftrace_create_function_files(struct trace_array *tr, + struct dentry *parent); +void ftrace_destroy_function_files(struct trace_array *tr); +void ftrace_init_global_array_ops(struct trace_array *tr); +void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func); +void ftrace_reset_array_ops(struct trace_array *tr); +int using_ftrace_ops_list_func(void); #else static inline int ftrace_trace_task(struct task_struct *task) { return 1; } static inline int ftrace_is_dead(void) { return 0; } -#endif +static inline int +ftrace_create_function_files(struct trace_array *tr, + struct dentry *parent) +{ + return 0; +} +static inline void ftrace_destroy_function_files(struct trace_array *tr) { } +static inline __init void +ftrace_init_global_array_ops(struct trace_array *tr) { } +static inline void ftrace_reset_array_ops(struct trace_array *tr) { } +/* ftace_func_t type is not defined, use macro instead of static inline */ +#define ftrace_init_array_ops(tr, func) do { } while (0) +#endif /* CONFIG_FUNCTION_TRACER */ + +#if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_DYNAMIC_FTRACE) +void ftrace_create_filter_files(struct ftrace_ops *ops, + struct dentry *parent); +void ftrace_destroy_filter_files(struct ftrace_ops *ops); +#else +/* + * The ops parameter passed in is usually undefined. + * This must be a macro. + */ +#define ftrace_create_filter_files(ops, parent) do { } while (0) +#define ftrace_destroy_filter_files(ops) do { } while (0) +#endif /* CONFIG_FUNCTION_TRACER && CONFIG_DYNAMIC_FTRACE */ int ftrace_event_is_function(struct ftrace_event_call *call); @@ -1249,7 +1294,7 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled); #undef FTRACE_ENTRY #define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \ extern struct ftrace_event_call \ - __attribute__((__aligned__(4))) event_##call; + __aligned(4) event_##call; #undef FTRACE_ENTRY_DUP #define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print, filter) \ FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \ diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c new file mode 100644 index 00000000000..40a14cbcf8e --- /dev/null +++ b/kernel/trace/trace_benchmark.c @@ -0,0 +1,198 @@ +#include <linux/delay.h> +#include <linux/module.h> +#include <linux/kthread.h> +#include <linux/trace_clock.h> + +#define CREATE_TRACE_POINTS +#include "trace_benchmark.h" + +static struct task_struct *bm_event_thread; + +static char bm_str[BENCHMARK_EVENT_STRLEN] = "START"; + +static u64 bm_total; +static u64 bm_totalsq; +static u64 bm_last; +static u64 bm_max; +static u64 bm_min; +static u64 bm_first; +static u64 bm_cnt; +static u64 bm_stddev; +static unsigned int bm_avg; +static unsigned int bm_std; + +/* + * This gets called in a loop recording the time it took to write + * the tracepoint. What it writes is the time statistics of the last + * tracepoint write. As there is nothing to write the first time + * it simply writes "START". As the first write is cold cache and + * the rest is hot, we save off that time in bm_first and it is + * reported as "first", which is shown in the second write to the + * tracepoint. The "first" field is writen within the statics from + * then on but never changes. + */ +static void trace_do_benchmark(void) +{ + u64 start; + u64 stop; + u64 delta; + u64 stddev; + u64 seed; + u64 last_seed; + unsigned int avg; + unsigned int std = 0; + + /* Only run if the tracepoint is actually active */ + if (!trace_benchmark_event_enabled()) + return; + + local_irq_disable(); + start = trace_clock_local(); + trace_benchmark_event(bm_str); + stop = trace_clock_local(); + local_irq_enable(); + + bm_cnt++; + + delta = stop - start; + + /* + * The first read is cold cached, keep it separate from the + * other calculations. + */ + if (bm_cnt == 1) { + bm_first = delta; + scnprintf(bm_str, BENCHMARK_EVENT_STRLEN, + "first=%llu [COLD CACHED]", bm_first); + return; + } + + bm_last = delta; + + if (delta > bm_max) + bm_max = delta; + if (!bm_min || delta < bm_min) + bm_min = delta; + + /* + * When bm_cnt is greater than UINT_MAX, it breaks the statistics + * accounting. Freeze the statistics when that happens. + * We should have enough data for the avg and stddev anyway. + */ + if (bm_cnt > UINT_MAX) { + scnprintf(bm_str, BENCHMARK_EVENT_STRLEN, + "last=%llu first=%llu max=%llu min=%llu ** avg=%u std=%d std^2=%lld", + bm_last, bm_first, bm_max, bm_min, bm_avg, bm_std, bm_stddev); + return; + } + + bm_total += delta; + bm_totalsq += delta * delta; + + + if (bm_cnt > 1) { + /* + * Apply Welford's method to calculate standard deviation: + * s^2 = 1 / (n * (n-1)) * (n * \Sum (x_i)^2 - (\Sum x_i)^2) + */ + stddev = (u64)bm_cnt * bm_totalsq - bm_total * bm_total; + do_div(stddev, (u32)bm_cnt); + do_div(stddev, (u32)bm_cnt - 1); + } else + stddev = 0; + + delta = bm_total; + do_div(delta, bm_cnt); + avg = delta; + + if (stddev > 0) { + int i = 0; + /* + * stddev is the square of standard deviation but + * we want the actualy number. Use the average + * as our seed to find the std. + * + * The next try is: + * x = (x + N/x) / 2 + * + * Where N is the squared number to find the square + * root of. + */ + seed = avg; + do { + last_seed = seed; + seed = stddev; + if (!last_seed) + break; + do_div(seed, last_seed); + seed += last_seed; + do_div(seed, 2); + } while (i++ < 10 && last_seed != seed); + + std = seed; + } + + scnprintf(bm_str, BENCHMARK_EVENT_STRLEN, + "last=%llu first=%llu max=%llu min=%llu avg=%u std=%d std^2=%lld", + bm_last, bm_first, bm_max, bm_min, avg, std, stddev); + + bm_std = std; + bm_avg = avg; + bm_stddev = stddev; +} + +static int benchmark_event_kthread(void *arg) +{ + /* sleep a bit to make sure the tracepoint gets activated */ + msleep(100); + + while (!kthread_should_stop()) { + + trace_do_benchmark(); + + /* + * We don't go to sleep, but let others + * run as well. + */ + cond_resched(); + } + + return 0; +} + +/* + * When the benchmark tracepoint is enabled, it calls this + * function and the thread that calls the tracepoint is created. + */ +void trace_benchmark_reg(void) +{ + bm_event_thread = kthread_run(benchmark_event_kthread, + NULL, "event_benchmark"); + WARN_ON(!bm_event_thread); +} + +/* + * When the benchmark tracepoint is disabled, it calls this + * function and the thread that calls the tracepoint is deleted + * and all the numbers are reset. + */ +void trace_benchmark_unreg(void) +{ + if (!bm_event_thread) + return; + + kthread_stop(bm_event_thread); + + strcpy(bm_str, "START"); + bm_total = 0; + bm_totalsq = 0; + bm_last = 0; + bm_max = 0; + bm_min = 0; + bm_cnt = 0; + /* These don't need to be reset but reset them anyway */ + bm_first = 0; + bm_std = 0; + bm_avg = 0; + bm_stddev = 0; +} diff --git a/kernel/trace/trace_benchmark.h b/kernel/trace/trace_benchmark.h new file mode 100644 index 00000000000..3c1df1df4e2 --- /dev/null +++ b/kernel/trace/trace_benchmark.h @@ -0,0 +1,41 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM benchmark + +#if !defined(_TRACE_BENCHMARK_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_BENCHMARK_H + +#include <linux/tracepoint.h> + +extern void trace_benchmark_reg(void); +extern void trace_benchmark_unreg(void); + +#define BENCHMARK_EVENT_STRLEN 128 + +TRACE_EVENT_FN(benchmark_event, + + TP_PROTO(const char *str), + + TP_ARGS(str), + + TP_STRUCT__entry( + __array( char, str, BENCHMARK_EVENT_STRLEN ) + ), + + TP_fast_assign( + memcpy(__entry->str, str, BENCHMARK_EVENT_STRLEN); + ), + + TP_printk("%s", __entry->str), + + trace_benchmark_reg, trace_benchmark_unreg +); + +#endif /* _TRACE_BENCHMARK_H */ + +#undef TRACE_INCLUDE_FILE +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE trace_benchmark + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 26dc348332b..57b67b1f24d 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -59,13 +59,14 @@ u64 notrace trace_clock(void) /* * trace_jiffy_clock(): Simply use jiffies as a clock counter. + * Note that this use of jiffies_64 is not completely safe on + * 32-bit systems. But the window is tiny, and the effect if + * we are affected is that we will have an obviously bogus + * timestamp on a trace event - i.e. not life threatening. */ u64 notrace trace_clock_jiffies(void) { - u64 jiffy = jiffies - INITIAL_JIFFIES; - - /* Return nsecs */ - return (u64)jiffies_to_usecs(jiffy) * 1000ULL; + return jiffies_64_to_clock_t(jiffies_64 - INITIAL_JIFFIES); } /* diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index e854f420e03..5d12bb407b4 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -31,9 +31,25 @@ static int perf_trace_event_perm(struct ftrace_event_call *tp_event, } /* The ftrace function trace is allowed only for root. */ - if (ftrace_event_is_function(tp_event) && - perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN)) - return -EPERM; + if (ftrace_event_is_function(tp_event)) { + if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + /* + * We don't allow user space callchains for function trace + * event, due to issues with page faults while tracing page + * fault handler and its overall trickiness nature. + */ + if (!p_event->attr.exclude_callchain_user) + return -EINVAL; + + /* + * Same reason to disable user stack dump as for user space + * callchains above. + */ + if (p_event->attr.sample_type & PERF_SAMPLE_STACK_USER) + return -EINVAL; + } /* No tracing, just counting, so no obvious leak */ if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW)) @@ -232,8 +248,8 @@ void perf_trace_del(struct perf_event *p_event, int flags) tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event); } -__kprobes void *perf_trace_buf_prepare(int size, unsigned short type, - struct pt_regs *regs, int *rctxp) +void *perf_trace_buf_prepare(int size, unsigned short type, + struct pt_regs *regs, int *rctxp) { struct trace_entry *entry; unsigned long flags; @@ -265,6 +281,7 @@ __kprobes void *perf_trace_buf_prepare(int size, unsigned short type, return raw_data; } EXPORT_SYMBOL_GPL(perf_trace_buf_prepare); +NOKPROBE_SYMBOL(perf_trace_buf_prepare); #ifdef CONFIG_FUNCTION_TRACER static void diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index e71ffd4eccb..2de53628689 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -27,12 +27,6 @@ DEFINE_MUTEX(event_mutex); -DEFINE_MUTEX(event_storage_mutex); -EXPORT_SYMBOL_GPL(event_storage_mutex); - -char event_storage[EVENT_STORAGE_SIZE]; -EXPORT_SYMBOL_GPL(event_storage); - LIST_HEAD(ftrace_events); static LIST_HEAD(ftrace_common_fields); @@ -194,29 +188,60 @@ int trace_event_raw_init(struct ftrace_event_call *call) } EXPORT_SYMBOL_GPL(trace_event_raw_init); +void *ftrace_event_buffer_reserve(struct ftrace_event_buffer *fbuffer, + struct ftrace_event_file *ftrace_file, + unsigned long len) +{ + struct ftrace_event_call *event_call = ftrace_file->event_call; + + local_save_flags(fbuffer->flags); + fbuffer->pc = preempt_count(); + fbuffer->ftrace_file = ftrace_file; + + fbuffer->event = + trace_event_buffer_lock_reserve(&fbuffer->buffer, ftrace_file, + event_call->event.type, len, + fbuffer->flags, fbuffer->pc); + if (!fbuffer->event) + return NULL; + + fbuffer->entry = ring_buffer_event_data(fbuffer->event); + return fbuffer->entry; +} +EXPORT_SYMBOL_GPL(ftrace_event_buffer_reserve); + +void ftrace_event_buffer_commit(struct ftrace_event_buffer *fbuffer) +{ + event_trigger_unlock_commit(fbuffer->ftrace_file, fbuffer->buffer, + fbuffer->event, fbuffer->entry, + fbuffer->flags, fbuffer->pc); +} +EXPORT_SYMBOL_GPL(ftrace_event_buffer_commit); + int ftrace_event_reg(struct ftrace_event_call *call, enum trace_reg type, void *data) { struct ftrace_event_file *file = data; + WARN_ON(!(call->flags & TRACE_EVENT_FL_TRACEPOINT)); switch (type) { case TRACE_REG_REGISTER: - return tracepoint_probe_register(call->name, + return tracepoint_probe_register(call->tp, call->class->probe, file); case TRACE_REG_UNREGISTER: - tracepoint_probe_unregister(call->name, + tracepoint_probe_unregister(call->tp, call->class->probe, file); return 0; #ifdef CONFIG_PERF_EVENTS case TRACE_REG_PERF_REGISTER: - return tracepoint_probe_register(call->name, + return tracepoint_probe_register(call->tp, call->class->perf_probe, call); case TRACE_REG_PERF_UNREGISTER: - tracepoint_probe_unregister(call->name, + tracepoint_probe_unregister(call->tp, call->class->perf_probe, call); return 0; @@ -328,7 +353,7 @@ static int __ftrace_event_enable_disable(struct ftrace_event_file *file, if (ret) { tracing_stop_cmdline_record(); pr_info("event trace: Could not enable event " - "%s\n", call->name); + "%s\n", ftrace_event_name(call)); break; } set_bit(FTRACE_EVENT_FL_ENABLED_BIT, &file->flags); @@ -445,6 +470,7 @@ static void remove_event_file_dir(struct ftrace_event_file *file) list_del(&file->list); remove_subsystem(file->system); + free_event_filter(file->filter); kmem_cache_free(file_cachep, file); } @@ -457,27 +483,29 @@ __ftrace_set_clr_event_nolock(struct trace_array *tr, const char *match, { struct ftrace_event_file *file; struct ftrace_event_call *call; + const char *name; int ret = -EINVAL; list_for_each_entry(file, &tr->events, list) { call = file->event_call; + name = ftrace_event_name(call); - if (!call->name || !call->class || !call->class->reg) + if (!name || !call->class || !call->class->reg) continue; if (call->flags & TRACE_EVENT_FL_IGNORE_ENABLE) continue; if (match && - strcmp(match, call->name) != 0 && + strcmp(match, name) != 0 && strcmp(match, call->class->system) != 0) continue; if (sub && strcmp(sub, call->class->system) != 0) continue; - if (event && strcmp(event, call->name) != 0) + if (event && strcmp(event, name) != 0) continue; ftrace_event_enable_disable(file, set); @@ -547,6 +575,9 @@ int trace_set_clr_event(const char *system, const char *event, int set) { struct trace_array *tr = top_trace_array(); + if (!tr) + return -ENODEV; + return __ftrace_set_clr_event(tr, NULL, system, event, set); } EXPORT_SYMBOL_GPL(trace_set_clr_event); @@ -675,7 +706,7 @@ static int t_show(struct seq_file *m, void *v) if (strcmp(call->class->system, TRACE_SYSTEM) != 0) seq_printf(m, "%s:", call->class->system); - seq_printf(m, "%s\n", call->name); + seq_printf(m, "%s\n", ftrace_event_name(call)); return 0; } @@ -768,7 +799,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, mutex_lock(&event_mutex); list_for_each_entry(file, &tr->events, list) { call = file->event_call; - if (!call->name || !call->class || !call->class->reg) + if (!ftrace_event_name(call) || !call->class || !call->class->reg) continue; if (system && strcmp(call->class->system, system->name) != 0) @@ -883,7 +914,7 @@ static int f_show(struct seq_file *m, void *v) switch ((unsigned long)v) { case FORMAT_HEADER: - seq_printf(m, "name: %s\n", call->name); + seq_printf(m, "name: %s\n", ftrace_event_name(call)); seq_printf(m, "ID: %d\n", call->event.type); seq_printf(m, "format:\n"); return 0; @@ -1503,6 +1534,7 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file) struct trace_array *tr = file->tr; struct list_head *head; struct dentry *d_events; + const char *name; int ret; /* @@ -1516,10 +1548,11 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file) } else d_events = parent; - file->dir = debugfs_create_dir(call->name, d_events); + name = ftrace_event_name(call); + file->dir = debugfs_create_dir(name, d_events); if (!file->dir) { pr_warning("Could not create debugfs '%s' directory\n", - call->name); + name); return -1; } @@ -1543,7 +1576,7 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file) ret = call->class->define_fields(call); if (ret < 0) { pr_warning("Could not initialize trace point" - " events/%s\n", call->name); + " events/%s\n", name); return -1; } } @@ -1607,15 +1640,17 @@ static void event_remove(struct ftrace_event_call *call) static int event_init(struct ftrace_event_call *call) { int ret = 0; + const char *name; - if (WARN_ON(!call->name)) + name = ftrace_event_name(call); + if (WARN_ON(!name)) return -EINVAL; if (call->class->raw_init) { ret = call->class->raw_init(call); if (ret < 0 && ret != -ENOSYS) pr_warn("Could not initialize trace events/%s\n", - call->name); + name); } return ret; @@ -1777,6 +1812,16 @@ static void trace_module_add_events(struct module *mod) { struct ftrace_event_call **call, **start, **end; + if (!mod->num_trace_events) + return; + + /* Don't add infrastructure for mods without tracepoints */ + if (trace_module_has_bad_taint(mod)) { + pr_err("%s: module has bad taint, not creating trace events\n", + mod->name); + return; + } + start = mod->trace_events; end = mod->trace_events + mod->num_trace_events; @@ -1851,7 +1896,7 @@ __trace_add_event_dirs(struct trace_array *tr) ret = __trace_add_new_event(call, tr); if (ret < 0) pr_warning("Could not create directory for event %s\n", - call->name); + ftrace_event_name(call)); } } @@ -1860,18 +1905,20 @@ find_event_file(struct trace_array *tr, const char *system, const char *event) { struct ftrace_event_file *file; struct ftrace_event_call *call; + const char *name; list_for_each_entry(file, &tr->events, list) { call = file->event_call; + name = ftrace_event_name(call); - if (!call->name || !call->class || !call->class->reg) + if (!name || !call->class || !call->class->reg) continue; if (call->flags & TRACE_EVENT_FL_IGNORE_ENABLE) continue; - if (strcmp(event, call->name) == 0 && + if (strcmp(event, name) == 0 && strcmp(system, call->class->system) == 0) return file; } @@ -1939,7 +1986,7 @@ event_enable_print(struct seq_file *m, unsigned long ip, seq_printf(m, "%s:%s:%s", data->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR, data->file->event_call->class->system, - data->file->event_call->name); + ftrace_event_name(data->file->event_call)); if (data->count == -1) seq_printf(m, ":unlimited\n"); @@ -2022,6 +2069,9 @@ event_enable_func(struct ftrace_hash *hash, bool enable; int ret; + if (!tr) + return -ENODEV; + /* hash funcs only work with set_ftrace_filter */ if (!enabled || !param) return -EINVAL; @@ -2159,7 +2209,7 @@ __trace_early_add_event_dirs(struct trace_array *tr) ret = event_create_dir(tr->event_dir, file); if (ret < 0) pr_warning("Could not create directory for event %s\n", - file->event_call->name); + ftrace_event_name(file->event_call)); } } @@ -2183,7 +2233,7 @@ __trace_early_add_events(struct trace_array *tr) ret = __trace_early_add_new_event(call, tr); if (ret < 0) pr_warning("Could not create early event %s\n", - call->name); + ftrace_event_name(call)); } } @@ -2353,6 +2403,9 @@ static __init int event_trace_enable(void) char *token; int ret; + if (!tr) + return -ENODEV; + for_each_event(iter, __start_ftrace_events, __stop_ftrace_events) { call = *iter; @@ -2399,6 +2452,8 @@ static __init int event_trace_init(void) int ret; tr = top_trace_array(); + if (!tr) + return -ENODEV; d_tracer = tracing_init_dentry(); if (!d_tracer) @@ -2492,6 +2547,8 @@ static __init void event_trace_self_tests(void) int ret; tr = top_trace_array(); + if (!tr) + return; pr_info("Running tests on trace events:\n"); @@ -2515,7 +2572,7 @@ static __init void event_trace_self_tests(void) continue; #endif - pr_info("Testing event %s: ", call->name); + pr_info("Testing event %s: ", ftrace_event_name(call)); /* * If an event is already enabled, someone is using diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 8efbb69b04f..4747b476a03 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -77,7 +77,7 @@ event_triggers_call(struct ftrace_event_file *file, void *rec) data->ops->func(data); continue; } - filter = rcu_dereference(data->filter); + filter = rcu_dereference_sched(data->filter); if (filter && !filter_match_preds(filter, rec)) continue; if (data->cmd_ops->post_trigger) { @@ -1095,7 +1095,7 @@ event_enable_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, seq_printf(m, "%s:%s:%s", enable_data->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR, enable_data->file->event_call->class->system, - enable_data->file->event_call->name); + ftrace_event_name(enable_data->file->event_call)); if (data->count == -1) seq_puts(m, ":unlimited"); diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 7c3e3e72e2b..d4ddde28a81 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -95,15 +95,12 @@ static void __always_unused ____ftrace_check_##name(void) \ #undef __array #define __array(type, item, len) \ do { \ + char *type_str = #type"["__stringify(len)"]"; \ BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ - mutex_lock(&event_storage_mutex); \ - snprintf(event_storage, sizeof(event_storage), \ - "%s[%d]", #type, len); \ - ret = trace_define_field(event_call, event_storage, #item, \ + ret = trace_define_field(event_call, type_str, #item, \ offsetof(typeof(field), item), \ sizeof(field.item), \ is_signed_type(type), filter_type); \ - mutex_unlock(&event_storage_mutex); \ if (ret) \ return ret; \ } while (0); @@ -176,9 +173,11 @@ struct ftrace_event_class __refdata event_class_ftrace_##call = { \ }; \ \ struct ftrace_event_call __used event_##call = { \ - .name = #call, \ - .event.type = etype, \ .class = &event_class_ftrace_##call, \ + { \ + .name = #call, \ + }, \ + .event.type = etype, \ .print_fmt = print, \ .flags = TRACE_EVENT_FL_IGNORE_ENABLE | TRACE_EVENT_FL_USE_CALL_FILTER, \ }; \ diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 38fe1483c50..57f0ec962d2 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -13,33 +13,106 @@ #include <linux/debugfs.h> #include <linux/uaccess.h> #include <linux/ftrace.h> +#include <linux/slab.h> #include <linux/fs.h> #include "trace.h" -/* function tracing enabled */ -static int ftrace_function_enabled; +static void tracing_start_function_trace(struct trace_array *tr); +static void tracing_stop_function_trace(struct trace_array *tr); +static void +function_trace_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *pt_regs); +static void +function_stack_trace_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *pt_regs); +static struct tracer_flags func_flags; + +/* Our option */ +enum { + TRACE_FUNC_OPT_STACK = 0x1, +}; + +static int allocate_ftrace_ops(struct trace_array *tr) +{ + struct ftrace_ops *ops; -static struct trace_array *func_trace; + ops = kzalloc(sizeof(*ops), GFP_KERNEL); + if (!ops) + return -ENOMEM; + + /* Currently only the non stack verision is supported */ + ops->func = function_trace_call; + ops->flags = FTRACE_OPS_FL_RECURSION_SAFE; + + tr->ops = ops; + ops->private = tr; + return 0; +} -static void tracing_start_function_trace(void); -static void tracing_stop_function_trace(void); + +int ftrace_create_function_files(struct trace_array *tr, + struct dentry *parent) +{ + int ret; + + /* + * The top level array uses the "global_ops", and the files are + * created on boot up. + */ + if (tr->flags & TRACE_ARRAY_FL_GLOBAL) + return 0; + + ret = allocate_ftrace_ops(tr); + if (ret) + return ret; + + ftrace_create_filter_files(tr->ops, parent); + + return 0; +} + +void ftrace_destroy_function_files(struct trace_array *tr) +{ + ftrace_destroy_filter_files(tr->ops); + kfree(tr->ops); + tr->ops = NULL; +} static int function_trace_init(struct trace_array *tr) { - func_trace = tr; + ftrace_func_t func; + + /* + * Instance trace_arrays get their ops allocated + * at instance creation. Unless it failed + * the allocation. + */ + if (!tr->ops) + return -ENOMEM; + + /* Currently only the global instance can do stack tracing */ + if (tr->flags & TRACE_ARRAY_FL_GLOBAL && + func_flags.val & TRACE_FUNC_OPT_STACK) + func = function_stack_trace_call; + else + func = function_trace_call; + + ftrace_init_array_ops(tr, func); + tr->trace_buffer.cpu = get_cpu(); put_cpu(); tracing_start_cmdline_record(); - tracing_start_function_trace(); + tracing_start_function_trace(tr); return 0; } static void function_trace_reset(struct trace_array *tr) { - tracing_stop_function_trace(); + tracing_stop_function_trace(tr); tracing_stop_cmdline_record(); + ftrace_reset_array_ops(tr); } static void function_trace_start(struct trace_array *tr) @@ -47,25 +120,18 @@ static void function_trace_start(struct trace_array *tr) tracing_reset_online_cpus(&tr->trace_buffer); } -/* Our option */ -enum { - TRACE_FUNC_OPT_STACK = 0x1, -}; - -static struct tracer_flags func_flags; - static void function_trace_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *pt_regs) { - struct trace_array *tr = func_trace; + struct trace_array *tr = op->private; struct trace_array_cpu *data; unsigned long flags; int bit; int cpu; int pc; - if (unlikely(!ftrace_function_enabled)) + if (unlikely(!tr->function_enabled)) return; pc = preempt_count(); @@ -91,14 +157,14 @@ static void function_stack_trace_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *pt_regs) { - struct trace_array *tr = func_trace; + struct trace_array *tr = op->private; struct trace_array_cpu *data; unsigned long flags; long disabled; int cpu; int pc; - if (unlikely(!ftrace_function_enabled)) + if (unlikely(!tr->function_enabled)) return; /* @@ -128,19 +194,6 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip, local_irq_restore(flags); } - -static struct ftrace_ops trace_ops __read_mostly = -{ - .func = function_trace_call, - .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, -}; - -static struct ftrace_ops trace_stack_ops __read_mostly = -{ - .func = function_stack_trace_call, - .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, -}; - static struct tracer_opt func_opts[] = { #ifdef CONFIG_STACKTRACE { TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) }, @@ -153,29 +206,21 @@ static struct tracer_flags func_flags = { .opts = func_opts }; -static void tracing_start_function_trace(void) +static void tracing_start_function_trace(struct trace_array *tr) { - ftrace_function_enabled = 0; - - if (func_flags.val & TRACE_FUNC_OPT_STACK) - register_ftrace_function(&trace_stack_ops); - else - register_ftrace_function(&trace_ops); - - ftrace_function_enabled = 1; + tr->function_enabled = 0; + register_ftrace_function(tr->ops); + tr->function_enabled = 1; } -static void tracing_stop_function_trace(void) +static void tracing_stop_function_trace(struct trace_array *tr) { - ftrace_function_enabled = 0; - - if (func_flags.val & TRACE_FUNC_OPT_STACK) - unregister_ftrace_function(&trace_stack_ops); - else - unregister_ftrace_function(&trace_ops); + tr->function_enabled = 0; + unregister_ftrace_function(tr->ops); } -static int func_set_flag(u32 old_flags, u32 bit, int set) +static int +func_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) { switch (bit) { case TRACE_FUNC_OPT_STACK: @@ -183,12 +228,14 @@ static int func_set_flag(u32 old_flags, u32 bit, int set) if (!!set == !!(func_flags.val & TRACE_FUNC_OPT_STACK)) break; + unregister_ftrace_function(tr->ops); + if (set) { - unregister_ftrace_function(&trace_ops); - register_ftrace_function(&trace_stack_ops); + tr->ops->func = function_stack_trace_call; + register_ftrace_function(tr->ops); } else { - unregister_ftrace_function(&trace_stack_ops); - register_ftrace_function(&trace_ops); + tr->ops->func = function_trace_call; + register_ftrace_function(tr->ops); } break; @@ -205,9 +252,9 @@ static struct tracer function_trace __tracer_data = .init = function_trace_init, .reset = function_trace_reset, .start = function_trace_start, - .wait_pipe = poll_wait_pipe, .flags = &func_flags, .set_flag = func_set_flag, + .allow_instances = true, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_function, #endif diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 0b99120d395..4de3e57f723 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -38,15 +38,6 @@ struct fgraph_data { #define TRACE_GRAPH_INDENT 2 -/* Flag options */ -#define TRACE_GRAPH_PRINT_OVERRUN 0x1 -#define TRACE_GRAPH_PRINT_CPU 0x2 -#define TRACE_GRAPH_PRINT_OVERHEAD 0x4 -#define TRACE_GRAPH_PRINT_PROC 0x8 -#define TRACE_GRAPH_PRINT_DURATION 0x10 -#define TRACE_GRAPH_PRINT_ABS_TIME 0x20 -#define TRACE_GRAPH_PRINT_IRQS 0x40 - static unsigned int max_depth; static struct tracer_opt trace_opts[] = { @@ -64,11 +55,13 @@ static struct tracer_opt trace_opts[] = { { TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) }, /* Display interrupts */ { TRACER_OPT(funcgraph-irqs, TRACE_GRAPH_PRINT_IRQS) }, + /* Display function name after trailing } */ + { TRACER_OPT(funcgraph-tail, TRACE_GRAPH_PRINT_TAIL) }, { } /* Empty entry */ }; static struct tracer_flags tracer_flags = { - /* Don't display overruns and proc by default */ + /* Don't display overruns, proc, or tail by default */ .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD | TRACE_GRAPH_PRINT_DURATION | TRACE_GRAPH_PRINT_IRQS, .opts = trace_opts @@ -1176,9 +1169,10 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, * If the return function does not have a matching entry, * then the entry was lost. Instead of just printing * the '}' and letting the user guess what function this - * belongs to, write out the function name. + * belongs to, write out the function name. Always do + * that if the funcgraph-tail option is enabled. */ - if (func_match) { + if (func_match && !(flags & TRACE_GRAPH_PRINT_TAIL)) { ret = trace_seq_puts(s, "}\n"); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -1476,7 +1470,8 @@ void graph_trace_close(struct trace_iterator *iter) } } -static int func_graph_set_flag(u32 old_flags, u32 bit, int set) +static int +func_graph_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) { if (bit == TRACE_GRAPH_PRINT_IRQS) ftrace_graph_skip_irqs = !set; @@ -1504,7 +1499,6 @@ static struct tracer graph_trace __tracer_data = { .pipe_open = graph_trace_open, .close = graph_trace_close, .pipe_close = graph_trace_close, - .wait_pipe = poll_wait_pipe, .init = graph_trace_init, .reset = graph_trace_reset, .print_line = print_graph_function, diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 2aefbee93a6..9bb104f748d 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -151,16 +151,11 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip, atomic_dec(&data->disabled); } - -static struct ftrace_ops trace_ops __read_mostly = -{ - .func = irqsoff_tracer_call, - .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, -}; #endif /* CONFIG_FUNCTION_TRACER */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER -static int irqsoff_set_flag(u32 old_flags, u32 bit, int set) +static int +irqsoff_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) { int cpu; @@ -175,7 +170,7 @@ static int irqsoff_set_flag(u32 old_flags, u32 bit, int set) for_each_possible_cpu(cpu) per_cpu(tracing_cpu, cpu) = 0; - tracing_max_latency = 0; + tr->max_latency = 0; tracing_reset_online_cpus(&irqsoff_trace->trace_buffer); return start_irqsoff_tracer(irqsoff_trace, set); @@ -266,7 +261,8 @@ __trace_function(struct trace_array *tr, #else #define __trace_function trace_function -static int irqsoff_set_flag(u32 old_flags, u32 bit, int set) +static int +irqsoff_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) { return -EINVAL; } @@ -301,13 +297,13 @@ static void irqsoff_print_header(struct seq_file *s) /* * Should this new latency be reported/recorded? */ -static int report_latency(cycle_t delta) +static int report_latency(struct trace_array *tr, cycle_t delta) { if (tracing_thresh) { if (delta < tracing_thresh) return 0; } else { - if (delta <= tracing_max_latency) + if (delta <= tr->max_latency) return 0; } return 1; @@ -331,13 +327,13 @@ check_critical_timing(struct trace_array *tr, pc = preempt_count(); - if (!report_latency(delta)) + if (!report_latency(tr, delta)) goto out; raw_spin_lock_irqsave(&max_trace_lock, flags); /* check if we are still the max latency */ - if (!report_latency(delta)) + if (!report_latency(tr, delta)) goto out_unlock; __trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); @@ -350,7 +346,7 @@ check_critical_timing(struct trace_array *tr, data->critical_end = parent_ip; if (likely(!is_tracing_stopped())) { - tracing_max_latency = delta; + tr->max_latency = delta; update_max_tr_single(tr, current, cpu); } @@ -498,14 +494,14 @@ void trace_hardirqs_off(void) } EXPORT_SYMBOL(trace_hardirqs_off); -void trace_hardirqs_on_caller(unsigned long caller_addr) +__visible void trace_hardirqs_on_caller(unsigned long caller_addr) { if (!preempt_trace() && irq_trace()) stop_critical_timing(CALLER_ADDR0, caller_addr); } EXPORT_SYMBOL(trace_hardirqs_on_caller); -void trace_hardirqs_off_caller(unsigned long caller_addr) +__visible void trace_hardirqs_off_caller(unsigned long caller_addr) { if (!preempt_trace() && irq_trace()) start_critical_timing(CALLER_ADDR0, caller_addr); @@ -529,7 +525,7 @@ void trace_preempt_off(unsigned long a0, unsigned long a1) } #endif /* CONFIG_PREEMPT_TRACER */ -static int register_irqsoff_function(int graph, int set) +static int register_irqsoff_function(struct trace_array *tr, int graph, int set) { int ret; @@ -541,7 +537,7 @@ static int register_irqsoff_function(int graph, int set) ret = register_ftrace_graph(&irqsoff_graph_return, &irqsoff_graph_entry); else - ret = register_ftrace_function(&trace_ops); + ret = register_ftrace_function(tr->ops); if (!ret) function_enabled = true; @@ -549,7 +545,7 @@ static int register_irqsoff_function(int graph, int set) return ret; } -static void unregister_irqsoff_function(int graph) +static void unregister_irqsoff_function(struct trace_array *tr, int graph) { if (!function_enabled) return; @@ -557,23 +553,25 @@ static void unregister_irqsoff_function(int graph) if (graph) unregister_ftrace_graph(); else - unregister_ftrace_function(&trace_ops); + unregister_ftrace_function(tr->ops); function_enabled = false; } -static void irqsoff_function_set(int set) +static void irqsoff_function_set(struct trace_array *tr, int set) { if (set) - register_irqsoff_function(is_graph(), 1); + register_irqsoff_function(tr, is_graph(), 1); else - unregister_irqsoff_function(is_graph()); + unregister_irqsoff_function(tr, is_graph()); } -static int irqsoff_flag_changed(struct tracer *tracer, u32 mask, int set) +static int irqsoff_flag_changed(struct trace_array *tr, u32 mask, int set) { + struct tracer *tracer = tr->current_trace; + if (mask & TRACE_ITER_FUNCTION) - irqsoff_function_set(set); + irqsoff_function_set(tr, set); return trace_keep_overwrite(tracer, mask, set); } @@ -582,7 +580,7 @@ static int start_irqsoff_tracer(struct trace_array *tr, int graph) { int ret; - ret = register_irqsoff_function(graph, 0); + ret = register_irqsoff_function(tr, graph, 0); if (!ret && tracing_is_enabled()) tracer_enabled = 1; @@ -596,25 +594,37 @@ static void stop_irqsoff_tracer(struct trace_array *tr, int graph) { tracer_enabled = 0; - unregister_irqsoff_function(graph); + unregister_irqsoff_function(tr, graph); } -static void __irqsoff_tracer_init(struct trace_array *tr) +static bool irqsoff_busy; + +static int __irqsoff_tracer_init(struct trace_array *tr) { + if (irqsoff_busy) + return -EBUSY; + save_flags = trace_flags; /* non overwrite screws up the latency tracers */ set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1); set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1); - tracing_max_latency = 0; + tr->max_latency = 0; irqsoff_trace = tr; /* make sure that the tracer is visible */ smp_wmb(); tracing_reset_online_cpus(&tr->trace_buffer); - if (start_irqsoff_tracer(tr, is_graph())) + ftrace_init_array_ops(tr, irqsoff_tracer_call); + + /* Only toplevel instance supports graph tracing */ + if (start_irqsoff_tracer(tr, (tr->flags & TRACE_ARRAY_FL_GLOBAL && + is_graph()))) printk(KERN_ERR "failed to start irqsoff tracer\n"); + + irqsoff_busy = true; + return 0; } static void irqsoff_tracer_reset(struct trace_array *tr) @@ -626,6 +636,9 @@ static void irqsoff_tracer_reset(struct trace_array *tr) set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag); set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag); + ftrace_reset_array_ops(tr); + + irqsoff_busy = false; } static void irqsoff_tracer_start(struct trace_array *tr) @@ -643,8 +656,7 @@ static int irqsoff_tracer_init(struct trace_array *tr) { trace_type = TRACER_IRQS_OFF; - __irqsoff_tracer_init(tr); - return 0; + return __irqsoff_tracer_init(tr); } static struct tracer irqsoff_tracer __read_mostly = { @@ -664,6 +676,7 @@ static struct tracer irqsoff_tracer __read_mostly = #endif .open = irqsoff_trace_open, .close = irqsoff_trace_close, + .allow_instances = true, .use_max_tr = true, }; # define register_irqsoff(trace) register_tracer(&trace) @@ -676,8 +689,7 @@ static int preemptoff_tracer_init(struct trace_array *tr) { trace_type = TRACER_PREEMPT_OFF; - __irqsoff_tracer_init(tr); - return 0; + return __irqsoff_tracer_init(tr); } static struct tracer preemptoff_tracer __read_mostly = @@ -698,6 +710,7 @@ static struct tracer preemptoff_tracer __read_mostly = #endif .open = irqsoff_trace_open, .close = irqsoff_trace_close, + .allow_instances = true, .use_max_tr = true, }; # define register_preemptoff(trace) register_tracer(&trace) @@ -712,8 +725,7 @@ static int preemptirqsoff_tracer_init(struct trace_array *tr) { trace_type = TRACER_IRQS_OFF | TRACER_PREEMPT_OFF; - __irqsoff_tracer_init(tr); - return 0; + return __irqsoff_tracer_init(tr); } static struct tracer preemptirqsoff_tracer __read_mostly = @@ -734,6 +746,7 @@ static struct tracer preemptirqsoff_tracer __read_mostly = #endif .open = irqsoff_trace_open, .close = irqsoff_trace_close, + .allow_instances = true, .use_max_tr = true, }; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index bdbae450c13..282f6e4e553 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -35,37 +35,32 @@ struct trace_kprobe { struct trace_probe tp; }; -struct event_file_link { - struct ftrace_event_file *file; - struct list_head list; -}; - #define SIZEOF_TRACE_KPROBE(n) \ (offsetof(struct trace_kprobe, tp.args) + \ (sizeof(struct probe_arg) * (n))) -static __kprobes bool trace_kprobe_is_return(struct trace_kprobe *tk) +static nokprobe_inline bool trace_kprobe_is_return(struct trace_kprobe *tk) { return tk->rp.handler != NULL; } -static __kprobes const char *trace_kprobe_symbol(struct trace_kprobe *tk) +static nokprobe_inline const char *trace_kprobe_symbol(struct trace_kprobe *tk) { return tk->symbol ? tk->symbol : "unknown"; } -static __kprobes unsigned long trace_kprobe_offset(struct trace_kprobe *tk) +static nokprobe_inline unsigned long trace_kprobe_offset(struct trace_kprobe *tk) { return tk->rp.kp.offset; } -static __kprobes bool trace_kprobe_has_gone(struct trace_kprobe *tk) +static nokprobe_inline bool trace_kprobe_has_gone(struct trace_kprobe *tk) { return !!(kprobe_gone(&tk->rp.kp)); } -static __kprobes bool trace_kprobe_within_module(struct trace_kprobe *tk, +static nokprobe_inline bool trace_kprobe_within_module(struct trace_kprobe *tk, struct module *mod) { int len = strlen(mod->name); @@ -73,7 +68,7 @@ static __kprobes bool trace_kprobe_within_module(struct trace_kprobe *tk, return strncmp(mod->name, name, len) == 0 && name[len] == ':'; } -static __kprobes bool trace_kprobe_is_on_module(struct trace_kprobe *tk) +static nokprobe_inline bool trace_kprobe_is_on_module(struct trace_kprobe *tk) { return !!strchr(trace_kprobe_symbol(tk), ':'); } @@ -137,19 +132,21 @@ struct symbol_cache *alloc_symbol_cache(const char *sym, long offset) * Kprobes-specific fetch functions */ #define DEFINE_FETCH_stack(type) \ -static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\ +static void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs, \ void *offset, void *dest) \ { \ *(type *)dest = (type)regs_get_kernel_stack_nth(regs, \ (unsigned int)((unsigned long)offset)); \ -} +} \ +NOKPROBE_SYMBOL(FETCH_FUNC_NAME(stack, type)); + DEFINE_BASIC_FETCH_FUNCS(stack) /* No string on the stack entry */ #define fetch_stack_string NULL #define fetch_stack_string_size NULL #define DEFINE_FETCH_memory(type) \ -static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\ +static void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs, \ void *addr, void *dest) \ { \ type retval; \ @@ -157,14 +154,16 @@ static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\ *(type *)dest = 0; \ else \ *(type *)dest = retval; \ -} +} \ +NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, type)); + DEFINE_BASIC_FETCH_FUNCS(memory) /* * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max * length and relative data location. */ -static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, - void *addr, void *dest) +static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, + void *addr, void *dest) { long ret; int maxlen = get_rloc_len(*(u32 *)dest); @@ -198,10 +197,11 @@ static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, get_rloc_offs(*(u32 *)dest)); } } +NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, string)); /* Return the length of string -- including null terminal byte */ -static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs, - void *addr, void *dest) +static void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs, + void *addr, void *dest) { mm_segment_t old_fs; int ret, len = 0; @@ -224,17 +224,19 @@ static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs, else *(u32 *)dest = len; } +NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, string_size)); #define DEFINE_FETCH_symbol(type) \ -__kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs, \ - void *data, void *dest) \ +void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs, void *data, void *dest)\ { \ struct symbol_cache *sc = data; \ if (sc->addr) \ fetch_memory_##type(regs, (void *)sc->addr, dest); \ else \ *(type *)dest = 0; \ -} +} \ +NOKPROBE_SYMBOL(FETCH_FUNC_NAME(symbol, type)); + DEFINE_BASIC_FETCH_FUNCS(symbol) DEFINE_FETCH_symbol(string) DEFINE_FETCH_symbol(string_size) @@ -346,7 +348,7 @@ static struct trace_kprobe *find_trace_kprobe(const char *event, struct trace_kprobe *tk; list_for_each_entry(tk, &probe_list, list) - if (strcmp(tk->tp.call.name, event) == 0 && + if (strcmp(ftrace_event_name(&tk->tp.call), event) == 0 && strcmp(tk->tp.call.class->system, group) == 0) return tk; return NULL; @@ -387,18 +389,6 @@ enable_trace_kprobe(struct trace_kprobe *tk, struct ftrace_event_file *file) return ret; } -static struct event_file_link * -find_event_file_link(struct trace_probe *tp, struct ftrace_event_file *file) -{ - struct event_file_link *link; - - list_for_each_entry(link, &tp->files, list) - if (link->file == file) - return link; - - return NULL; -} - /* * Disable trace_probe * if the file is NULL, disable "perf" handler, or disable "trace" handler. @@ -533,7 +523,8 @@ static int register_trace_kprobe(struct trace_kprobe *tk) mutex_lock(&probe_lock); /* Delete old (same name) event if exist */ - old_tk = find_trace_kprobe(tk->tp.call.name, tk->tp.call.class->system); + old_tk = find_trace_kprobe(ftrace_event_name(&tk->tp.call), + tk->tp.call.class->system); if (old_tk) { ret = unregister_trace_kprobe(old_tk); if (ret < 0) @@ -581,7 +572,8 @@ static int trace_kprobe_module_callback(struct notifier_block *nb, if (ret) pr_warning("Failed to re-register probe %s on" "%s: %d\n", - tk->tp.call.name, mod->name, ret); + ftrace_event_name(&tk->tp.call), + mod->name, ret); } } mutex_unlock(&probe_lock); @@ -835,7 +827,8 @@ static int probes_seq_show(struct seq_file *m, void *v) int i; seq_printf(m, "%c", trace_kprobe_is_return(tk) ? 'r' : 'p'); - seq_printf(m, ":%s/%s", tk->tp.call.class->system, tk->tp.call.name); + seq_printf(m, ":%s/%s", tk->tp.call.class->system, + ftrace_event_name(&tk->tp.call)); if (!tk->symbol) seq_printf(m, " 0x%p", tk->rp.kp.addr); @@ -893,7 +886,8 @@ static int probes_profile_seq_show(struct seq_file *m, void *v) { struct trace_kprobe *tk = v; - seq_printf(m, " %-44s %15lu %15lu\n", tk->tp.call.name, tk->nhit, + seq_printf(m, " %-44s %15lu %15lu\n", + ftrace_event_name(&tk->tp.call), tk->nhit, tk->rp.kp.nmissed); return 0; @@ -920,7 +914,7 @@ static const struct file_operations kprobe_profile_ops = { }; /* Kprobe handler */ -static __kprobes void +static nokprobe_inline void __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs, struct ftrace_event_file *ftrace_file) { @@ -956,7 +950,7 @@ __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs, entry, irq_flags, pc, regs); } -static __kprobes void +static void kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs) { struct event_file_link *link; @@ -964,9 +958,10 @@ kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs) list_for_each_entry_rcu(link, &tk->tp.files, list) __kprobe_trace_func(tk, regs, link->file); } +NOKPROBE_SYMBOL(kprobe_trace_func); /* Kretprobe handler */ -static __kprobes void +static nokprobe_inline void __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, struct pt_regs *regs, struct ftrace_event_file *ftrace_file) @@ -1004,7 +999,7 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, entry, irq_flags, pc, regs); } -static __kprobes void +static void kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, struct pt_regs *regs) { @@ -1013,6 +1008,7 @@ kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, list_for_each_entry_rcu(link, &tk->tp.files, list) __kretprobe_trace_func(tk, ri, regs, link->file); } +NOKPROBE_SYMBOL(kretprobe_trace_func); /* Event entry printers */ static enum print_line_t @@ -1028,7 +1024,7 @@ print_kprobe_event(struct trace_iterator *iter, int flags, field = (struct kprobe_trace_entry_head *)iter->ent; tp = container_of(event, struct trace_probe, call.event); - if (!trace_seq_printf(s, "%s: (", tp->call.name)) + if (!trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call))) goto partial; if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET)) @@ -1064,7 +1060,7 @@ print_kretprobe_event(struct trace_iterator *iter, int flags, field = (struct kretprobe_trace_entry_head *)iter->ent; tp = container_of(event, struct trace_probe, call.event); - if (!trace_seq_printf(s, "%s: (", tp->call.name)) + if (!trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call))) goto partial; if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET)) @@ -1144,7 +1140,7 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call) #ifdef CONFIG_PERF_EVENTS /* Kprobe profile handler */ -static __kprobes void +static void kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) { struct ftrace_event_call *call = &tk->tp.call; @@ -1171,9 +1167,10 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL); } +NOKPROBE_SYMBOL(kprobe_perf_func); /* Kretprobe profile handler */ -static __kprobes void +static void kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, struct pt_regs *regs) { @@ -1201,6 +1198,7 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL); } +NOKPROBE_SYMBOL(kretprobe_perf_func); #endif /* CONFIG_PERF_EVENTS */ /* @@ -1209,9 +1207,8 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, * kprobe_trace_self_tests_init() does enable_trace_probe/disable_trace_probe * lockless, but we can't race with this __init function. */ -static __kprobes -int kprobe_register(struct ftrace_event_call *event, - enum trace_reg type, void *data) +static int kprobe_register(struct ftrace_event_call *event, + enum trace_reg type, void *data) { struct trace_kprobe *tk = (struct trace_kprobe *)event->data; struct ftrace_event_file *file = data; @@ -1237,8 +1234,7 @@ int kprobe_register(struct ftrace_event_call *event, return 0; } -static __kprobes -int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) +static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) { struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp); @@ -1252,9 +1248,10 @@ int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) #endif return 0; /* We don't tweek kernel, so just return 0 */ } +NOKPROBE_SYMBOL(kprobe_dispatcher); -static __kprobes -int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) +static int +kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) { struct trace_kprobe *tk = container_of(ri->rp, struct trace_kprobe, rp); @@ -1268,6 +1265,7 @@ int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) #endif return 0; /* We don't tweek kernel, so just return 0 */ } +NOKPROBE_SYMBOL(kretprobe_dispatcher); static struct trace_event_functions kretprobe_funcs = { .trace = print_kretprobe_event @@ -1303,7 +1301,8 @@ static int register_kprobe_event(struct trace_kprobe *tk) call->data = tk; ret = trace_add_event_call(call); if (ret) { - pr_info("Failed to register kprobe event: %s\n", call->name); + pr_info("Failed to register kprobe event: %s\n", + ftrace_event_name(call)); kfree(call->print_fmt); unregister_ftrace_event(&call->event); } @@ -1389,6 +1388,9 @@ static __init int kprobe_trace_self_tests_init(void) struct trace_kprobe *tk; struct ftrace_event_file *file; + if (tracing_is_disabled()) + return -ENODEV; + target = kprobe_trace_selftest_target; pr_info("Testing kprobe tracing: "); diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c index 394f94417e2..fcf0a9e4891 100644 --- a/kernel/trace/trace_nop.c +++ b/kernel/trace/trace_nop.c @@ -62,7 +62,7 @@ static void nop_trace_reset(struct trace_array *tr) * If you don't implement it, then the flag setting will be * automatically accepted. */ -static int nop_set_flag(u32 old_flags, u32 bit, int set) +static int nop_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) { /* * Note that you don't need to update nop_flags.val yourself. @@ -91,11 +91,11 @@ struct tracer nop_trace __read_mostly = .name = "nop", .init = nop_trace_init, .reset = nop_trace_reset, - .wait_pipe = poll_wait_pipe, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_nop, #endif .flags = &nop_flags, - .set_flag = nop_set_flag + .set_flag = nop_set_flag, + .allow_instances = true, }; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index ed32284fbe3..f3dad80c20b 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -126,6 +126,34 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...) EXPORT_SYMBOL_GPL(trace_seq_printf); /** + * trace_seq_bitmask - put a list of longs as a bitmask print output + * @s: trace sequence descriptor + * @maskp: points to an array of unsigned longs that represent a bitmask + * @nmaskbits: The number of bits that are valid in @maskp + * + * It returns 0 if the trace oversizes the buffer's free + * space, 1 otherwise. + * + * Writes a ASCII representation of a bitmask string into @s. + */ +int +trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp, + int nmaskbits) +{ + int len = (PAGE_SIZE - 1) - s->len; + int ret; + + if (s->full || !len) + return 0; + + ret = bitmap_scnprintf(s->buffer, len, maskp, nmaskbits); + s->len += ret; + + return 1; +} +EXPORT_SYMBOL_GPL(trace_seq_bitmask); + +/** * trace_seq_vprintf - sequence printing of trace information * @s: trace sequence descriptor * @fmt: printf format string @@ -399,6 +427,19 @@ EXPORT_SYMBOL(ftrace_print_symbols_seq_u64); #endif const char * +ftrace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr, + unsigned int bitmask_size) +{ + const char *ret = p->buffer + p->len; + + trace_seq_bitmask(p, bitmask_ptr, bitmask_size * 8); + trace_seq_putc(p, 0); + + return ret; +} +EXPORT_SYMBOL_GPL(ftrace_print_bitmask_seq); + +const char * ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len) { int i; @@ -431,7 +472,7 @@ int ftrace_raw_output_prep(struct trace_iterator *iter, } trace_seq_init(p); - ret = trace_seq_printf(s, "%s: ", event->name); + ret = trace_seq_printf(s, "%s: ", ftrace_event_name(event)); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -439,6 +480,37 @@ int ftrace_raw_output_prep(struct trace_iterator *iter, } EXPORT_SYMBOL(ftrace_raw_output_prep); +static int ftrace_output_raw(struct trace_iterator *iter, char *name, + char *fmt, va_list ap) +{ + struct trace_seq *s = &iter->seq; + int ret; + + ret = trace_seq_printf(s, "%s: ", name); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + ret = trace_seq_vprintf(s, fmt, ap); + + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + +int ftrace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...) +{ + va_list ap; + int ret; + + va_start(ap, fmt); + ret = ftrace_output_raw(iter, name, fmt, ap); + va_end(ap); + + return ret; +} +EXPORT_SYMBOL_GPL(ftrace_output_call); + #ifdef CONFIG_KRETPROBES static inline const char *kretprobed(const char *name) { diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 8364a421b4d..d4b9fc22cd2 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -37,13 +37,13 @@ const char *reserved_field_names[] = { /* Printing in basic type function template */ #define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt) \ -__kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \ - const char *name, \ - void *data, void *ent) \ +int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, const char *name, \ + void *data, void *ent) \ { \ return trace_seq_printf(s, " %s=" fmt, name, *(type *)data); \ } \ -const char PRINT_TYPE_FMT_NAME(type)[] = fmt; +const char PRINT_TYPE_FMT_NAME(type)[] = fmt; \ +NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(type)); DEFINE_BASIC_PRINT_TYPE_FUNC(u8 , "0x%x") DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "0x%x") @@ -55,9 +55,8 @@ DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%d") DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%Ld") /* Print type function for string type */ -__kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, - const char *name, - void *data, void *ent) +int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, const char *name, + void *data, void *ent) { int len = *(u32 *)data >> 16; @@ -67,6 +66,7 @@ __kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, return trace_seq_printf(s, " %s=\"%s\"", name, (const char *)get_loc_data(data, ent)); } +NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(string)); const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\""; @@ -81,23 +81,24 @@ const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\""; /* Data fetch function templates */ #define DEFINE_FETCH_reg(type) \ -__kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \ - void *offset, void *dest) \ +void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, void *offset, void *dest) \ { \ *(type *)dest = (type)regs_get_register(regs, \ (unsigned int)((unsigned long)offset)); \ -} +} \ +NOKPROBE_SYMBOL(FETCH_FUNC_NAME(reg, type)); DEFINE_BASIC_FETCH_FUNCS(reg) /* No string on the register */ #define fetch_reg_string NULL #define fetch_reg_string_size NULL #define DEFINE_FETCH_retval(type) \ -__kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs, \ - void *dummy, void *dest) \ +void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs, \ + void *dummy, void *dest) \ { \ *(type *)dest = (type)regs_return_value(regs); \ -} +} \ +NOKPROBE_SYMBOL(FETCH_FUNC_NAME(retval, type)); DEFINE_BASIC_FETCH_FUNCS(retval) /* No string on the retval */ #define fetch_retval_string NULL @@ -112,8 +113,8 @@ struct deref_fetch_param { }; #define DEFINE_FETCH_deref(type) \ -__kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs, \ - void *data, void *dest) \ +void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs, \ + void *data, void *dest) \ { \ struct deref_fetch_param *dprm = data; \ unsigned long addr; \ @@ -123,12 +124,13 @@ __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs, \ dprm->fetch(regs, (void *)addr, dest); \ } else \ *(type *)dest = 0; \ -} +} \ +NOKPROBE_SYMBOL(FETCH_FUNC_NAME(deref, type)); DEFINE_BASIC_FETCH_FUNCS(deref) DEFINE_FETCH_deref(string) -__kprobes void FETCH_FUNC_NAME(deref, string_size)(struct pt_regs *regs, - void *data, void *dest) +void FETCH_FUNC_NAME(deref, string_size)(struct pt_regs *regs, + void *data, void *dest) { struct deref_fetch_param *dprm = data; unsigned long addr; @@ -140,16 +142,18 @@ __kprobes void FETCH_FUNC_NAME(deref, string_size)(struct pt_regs *regs, } else *(string_size *)dest = 0; } +NOKPROBE_SYMBOL(FETCH_FUNC_NAME(deref, string_size)); -static __kprobes void update_deref_fetch_param(struct deref_fetch_param *data) +static void update_deref_fetch_param(struct deref_fetch_param *data) { if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) update_deref_fetch_param(data->orig.data); else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) update_symbol_cache(data->orig.data); } +NOKPROBE_SYMBOL(update_deref_fetch_param); -static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) +static void free_deref_fetch_param(struct deref_fetch_param *data) { if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) free_deref_fetch_param(data->orig.data); @@ -157,6 +161,7 @@ static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) free_symbol_cache(data->orig.data); kfree(data); } +NOKPROBE_SYMBOL(free_deref_fetch_param); /* Bitfield fetch function */ struct bitfield_fetch_param { @@ -166,8 +171,8 @@ struct bitfield_fetch_param { }; #define DEFINE_FETCH_bitfield(type) \ -__kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs, \ - void *data, void *dest) \ +void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs, \ + void *data, void *dest) \ { \ struct bitfield_fetch_param *bprm = data; \ type buf = 0; \ @@ -177,13 +182,13 @@ __kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs, \ buf >>= bprm->low_shift; \ } \ *(type *)dest = buf; \ -} - +} \ +NOKPROBE_SYMBOL(FETCH_FUNC_NAME(bitfield, type)); DEFINE_BASIC_FETCH_FUNCS(bitfield) #define fetch_bitfield_string NULL #define fetch_bitfield_string_size NULL -static __kprobes void +static void update_bitfield_fetch_param(struct bitfield_fetch_param *data) { /* @@ -196,7 +201,7 @@ update_bitfield_fetch_param(struct bitfield_fetch_param *data) update_symbol_cache(data->orig.data); } -static __kprobes void +static void free_bitfield_fetch_param(struct bitfield_fetch_param *data) { /* @@ -255,17 +260,17 @@ fail: } /* Special function : only accept unsigned long */ -static __kprobes void fetch_kernel_stack_address(struct pt_regs *regs, - void *dummy, void *dest) +static void fetch_kernel_stack_address(struct pt_regs *regs, void *dummy, void *dest) { *(unsigned long *)dest = kernel_stack_pointer(regs); } +NOKPROBE_SYMBOL(fetch_kernel_stack_address); -static __kprobes void fetch_user_stack_address(struct pt_regs *regs, - void *dummy, void *dest) +static void fetch_user_stack_address(struct pt_regs *regs, void *dummy, void *dest) { *(unsigned long *)dest = user_stack_pointer(regs); } +NOKPROBE_SYMBOL(fetch_user_stack_address); static fetch_func_t get_fetch_size_function(const struct fetch_type *type, fetch_func_t orig_fn, diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index b73574a5f42..4f815fbce16 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -81,13 +81,13 @@ */ #define convert_rloc_to_loc(dl, offs) ((u32)(dl) + (offs)) -static inline void *get_rloc_data(u32 *dl) +static nokprobe_inline void *get_rloc_data(u32 *dl) { return (u8 *)dl + get_rloc_offs(*dl); } /* For data_loc conversion */ -static inline void *get_loc_data(u32 *dl, void *ent) +static nokprobe_inline void *get_loc_data(u32 *dl, void *ent) { return (u8 *)ent + get_rloc_offs(*dl); } @@ -136,9 +136,8 @@ typedef u32 string_size; /* Printing in basic type function template */ #define DECLARE_BASIC_PRINT_TYPE_FUNC(type) \ -__kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \ - const char *name, \ - void *data, void *ent); \ +int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, const char *name, \ + void *data, void *ent); \ extern const char PRINT_TYPE_FMT_NAME(type)[] DECLARE_BASIC_PRINT_TYPE_FUNC(u8); @@ -288,6 +287,11 @@ struct trace_probe { struct probe_arg args[]; }; +struct event_file_link { + struct ftrace_event_file *file; + struct list_head list; +}; + static inline bool trace_probe_is_enabled(struct trace_probe *tp) { return !!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE)); @@ -298,7 +302,7 @@ static inline bool trace_probe_is_registered(struct trace_probe *tp) return !!(tp->flags & TP_FLAG_REGISTERED); } -static inline __kprobes void call_fetch(struct fetch_param *fprm, +static nokprobe_inline void call_fetch(struct fetch_param *fprm, struct pt_regs *regs, void *dest) { return fprm->fn(regs, fprm->data, dest); @@ -316,6 +320,18 @@ static inline int is_good_name(const char *name) return 1; } +static inline struct event_file_link * +find_event_file_link(struct trace_probe *tp, struct ftrace_event_file *file) +{ + struct event_file_link *link; + + list_for_each_entry(link, &tp->files, list) + if (link->file == file) + return link; + + return NULL; +} + extern int traceprobe_parse_probe_arg(char *arg, ssize_t *size, struct probe_arg *parg, bool is_return, bool is_kprobe); @@ -334,7 +350,7 @@ extern ssize_t traceprobe_probes_write(struct file *file, extern int traceprobe_command(const char *buf, int (*createfn)(int, char**)); /* Sum up total data length for dynamic arraies (strings) */ -static inline __kprobes int +static nokprobe_inline int __get_data_size(struct trace_probe *tp, struct pt_regs *regs) { int i, ret = 0; @@ -350,7 +366,7 @@ __get_data_size(struct trace_probe *tp, struct pt_regs *regs) } /* Store the value of each argument */ -static inline __kprobes void +static nokprobe_inline void store_trace_args(int ent_size, struct trace_probe *tp, struct pt_regs *regs, u8 *data, int maxlen) { diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 6e32635e5e5..19bd8928ce9 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -130,15 +130,9 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip, atomic_dec(&data->disabled); preempt_enable_notrace(); } - -static struct ftrace_ops trace_ops __read_mostly = -{ - .func = wakeup_tracer_call, - .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, -}; #endif /* CONFIG_FUNCTION_TRACER */ -static int register_wakeup_function(int graph, int set) +static int register_wakeup_function(struct trace_array *tr, int graph, int set) { int ret; @@ -150,7 +144,7 @@ static int register_wakeup_function(int graph, int set) ret = register_ftrace_graph(&wakeup_graph_return, &wakeup_graph_entry); else - ret = register_ftrace_function(&trace_ops); + ret = register_ftrace_function(tr->ops); if (!ret) function_enabled = true; @@ -158,7 +152,7 @@ static int register_wakeup_function(int graph, int set) return ret; } -static void unregister_wakeup_function(int graph) +static void unregister_wakeup_function(struct trace_array *tr, int graph) { if (!function_enabled) return; @@ -166,32 +160,34 @@ static void unregister_wakeup_function(int graph) if (graph) unregister_ftrace_graph(); else - unregister_ftrace_function(&trace_ops); + unregister_ftrace_function(tr->ops); function_enabled = false; } -static void wakeup_function_set(int set) +static void wakeup_function_set(struct trace_array *tr, int set) { if (set) - register_wakeup_function(is_graph(), 1); + register_wakeup_function(tr, is_graph(), 1); else - unregister_wakeup_function(is_graph()); + unregister_wakeup_function(tr, is_graph()); } -static int wakeup_flag_changed(struct tracer *tracer, u32 mask, int set) +static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set) { + struct tracer *tracer = tr->current_trace; + if (mask & TRACE_ITER_FUNCTION) - wakeup_function_set(set); + wakeup_function_set(tr, set); return trace_keep_overwrite(tracer, mask, set); } -static int start_func_tracer(int graph) +static int start_func_tracer(struct trace_array *tr, int graph) { int ret; - ret = register_wakeup_function(graph, 0); + ret = register_wakeup_function(tr, graph, 0); if (!ret && tracing_is_enabled()) tracer_enabled = 1; @@ -201,15 +197,16 @@ static int start_func_tracer(int graph) return ret; } -static void stop_func_tracer(int graph) +static void stop_func_tracer(struct trace_array *tr, int graph) { tracer_enabled = 0; - unregister_wakeup_function(graph); + unregister_wakeup_function(tr, graph); } #ifdef CONFIG_FUNCTION_GRAPH_TRACER -static int wakeup_set_flag(u32 old_flags, u32 bit, int set) +static int +wakeup_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) { if (!(bit & TRACE_DISPLAY_GRAPH)) @@ -218,12 +215,12 @@ static int wakeup_set_flag(u32 old_flags, u32 bit, int set) if (!(is_graph() ^ set)) return 0; - stop_func_tracer(!set); + stop_func_tracer(tr, !set); wakeup_reset(wakeup_trace); - tracing_max_latency = 0; + tr->max_latency = 0; - return start_func_tracer(set); + return start_func_tracer(tr, set); } static int wakeup_graph_entry(struct ftrace_graph_ent *trace) @@ -311,7 +308,8 @@ __trace_function(struct trace_array *tr, #else #define __trace_function trace_function -static int wakeup_set_flag(u32 old_flags, u32 bit, int set) +static int +wakeup_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) { return -EINVAL; } @@ -346,13 +344,13 @@ static void wakeup_print_header(struct seq_file *s) /* * Should this new latency be reported/recorded? */ -static int report_latency(cycle_t delta) +static int report_latency(struct trace_array *tr, cycle_t delta) { if (tracing_thresh) { if (delta < tracing_thresh) return 0; } else { - if (delta <= tracing_max_latency) + if (delta <= tr->max_latency) return 0; } return 1; @@ -420,11 +418,11 @@ probe_wakeup_sched_switch(void *ignore, T1 = ftrace_now(cpu); delta = T1-T0; - if (!report_latency(delta)) + if (!report_latency(wakeup_trace, delta)) goto out_unlock; if (likely(!is_tracing_stopped())) { - tracing_max_latency = delta; + wakeup_trace->max_latency = delta; update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu); } @@ -583,7 +581,7 @@ static void start_wakeup_tracer(struct trace_array *tr) */ smp_wmb(); - if (start_func_tracer(is_graph())) + if (start_func_tracer(tr, is_graph())) printk(KERN_ERR "failed to start wakeup tracer\n"); return; @@ -596,13 +594,15 @@ fail_deprobe: static void stop_wakeup_tracer(struct trace_array *tr) { tracer_enabled = 0; - stop_func_tracer(is_graph()); + stop_func_tracer(tr, is_graph()); unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL); unregister_trace_sched_wakeup_new(probe_wakeup, NULL); unregister_trace_sched_wakeup(probe_wakeup, NULL); unregister_trace_sched_migrate_task(probe_wakeup_migrate_task, NULL); } +static bool wakeup_busy; + static int __wakeup_tracer_init(struct trace_array *tr) { save_flags = trace_flags; @@ -611,14 +611,20 @@ static int __wakeup_tracer_init(struct trace_array *tr) set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1); set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1); - tracing_max_latency = 0; + tr->max_latency = 0; wakeup_trace = tr; + ftrace_init_array_ops(tr, wakeup_tracer_call); start_wakeup_tracer(tr); + + wakeup_busy = true; return 0; } static int wakeup_tracer_init(struct trace_array *tr) { + if (wakeup_busy) + return -EBUSY; + wakeup_dl = 0; wakeup_rt = 0; return __wakeup_tracer_init(tr); @@ -626,6 +632,9 @@ static int wakeup_tracer_init(struct trace_array *tr) static int wakeup_rt_tracer_init(struct trace_array *tr) { + if (wakeup_busy) + return -EBUSY; + wakeup_dl = 0; wakeup_rt = 1; return __wakeup_tracer_init(tr); @@ -633,6 +642,9 @@ static int wakeup_rt_tracer_init(struct trace_array *tr) static int wakeup_dl_tracer_init(struct trace_array *tr) { + if (wakeup_busy) + return -EBUSY; + wakeup_dl = 1; wakeup_rt = 0; return __wakeup_tracer_init(tr); @@ -649,6 +661,8 @@ static void wakeup_tracer_reset(struct trace_array *tr) set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag); set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag); + ftrace_reset_array_ops(tr); + wakeup_busy = false; } static void wakeup_tracer_start(struct trace_array *tr) @@ -680,6 +694,7 @@ static struct tracer wakeup_tracer __read_mostly = #endif .open = wakeup_trace_open, .close = wakeup_trace_close, + .allow_instances = true, .use_max_tr = true, }; @@ -690,7 +705,6 @@ static struct tracer wakeup_rt_tracer __read_mostly = .reset = wakeup_tracer_reset, .start = wakeup_tracer_start, .stop = wakeup_tracer_stop, - .wait_pipe = poll_wait_pipe, .print_max = true, .print_header = wakeup_print_header, .print_line = wakeup_print_line, @@ -702,6 +716,7 @@ static struct tracer wakeup_rt_tracer __read_mostly = #endif .open = wakeup_trace_open, .close = wakeup_trace_close, + .allow_instances = true, .use_max_tr = true, }; @@ -712,7 +727,6 @@ static struct tracer wakeup_dl_tracer __read_mostly = .reset = wakeup_tracer_reset, .start = wakeup_tracer_start, .stop = wakeup_tracer_stop, - .wait_pipe = poll_wait_pipe, .print_max = true, .print_header = wakeup_print_header, .print_line = wakeup_print_line, diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index e98fca60974..5ef60499dc8 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -65,7 +65,7 @@ static int trace_test_buffer(struct trace_buffer *buf, unsigned long *count) /* Don't allow flipping of max traces now */ local_irq_save(flags); - arch_spin_lock(&ftrace_max_lock); + arch_spin_lock(&buf->tr->max_lock); cnt = ring_buffer_entries(buf->buffer); @@ -83,7 +83,7 @@ static int trace_test_buffer(struct trace_buffer *buf, unsigned long *count) break; } tracing_on(); - arch_spin_unlock(&ftrace_max_lock); + arch_spin_unlock(&buf->tr->max_lock); local_irq_restore(flags); if (count) @@ -161,11 +161,6 @@ static struct ftrace_ops test_probe3 = { .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; -static struct ftrace_ops test_global = { - .func = trace_selftest_test_global_func, - .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, -}; - static void print_counts(void) { printk("(%d %d %d %d %d) ", @@ -185,7 +180,7 @@ static void reset_counts(void) trace_selftest_test_dyn_cnt = 0; } -static int trace_selftest_ops(int cnt) +static int trace_selftest_ops(struct trace_array *tr, int cnt) { int save_ftrace_enabled = ftrace_enabled; struct ftrace_ops *dyn_ops; @@ -220,7 +215,11 @@ static int trace_selftest_ops(int cnt) register_ftrace_function(&test_probe1); register_ftrace_function(&test_probe2); register_ftrace_function(&test_probe3); - register_ftrace_function(&test_global); + /* First time we are running with main function */ + if (cnt > 1) { + ftrace_init_array_ops(tr, trace_selftest_test_global_func); + register_ftrace_function(tr->ops); + } DYN_FTRACE_TEST_NAME(); @@ -232,8 +231,10 @@ static int trace_selftest_ops(int cnt) goto out; if (trace_selftest_test_probe3_cnt != 1) goto out; - if (trace_selftest_test_global_cnt == 0) - goto out; + if (cnt > 1) { + if (trace_selftest_test_global_cnt == 0) + goto out; + } DYN_FTRACE_TEST_NAME2(); @@ -269,8 +270,10 @@ static int trace_selftest_ops(int cnt) goto out_free; if (trace_selftest_test_probe3_cnt != 3) goto out_free; - if (trace_selftest_test_global_cnt == 0) - goto out; + if (cnt > 1) { + if (trace_selftest_test_global_cnt == 0) + goto out; + } if (trace_selftest_test_dyn_cnt == 0) goto out_free; @@ -295,7 +298,9 @@ static int trace_selftest_ops(int cnt) unregister_ftrace_function(&test_probe1); unregister_ftrace_function(&test_probe2); unregister_ftrace_function(&test_probe3); - unregister_ftrace_function(&test_global); + if (cnt > 1) + unregister_ftrace_function(tr->ops); + ftrace_reset_array_ops(tr); /* Make sure everything is off */ reset_counts(); @@ -315,9 +320,9 @@ static int trace_selftest_ops(int cnt) } /* Test dynamic code modification and ftrace filters */ -int trace_selftest_startup_dynamic_tracing(struct tracer *trace, - struct trace_array *tr, - int (*func)(void)) +static int trace_selftest_startup_dynamic_tracing(struct tracer *trace, + struct trace_array *tr, + int (*func)(void)) { int save_ftrace_enabled = ftrace_enabled; unsigned long count; @@ -388,7 +393,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, } /* Test the ops with global tracing running */ - ret = trace_selftest_ops(1); + ret = trace_selftest_ops(tr, 1); trace->reset(tr); out: @@ -399,7 +404,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, /* Test the ops with global tracing off */ if (!ret) - ret = trace_selftest_ops(2); + ret = trace_selftest_ops(tr, 2); return ret; } @@ -802,7 +807,7 @@ out: int trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr) { - unsigned long save_max = tracing_max_latency; + unsigned long save_max = tr->max_latency; unsigned long count; int ret; @@ -814,7 +819,7 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr) } /* reset the max latency */ - tracing_max_latency = 0; + tr->max_latency = 0; /* disable interrupts for a bit */ local_irq_disable(); udelay(100); @@ -841,7 +846,7 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr) ret = -1; } - tracing_max_latency = save_max; + tr->max_latency = save_max; return ret; } @@ -851,7 +856,7 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr) int trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr) { - unsigned long save_max = tracing_max_latency; + unsigned long save_max = tr->max_latency; unsigned long count; int ret; @@ -876,7 +881,7 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr) } /* reset the max latency */ - tracing_max_latency = 0; + tr->max_latency = 0; /* disable preemption for a bit */ preempt_disable(); udelay(100); @@ -903,7 +908,7 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr) ret = -1; } - tracing_max_latency = save_max; + tr->max_latency = save_max; return ret; } @@ -913,7 +918,7 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr) int trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *tr) { - unsigned long save_max = tracing_max_latency; + unsigned long save_max = tr->max_latency; unsigned long count; int ret; @@ -938,7 +943,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * } /* reset the max latency */ - tracing_max_latency = 0; + tr->max_latency = 0; /* disable preemption and interrupts for a bit */ preempt_disable(); @@ -973,7 +978,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * } /* do the test by disabling interrupts first this time */ - tracing_max_latency = 0; + tr->max_latency = 0; tracing_start(); trace->start(tr); @@ -1004,7 +1009,7 @@ out: tracing_start(); out_no_start: trace->reset(tr); - tracing_max_latency = save_max; + tr->max_latency = save_max; return ret; } @@ -1057,7 +1062,7 @@ static int trace_wakeup_test_thread(void *data) int trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) { - unsigned long save_max = tracing_max_latency; + unsigned long save_max = tr->max_latency; struct task_struct *p; struct completion is_ready; unsigned long count; @@ -1083,7 +1088,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) } /* reset the max latency */ - tracing_max_latency = 0; + tr->max_latency = 0; while (p->on_rq) { /* @@ -1113,7 +1118,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) trace->reset(tr); tracing_start(); - tracing_max_latency = save_max; + tr->max_latency = save_max; /* kill the thread */ kthread_stop(p); diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index e6be585cf06..8a4e5cb66a4 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -13,6 +13,7 @@ #include <linux/sysctl.h> #include <linux/init.h> #include <linux/fs.h> +#include <linux/magic.h> #include <asm/setup.h> @@ -50,11 +51,33 @@ static DEFINE_MUTEX(stack_sysctl_mutex); int stack_tracer_enabled; static int last_stack_tracer_enabled; +static inline void print_max_stack(void) +{ + long i; + int size; + + pr_emerg(" Depth Size Location (%d entries)\n" + " ----- ---- --------\n", + max_stack_trace.nr_entries - 1); + + for (i = 0; i < max_stack_trace.nr_entries; i++) { + if (stack_dump_trace[i] == ULONG_MAX) + break; + if (i+1 == max_stack_trace.nr_entries || + stack_dump_trace[i+1] == ULONG_MAX) + size = stack_dump_index[i]; + else + size = stack_dump_index[i] - stack_dump_index[i+1]; + + pr_emerg("%3ld) %8d %5d %pS\n", i, stack_dump_index[i], + size, (void *)stack_dump_trace[i]); + } +} + static inline void check_stack(unsigned long ip, unsigned long *stack) { - unsigned long this_size, flags; - unsigned long *p, *top, *start; + unsigned long this_size, flags; unsigned long *p, *top, *start; static int tracer_frame; int frame_size = ACCESS_ONCE(tracer_frame); int i; @@ -84,8 +107,12 @@ check_stack(unsigned long ip, unsigned long *stack) max_stack_size = this_size; - max_stack_trace.nr_entries = 0; - max_stack_trace.skip = 3; + max_stack_trace.nr_entries = 0; + + if (using_ftrace_ops_list_func()) + max_stack_trace.skip = 4; + else + max_stack_trace.skip = 3; save_stack_trace(&max_stack_trace); @@ -144,6 +171,12 @@ check_stack(unsigned long ip, unsigned long *stack) i++; } + if ((current != &init_task && + *(end_of_stack(current)) != STACK_END_MAGIC)) { + print_max_stack(); + BUG(); + } + out: arch_spin_unlock(&max_stack_lock); local_irq_restore(flags); diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 79e52d93860..3c9b97e6b1f 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -108,8 +108,8 @@ static unsigned long get_user_stack_nth(struct pt_regs *regs, unsigned int n) * Uprobes-specific fetch functions */ #define DEFINE_FETCH_stack(type) \ -static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\ - void *offset, void *dest) \ +static void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs, \ + void *offset, void *dest) \ { \ *(type *)dest = (type)get_user_stack_nth(regs, \ ((unsigned long)offset)); \ @@ -120,8 +120,8 @@ DEFINE_BASIC_FETCH_FUNCS(stack) #define fetch_stack_string_size NULL #define DEFINE_FETCH_memory(type) \ -static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\ - void *addr, void *dest) \ +static void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs, \ + void *addr, void *dest) \ { \ type retval; \ void __user *vaddr = (void __force __user *) addr; \ @@ -136,8 +136,8 @@ DEFINE_BASIC_FETCH_FUNCS(memory) * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max * length and relative data location. */ -static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, - void *addr, void *dest) +static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, + void *addr, void *dest) { long ret; u32 rloc = *(u32 *)dest; @@ -158,8 +158,8 @@ static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, } } -static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs, - void *addr, void *dest) +static void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs, + void *addr, void *dest) { int len; void __user *vaddr = (void __force __user *) addr; @@ -184,8 +184,8 @@ static unsigned long translate_user_vaddr(void *file_offset) } #define DEFINE_FETCH_file_offset(type) \ -static __kprobes void FETCH_FUNC_NAME(file_offset, type)(struct pt_regs *regs,\ - void *offset, void *dest) \ +static void FETCH_FUNC_NAME(file_offset, type)(struct pt_regs *regs, \ + void *offset, void *dest)\ { \ void *vaddr = (void *)translate_user_vaddr(offset); \ \ @@ -260,6 +260,7 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret) goto error; INIT_LIST_HEAD(&tu->list); + INIT_LIST_HEAD(&tu->tp.files); tu->consumer.handler = uprobe_dispatcher; if (is_ret) tu->consumer.ret_handler = uretprobe_dispatcher; @@ -293,7 +294,7 @@ static struct trace_uprobe *find_probe_event(const char *event, const char *grou struct trace_uprobe *tu; list_for_each_entry(tu, &uprobe_list, list) - if (strcmp(tu->tp.call.name, event) == 0 && + if (strcmp(ftrace_event_name(&tu->tp.call), event) == 0 && strcmp(tu->tp.call.class->system, group) == 0) return tu; @@ -323,7 +324,8 @@ static int register_trace_uprobe(struct trace_uprobe *tu) mutex_lock(&uprobe_lock); /* register as an event */ - old_tu = find_probe_event(tu->tp.call.name, tu->tp.call.class->system); + old_tu = find_probe_event(ftrace_event_name(&tu->tp.call), + tu->tp.call.class->system); if (old_tu) { /* delete old event */ ret = unregister_trace_uprobe(old_tu); @@ -598,7 +600,8 @@ static int probes_seq_show(struct seq_file *m, void *v) char c = is_ret_probe(tu) ? 'r' : 'p'; int i; - seq_printf(m, "%c:%s/%s", c, tu->tp.call.class->system, tu->tp.call.name); + seq_printf(m, "%c:%s/%s", c, tu->tp.call.class->system, + ftrace_event_name(&tu->tp.call)); seq_printf(m, " %s:0x%p", tu->filename, (void *)tu->offset); for (i = 0; i < tu->tp.nr_args; i++) @@ -648,7 +651,8 @@ static int probes_profile_seq_show(struct seq_file *m, void *v) { struct trace_uprobe *tu = v; - seq_printf(m, " %s %-44s %15lu\n", tu->filename, tu->tp.call.name, tu->nhit); + seq_printf(m, " %s %-44s %15lu\n", tu->filename, + ftrace_event_name(&tu->tp.call), tu->nhit); return 0; } @@ -728,9 +732,15 @@ static int uprobe_buffer_enable(void) static void uprobe_buffer_disable(void) { + int cpu; + BUG_ON(!mutex_is_locked(&event_mutex)); if (--uprobe_buffer_refcnt == 0) { + for_each_possible_cpu(cpu) + free_page((unsigned long)per_cpu_ptr(uprobe_cpu_buffer, + cpu)->buf); + free_percpu(uprobe_cpu_buffer); uprobe_cpu_buffer = NULL; } @@ -758,31 +768,32 @@ static void uprobe_buffer_put(struct uprobe_cpu_buffer *ucb) mutex_unlock(&ucb->mutex); } -static void uprobe_trace_print(struct trace_uprobe *tu, - unsigned long func, struct pt_regs *regs) +static void __uprobe_trace_func(struct trace_uprobe *tu, + unsigned long func, struct pt_regs *regs, + struct uprobe_cpu_buffer *ucb, int dsize, + struct ftrace_event_file *ftrace_file) { struct uprobe_trace_entry_head *entry; struct ring_buffer_event *event; struct ring_buffer *buffer; - struct uprobe_cpu_buffer *ucb; void *data; - int size, dsize, esize; + int size, esize; struct ftrace_event_call *call = &tu->tp.call; - dsize = __get_data_size(&tu->tp, regs); - esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); + WARN_ON(call != ftrace_file->event_call); - if (WARN_ON_ONCE(!uprobe_cpu_buffer || tu->tp.size + dsize > PAGE_SIZE)) + if (WARN_ON_ONCE(tu->tp.size + dsize > PAGE_SIZE)) return; - ucb = uprobe_buffer_get(); - store_trace_args(esize, &tu->tp, regs, ucb->buf, dsize); + if (ftrace_trigger_soft_disabled(ftrace_file)) + return; + esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); size = esize + tu->tp.size + dsize; - event = trace_current_buffer_lock_reserve(&buffer, call->event.type, - size, 0, 0); + event = trace_event_buffer_lock_reserve(&buffer, ftrace_file, + call->event.type, size, 0, 0); if (!event) - goto out; + return; entry = ring_buffer_event_data(event); if (is_ret_probe(tu)) { @@ -796,25 +807,36 @@ static void uprobe_trace_print(struct trace_uprobe *tu, memcpy(data, ucb->buf, tu->tp.size + dsize); - if (!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(buffer, event, 0, 0); - -out: - uprobe_buffer_put(ucb); + event_trigger_unlock_commit(ftrace_file, buffer, event, entry, 0, 0); } /* uprobe handler */ -static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) +static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs, + struct uprobe_cpu_buffer *ucb, int dsize) { - if (!is_ret_probe(tu)) - uprobe_trace_print(tu, 0, regs); + struct event_file_link *link; + + if (is_ret_probe(tu)) + return 0; + + rcu_read_lock(); + list_for_each_entry_rcu(link, &tu->tp.files, list) + __uprobe_trace_func(tu, 0, regs, ucb, dsize, link->file); + rcu_read_unlock(); + return 0; } static void uretprobe_trace_func(struct trace_uprobe *tu, unsigned long func, - struct pt_regs *regs) + struct pt_regs *regs, + struct uprobe_cpu_buffer *ucb, int dsize) { - uprobe_trace_print(tu, func, regs); + struct event_file_link *link; + + rcu_read_lock(); + list_for_each_entry_rcu(link, &tu->tp.files, list) + __uprobe_trace_func(tu, func, regs, ucb, dsize, link->file); + rcu_read_unlock(); } /* Event entry printers */ @@ -831,12 +853,14 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e tu = container_of(event, struct trace_uprobe, tp.call.event); if (is_ret_probe(tu)) { - if (!trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)", tu->tp.call.name, + if (!trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)", + ftrace_event_name(&tu->tp.call), entry->vaddr[1], entry->vaddr[0])) goto partial; data = DATAOF_TRACE_ENTRY(entry, true); } else { - if (!trace_seq_printf(s, "%s: (0x%lx)", tu->tp.call.name, + if (!trace_seq_printf(s, "%s: (0x%lx)", + ftrace_event_name(&tu->tp.call), entry->vaddr[0])) goto partial; data = DATAOF_TRACE_ENTRY(entry, false); @@ -861,37 +885,88 @@ typedef bool (*filter_func_t)(struct uprobe_consumer *self, struct mm_struct *mm); static int -probe_event_enable(struct trace_uprobe *tu, int flag, filter_func_t filter) +probe_event_enable(struct trace_uprobe *tu, struct ftrace_event_file *file, + filter_func_t filter) { - int ret = 0; + bool enabled = trace_probe_is_enabled(&tu->tp); + struct event_file_link *link = NULL; + int ret; - if (trace_probe_is_enabled(&tu->tp)) - return -EINTR; + if (file) { + if (tu->tp.flags & TP_FLAG_PROFILE) + return -EINTR; - ret = uprobe_buffer_enable(); - if (ret < 0) - return ret; + link = kmalloc(sizeof(*link), GFP_KERNEL); + if (!link) + return -ENOMEM; + + link->file = file; + list_add_tail_rcu(&link->list, &tu->tp.files); + + tu->tp.flags |= TP_FLAG_TRACE; + } else { + if (tu->tp.flags & TP_FLAG_TRACE) + return -EINTR; + + tu->tp.flags |= TP_FLAG_PROFILE; + } WARN_ON(!uprobe_filter_is_empty(&tu->filter)); - tu->tp.flags |= flag; + if (enabled) + return 0; + + ret = uprobe_buffer_enable(); + if (ret) + goto err_flags; + tu->consumer.filter = filter; ret = uprobe_register(tu->inode, tu->offset, &tu->consumer); if (ret) - tu->tp.flags &= ~flag; + goto err_buffer; + + return 0; + err_buffer: + uprobe_buffer_disable(); + + err_flags: + if (file) { + list_del(&link->list); + kfree(link); + tu->tp.flags &= ~TP_FLAG_TRACE; + } else { + tu->tp.flags &= ~TP_FLAG_PROFILE; + } return ret; } -static void probe_event_disable(struct trace_uprobe *tu, int flag) +static void +probe_event_disable(struct trace_uprobe *tu, struct ftrace_event_file *file) { if (!trace_probe_is_enabled(&tu->tp)) return; + if (file) { + struct event_file_link *link; + + link = find_event_file_link(&tu->tp, file); + if (!link) + return; + + list_del_rcu(&link->list); + /* synchronize with u{,ret}probe_trace_func */ + synchronize_sched(); + kfree(link); + + if (!list_empty(&tu->tp.files)) + return; + } + WARN_ON(!uprobe_filter_is_empty(&tu->filter)); uprobe_unregister(tu->inode, tu->offset, &tu->consumer); - tu->tp.flags &= ~flag; + tu->tp.flags &= file ? ~TP_FLAG_TRACE : ~TP_FLAG_PROFILE; uprobe_buffer_disable(); } @@ -948,56 +1023,60 @@ uprobe_filter_event(struct trace_uprobe *tu, struct perf_event *event) return __uprobe_perf_filter(&tu->filter, event->hw.tp_target->mm); } -static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event) +static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event) { bool done; write_lock(&tu->filter.rwlock); if (event->hw.tp_target) { - /* - * event->parent != NULL means copy_process(), we can avoid - * uprobe_apply(). current->mm must be probed and we can rely - * on dup_mmap() which preserves the already installed bp's. - * - * attr.enable_on_exec means that exec/mmap will install the - * breakpoints we need. - */ + list_del(&event->hw.tp_list); done = tu->filter.nr_systemwide || - event->parent || event->attr.enable_on_exec || + (event->hw.tp_target->flags & PF_EXITING) || uprobe_filter_event(tu, event); - list_add(&event->hw.tp_list, &tu->filter.perf_events); } else { + tu->filter.nr_systemwide--; done = tu->filter.nr_systemwide; - tu->filter.nr_systemwide++; } write_unlock(&tu->filter.rwlock); if (!done) - uprobe_apply(tu->inode, tu->offset, &tu->consumer, true); + return uprobe_apply(tu->inode, tu->offset, &tu->consumer, false); return 0; } -static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event) +static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event) { bool done; + int err; write_lock(&tu->filter.rwlock); if (event->hw.tp_target) { - list_del(&event->hw.tp_list); + /* + * event->parent != NULL means copy_process(), we can avoid + * uprobe_apply(). current->mm must be probed and we can rely + * on dup_mmap() which preserves the already installed bp's. + * + * attr.enable_on_exec means that exec/mmap will install the + * breakpoints we need. + */ done = tu->filter.nr_systemwide || - (event->hw.tp_target->flags & PF_EXITING) || + event->parent || event->attr.enable_on_exec || uprobe_filter_event(tu, event); + list_add(&event->hw.tp_list, &tu->filter.perf_events); } else { - tu->filter.nr_systemwide--; done = tu->filter.nr_systemwide; + tu->filter.nr_systemwide++; } write_unlock(&tu->filter.rwlock); - if (!done) - uprobe_apply(tu->inode, tu->offset, &tu->consumer, false); - - return 0; + err = 0; + if (!done) { + err = uprobe_apply(tu->inode, tu->offset, &tu->consumer, true); + if (err) + uprobe_perf_close(tu, event); + } + return err; } static bool uprobe_perf_filter(struct uprobe_consumer *uc, @@ -1014,31 +1093,24 @@ static bool uprobe_perf_filter(struct uprobe_consumer *uc, return ret; } -static void uprobe_perf_print(struct trace_uprobe *tu, - unsigned long func, struct pt_regs *regs) +static void __uprobe_perf_func(struct trace_uprobe *tu, + unsigned long func, struct pt_regs *regs, + struct uprobe_cpu_buffer *ucb, int dsize) { struct ftrace_event_call *call = &tu->tp.call; struct uprobe_trace_entry_head *entry; struct hlist_head *head; - struct uprobe_cpu_buffer *ucb; void *data; - int size, dsize, esize; + int size, esize; int rctx; - dsize = __get_data_size(&tu->tp, regs); esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); - if (WARN_ON_ONCE(!uprobe_cpu_buffer)) - return; - size = esize + tu->tp.size + dsize; size = ALIGN(size + sizeof(u32), sizeof(u64)) - sizeof(u32); if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough")) return; - ucb = uprobe_buffer_get(); - store_trace_args(esize, &tu->tp, regs, ucb->buf, dsize); - preempt_disable(); head = this_cpu_ptr(call->perf_events); if (hlist_empty(head)) @@ -1068,46 +1140,49 @@ static void uprobe_perf_print(struct trace_uprobe *tu, perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL); out: preempt_enable(); - uprobe_buffer_put(ucb); } /* uprobe profile handler */ -static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) +static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs, + struct uprobe_cpu_buffer *ucb, int dsize) { if (!uprobe_perf_filter(&tu->consumer, 0, current->mm)) return UPROBE_HANDLER_REMOVE; if (!is_ret_probe(tu)) - uprobe_perf_print(tu, 0, regs); + __uprobe_perf_func(tu, 0, regs, ucb, dsize); return 0; } static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func, - struct pt_regs *regs) + struct pt_regs *regs, + struct uprobe_cpu_buffer *ucb, int dsize) { - uprobe_perf_print(tu, func, regs); + __uprobe_perf_func(tu, func, regs, ucb, dsize); } #endif /* CONFIG_PERF_EVENTS */ -static -int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, void *data) +static int +trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, + void *data) { struct trace_uprobe *tu = event->data; + struct ftrace_event_file *file = data; switch (type) { case TRACE_REG_REGISTER: - return probe_event_enable(tu, TP_FLAG_TRACE, NULL); + return probe_event_enable(tu, file, NULL); case TRACE_REG_UNREGISTER: - probe_event_disable(tu, TP_FLAG_TRACE); + probe_event_disable(tu, file); return 0; #ifdef CONFIG_PERF_EVENTS case TRACE_REG_PERF_REGISTER: - return probe_event_enable(tu, TP_FLAG_PROFILE, uprobe_perf_filter); + return probe_event_enable(tu, NULL, uprobe_perf_filter); case TRACE_REG_PERF_UNREGISTER: - probe_event_disable(tu, TP_FLAG_PROFILE); + probe_event_disable(tu, NULL); return 0; case TRACE_REG_PERF_OPEN: @@ -1127,8 +1202,11 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) { struct trace_uprobe *tu; struct uprobe_dispatch_data udd; + struct uprobe_cpu_buffer *ucb; + int dsize, esize; int ret = 0; + tu = container_of(con, struct trace_uprobe, consumer); tu->nhit++; @@ -1137,13 +1215,23 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) current->utask->vaddr = (unsigned long) &udd; + if (WARN_ON_ONCE(!uprobe_cpu_buffer)) + return 0; + + dsize = __get_data_size(&tu->tp, regs); + esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); + + ucb = uprobe_buffer_get(); + store_trace_args(esize, &tu->tp, regs, ucb->buf, dsize); + if (tu->tp.flags & TP_FLAG_TRACE) - ret |= uprobe_trace_func(tu, regs); + ret |= uprobe_trace_func(tu, regs, ucb, dsize); #ifdef CONFIG_PERF_EVENTS if (tu->tp.flags & TP_FLAG_PROFILE) - ret |= uprobe_perf_func(tu, regs); + ret |= uprobe_perf_func(tu, regs, ucb, dsize); #endif + uprobe_buffer_put(ucb); return ret; } @@ -1152,6 +1240,8 @@ static int uretprobe_dispatcher(struct uprobe_consumer *con, { struct trace_uprobe *tu; struct uprobe_dispatch_data udd; + struct uprobe_cpu_buffer *ucb; + int dsize, esize; tu = container_of(con, struct trace_uprobe, consumer); @@ -1160,13 +1250,23 @@ static int uretprobe_dispatcher(struct uprobe_consumer *con, current->utask->vaddr = (unsigned long) &udd; + if (WARN_ON_ONCE(!uprobe_cpu_buffer)) + return 0; + + dsize = __get_data_size(&tu->tp, regs); + esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); + + ucb = uprobe_buffer_get(); + store_trace_args(esize, &tu->tp, regs, ucb->buf, dsize); + if (tu->tp.flags & TP_FLAG_TRACE) - uretprobe_trace_func(tu, func, regs); + uretprobe_trace_func(tu, func, regs, ucb, dsize); #ifdef CONFIG_PERF_EVENTS if (tu->tp.flags & TP_FLAG_PROFILE) - uretprobe_perf_func(tu, func, regs); + uretprobe_perf_func(tu, func, regs, ucb, dsize); #endif + uprobe_buffer_put(ucb); return 0; } @@ -1198,7 +1298,8 @@ static int register_uprobe_event(struct trace_uprobe *tu) ret = trace_add_event_call(call); if (ret) { - pr_info("Failed to register uprobe event: %s\n", call->name); + pr_info("Failed to register uprobe event: %s\n", + ftrace_event_name(call)); kfree(call->print_fmt); unregister_ftrace_event(&call->event); } diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 29f26540e9c..3490407dc7b 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2008 Mathieu Desnoyers + * Copyright (C) 2008-2014 Mathieu Desnoyers * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -33,43 +33,29 @@ extern struct tracepoint * const __stop___tracepoints_ptrs[]; /* Set to 1 to enable tracepoint debug output */ static const int tracepoint_debug; +#ifdef CONFIG_MODULES /* - * Tracepoints mutex protects the builtin and module tracepoints and the hash - * table, as well as the local module list. + * Tracepoint module list mutex protects the local module list. */ -static DEFINE_MUTEX(tracepoints_mutex); +static DEFINE_MUTEX(tracepoint_module_list_mutex); -#ifdef CONFIG_MODULES -/* Local list of struct module */ +/* Local list of struct tp_module */ static LIST_HEAD(tracepoint_module_list); #endif /* CONFIG_MODULES */ /* - * Tracepoint hash table, containing the active tracepoints. - * Protected by tracepoints_mutex. + * tracepoints_mutex protects the builtin and module tracepoints. + * tracepoints_mutex nests inside tracepoint_module_list_mutex. */ -#define TRACEPOINT_HASH_BITS 6 -#define TRACEPOINT_TABLE_SIZE (1 << TRACEPOINT_HASH_BITS) -static struct hlist_head tracepoint_table[TRACEPOINT_TABLE_SIZE]; +static DEFINE_MUTEX(tracepoints_mutex); /* * Note about RCU : * It is used to delay the free of multiple probes array until a quiescent * state is reached. - * Tracepoint entries modifications are protected by the tracepoints_mutex. */ -struct tracepoint_entry { - struct hlist_node hlist; - struct tracepoint_func *funcs; - int refcount; /* Number of times armed. 0 if disarmed. */ - char name[0]; -}; - struct tp_probes { - union { - struct rcu_head rcu; - struct list_head list; - } u; + struct rcu_head rcu; struct tracepoint_func probes[0]; }; @@ -82,7 +68,7 @@ static inline void *allocate_probes(int count) static void rcu_free_old_probes(struct rcu_head *head) { - kfree(container_of(head, struct tp_probes, u.rcu)); + kfree(container_of(head, struct tp_probes, rcu)); } static inline void release_probes(struct tracepoint_func *old) @@ -90,38 +76,37 @@ static inline void release_probes(struct tracepoint_func *old) if (old) { struct tp_probes *tp_probes = container_of(old, struct tp_probes, probes[0]); - call_rcu_sched(&tp_probes->u.rcu, rcu_free_old_probes); + call_rcu_sched(&tp_probes->rcu, rcu_free_old_probes); } } -static void debug_print_probes(struct tracepoint_entry *entry) +static void debug_print_probes(struct tracepoint_func *funcs) { int i; - if (!tracepoint_debug || !entry->funcs) + if (!tracepoint_debug || !funcs) return; - for (i = 0; entry->funcs[i].func; i++) - printk(KERN_DEBUG "Probe %d : %p\n", i, entry->funcs[i].func); + for (i = 0; funcs[i].func; i++) + printk(KERN_DEBUG "Probe %d : %p\n", i, funcs[i].func); } -static struct tracepoint_func * -tracepoint_entry_add_probe(struct tracepoint_entry *entry, - void *probe, void *data) +static struct tracepoint_func *func_add(struct tracepoint_func **funcs, + struct tracepoint_func *tp_func) { int nr_probes = 0; struct tracepoint_func *old, *new; - if (WARN_ON(!probe)) + if (WARN_ON(!tp_func->func)) return ERR_PTR(-EINVAL); - debug_print_probes(entry); - old = entry->funcs; + debug_print_probes(*funcs); + old = *funcs; if (old) { /* (N -> N+1), (N != 0, 1) probes */ for (nr_probes = 0; old[nr_probes].func; nr_probes++) - if (old[nr_probes].func == probe && - old[nr_probes].data == data) + if (old[nr_probes].func == tp_func->func && + old[nr_probes].data == tp_func->data) return ERR_PTR(-EEXIST); } /* + 2 : one for new probe, one for NULL func */ @@ -130,33 +115,30 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry, return ERR_PTR(-ENOMEM); if (old) memcpy(new, old, nr_probes * sizeof(struct tracepoint_func)); - new[nr_probes].func = probe; - new[nr_probes].data = data; + new[nr_probes] = *tp_func; new[nr_probes + 1].func = NULL; - entry->refcount = nr_probes + 1; - entry->funcs = new; - debug_print_probes(entry); + *funcs = new; + debug_print_probes(*funcs); return old; } -static void * -tracepoint_entry_remove_probe(struct tracepoint_entry *entry, - void *probe, void *data) +static void *func_remove(struct tracepoint_func **funcs, + struct tracepoint_func *tp_func) { int nr_probes = 0, nr_del = 0, i; struct tracepoint_func *old, *new; - old = entry->funcs; + old = *funcs; if (!old) return ERR_PTR(-ENOENT); - debug_print_probes(entry); + debug_print_probes(*funcs); /* (N -> M), (N > 1, M >= 0) probes */ - if (probe) { + if (tp_func->func) { for (nr_probes = 0; old[nr_probes].func; nr_probes++) { - if (old[nr_probes].func == probe && - old[nr_probes].data == data) + if (old[nr_probes].func == tp_func->func && + old[nr_probes].data == tp_func->data) nr_del++; } } @@ -167,9 +149,8 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry, */ if (nr_probes - nr_del == 0) { /* N -> 0, (N > 1) */ - entry->funcs = NULL; - entry->refcount = 0; - debug_print_probes(entry); + *funcs = NULL; + debug_print_probes(*funcs); return old; } else { int j = 0; @@ -179,90 +160,34 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry, if (new == NULL) return ERR_PTR(-ENOMEM); for (i = 0; old[i].func; i++) - if (old[i].func != probe || old[i].data != data) + if (old[i].func != tp_func->func + || old[i].data != tp_func->data) new[j++] = old[i]; new[nr_probes - nr_del].func = NULL; - entry->refcount = nr_probes - nr_del; - entry->funcs = new; + *funcs = new; } - debug_print_probes(entry); + debug_print_probes(*funcs); return old; } /* - * Get tracepoint if the tracepoint is present in the tracepoint hash table. - * Must be called with tracepoints_mutex held. - * Returns NULL if not present. + * Add the probe function to a tracepoint. */ -static struct tracepoint_entry *get_tracepoint(const char *name) +static int tracepoint_add_func(struct tracepoint *tp, + struct tracepoint_func *func) { - struct hlist_head *head; - struct tracepoint_entry *e; - u32 hash = jhash(name, strlen(name), 0); - - head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)]; - hlist_for_each_entry(e, head, hlist) { - if (!strcmp(name, e->name)) - return e; - } - return NULL; -} + struct tracepoint_func *old, *tp_funcs; -/* - * Add the tracepoint to the tracepoint hash table. Must be called with - * tracepoints_mutex held. - */ -static struct tracepoint_entry *add_tracepoint(const char *name) -{ - struct hlist_head *head; - struct tracepoint_entry *e; - size_t name_len = strlen(name) + 1; - u32 hash = jhash(name, name_len-1, 0); - - head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)]; - hlist_for_each_entry(e, head, hlist) { - if (!strcmp(name, e->name)) { - printk(KERN_NOTICE - "tracepoint %s busy\n", name); - return ERR_PTR(-EEXIST); /* Already there */ - } - } - /* - * Using kmalloc here to allocate a variable length element. Could - * cause some memory fragmentation if overused. - */ - e = kmalloc(sizeof(struct tracepoint_entry) + name_len, GFP_KERNEL); - if (!e) - return ERR_PTR(-ENOMEM); - memcpy(&e->name[0], name, name_len); - e->funcs = NULL; - e->refcount = 0; - hlist_add_head(&e->hlist, head); - return e; -} - -/* - * Remove the tracepoint from the tracepoint hash table. Must be called with - * mutex_lock held. - */ -static inline void remove_tracepoint(struct tracepoint_entry *e) -{ - hlist_del(&e->hlist); - kfree(e); -} + if (tp->regfunc && !static_key_enabled(&tp->key)) + tp->regfunc(); -/* - * Sets the probe callback corresponding to one tracepoint. - */ -static void set_tracepoint(struct tracepoint_entry **entry, - struct tracepoint *elem, int active) -{ - WARN_ON(strcmp((*entry)->name, elem->name) != 0); - - if (elem->regfunc && !static_key_enabled(&elem->key) && active) - elem->regfunc(); - else if (elem->unregfunc && static_key_enabled(&elem->key) && !active) - elem->unregfunc(); + tp_funcs = rcu_dereference_protected(tp->funcs, + lockdep_is_held(&tracepoints_mutex)); + old = func_add(&tp_funcs, func); + if (IS_ERR(old)) { + WARN_ON_ONCE(1); + return PTR_ERR(old); + } /* * rcu_assign_pointer has a smp_wmb() which makes sure that the new @@ -271,421 +196,218 @@ static void set_tracepoint(struct tracepoint_entry **entry, * include/linux/tracepoints.h. A matching smp_read_barrier_depends() * is used. */ - rcu_assign_pointer(elem->funcs, (*entry)->funcs); - if (active && !static_key_enabled(&elem->key)) - static_key_slow_inc(&elem->key); - else if (!active && static_key_enabled(&elem->key)) - static_key_slow_dec(&elem->key); + rcu_assign_pointer(tp->funcs, tp_funcs); + if (!static_key_enabled(&tp->key)) + static_key_slow_inc(&tp->key); + release_probes(old); + return 0; } /* - * Disable a tracepoint and its probe callback. + * Remove a probe function from a tracepoint. * Note: only waiting an RCU period after setting elem->call to the empty * function insures that the original callback is not used anymore. This insured * by preempt_disable around the call site. */ -static void disable_tracepoint(struct tracepoint *elem) -{ - if (elem->unregfunc && static_key_enabled(&elem->key)) - elem->unregfunc(); - - if (static_key_enabled(&elem->key)) - static_key_slow_dec(&elem->key); - rcu_assign_pointer(elem->funcs, NULL); -} - -/** - * tracepoint_update_probe_range - Update a probe range - * @begin: beginning of the range - * @end: end of the range - * - * Updates the probe callback corresponding to a range of tracepoints. - * Called with tracepoints_mutex held. - */ -static void tracepoint_update_probe_range(struct tracepoint * const *begin, - struct tracepoint * const *end) +static int tracepoint_remove_func(struct tracepoint *tp, + struct tracepoint_func *func) { - struct tracepoint * const *iter; - struct tracepoint_entry *mark_entry; - - if (!begin) - return; + struct tracepoint_func *old, *tp_funcs; - for (iter = begin; iter < end; iter++) { - mark_entry = get_tracepoint((*iter)->name); - if (mark_entry) { - set_tracepoint(&mark_entry, *iter, - !!mark_entry->refcount); - } else { - disable_tracepoint(*iter); - } + tp_funcs = rcu_dereference_protected(tp->funcs, + lockdep_is_held(&tracepoints_mutex)); + old = func_remove(&tp_funcs, func); + if (IS_ERR(old)) { + WARN_ON_ONCE(1); + return PTR_ERR(old); } -} - -#ifdef CONFIG_MODULES -void module_update_tracepoints(void) -{ - struct tp_module *tp_mod; - - list_for_each_entry(tp_mod, &tracepoint_module_list, list) - tracepoint_update_probe_range(tp_mod->tracepoints_ptrs, - tp_mod->tracepoints_ptrs + tp_mod->num_tracepoints); -} -#else /* CONFIG_MODULES */ -void module_update_tracepoints(void) -{ -} -#endif /* CONFIG_MODULES */ + if (!tp_funcs) { + /* Removed last function */ + if (tp->unregfunc && static_key_enabled(&tp->key)) + tp->unregfunc(); -/* - * Update probes, removing the faulty probes. - * Called with tracepoints_mutex held. - */ -static void tracepoint_update_probes(void) -{ - /* Core kernel tracepoints */ - tracepoint_update_probe_range(__start___tracepoints_ptrs, - __stop___tracepoints_ptrs); - /* tracepoints in modules. */ - module_update_tracepoints(); -} - -static struct tracepoint_func * -tracepoint_add_probe(const char *name, void *probe, void *data) -{ - struct tracepoint_entry *entry; - struct tracepoint_func *old; - - entry = get_tracepoint(name); - if (!entry) { - entry = add_tracepoint(name); - if (IS_ERR(entry)) - return (struct tracepoint_func *)entry; + if (static_key_enabled(&tp->key)) + static_key_slow_dec(&tp->key); } - old = tracepoint_entry_add_probe(entry, probe, data); - if (IS_ERR(old) && !entry->refcount) - remove_tracepoint(entry); - return old; + rcu_assign_pointer(tp->funcs, tp_funcs); + release_probes(old); + return 0; } /** * tracepoint_probe_register - Connect a probe to a tracepoint - * @name: tracepoint name + * @tp: tracepoint * @probe: probe handler + * @data: tracepoint data * * Returns 0 if ok, error value on error. - * The probe address must at least be aligned on the architecture pointer size. + * Note: if @tp is within a module, the caller is responsible for + * unregistering the probe before the module is gone. This can be + * performed either with a tracepoint module going notifier, or from + * within module exit functions. */ -int tracepoint_probe_register(const char *name, void *probe, void *data) +int tracepoint_probe_register(struct tracepoint *tp, void *probe, void *data) { - struct tracepoint_func *old; + struct tracepoint_func tp_func; + int ret; mutex_lock(&tracepoints_mutex); - old = tracepoint_add_probe(name, probe, data); - if (IS_ERR(old)) { - mutex_unlock(&tracepoints_mutex); - return PTR_ERR(old); - } - tracepoint_update_probes(); /* may update entry */ + tp_func.func = probe; + tp_func.data = data; + ret = tracepoint_add_func(tp, &tp_func); mutex_unlock(&tracepoints_mutex); - release_probes(old); - return 0; + return ret; } EXPORT_SYMBOL_GPL(tracepoint_probe_register); -static struct tracepoint_func * -tracepoint_remove_probe(const char *name, void *probe, void *data) -{ - struct tracepoint_entry *entry; - struct tracepoint_func *old; - - entry = get_tracepoint(name); - if (!entry) - return ERR_PTR(-ENOENT); - old = tracepoint_entry_remove_probe(entry, probe, data); - if (IS_ERR(old)) - return old; - if (!entry->refcount) - remove_tracepoint(entry); - return old; -} - /** * tracepoint_probe_unregister - Disconnect a probe from a tracepoint - * @name: tracepoint name + * @tp: tracepoint * @probe: probe function pointer + * @data: tracepoint data * - * We do not need to call a synchronize_sched to make sure the probes have - * finished running before doing a module unload, because the module unload - * itself uses stop_machine(), which insures that every preempt disabled section - * have finished. + * Returns 0 if ok, error value on error. */ -int tracepoint_probe_unregister(const char *name, void *probe, void *data) +int tracepoint_probe_unregister(struct tracepoint *tp, void *probe, void *data) { - struct tracepoint_func *old; + struct tracepoint_func tp_func; + int ret; mutex_lock(&tracepoints_mutex); - old = tracepoint_remove_probe(name, probe, data); - if (IS_ERR(old)) { - mutex_unlock(&tracepoints_mutex); - return PTR_ERR(old); - } - tracepoint_update_probes(); /* may update entry */ + tp_func.func = probe; + tp_func.data = data; + ret = tracepoint_remove_func(tp, &tp_func); mutex_unlock(&tracepoints_mutex); - release_probes(old); - return 0; + return ret; } EXPORT_SYMBOL_GPL(tracepoint_probe_unregister); -static LIST_HEAD(old_probes); -static int need_update; - -static void tracepoint_add_old_probes(void *old) +#ifdef CONFIG_MODULES +bool trace_module_has_bad_taint(struct module *mod) { - need_update = 1; - if (old) { - struct tp_probes *tp_probes = container_of(old, - struct tp_probes, probes[0]); - list_add(&tp_probes->u.list, &old_probes); - } + return mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP) | + (1 << TAINT_UNSIGNED_MODULE)); } -/** - * tracepoint_probe_register_noupdate - register a probe but not connect - * @name: tracepoint name - * @probe: probe handler - * - * caller must call tracepoint_probe_update_all() - */ -int tracepoint_probe_register_noupdate(const char *name, void *probe, - void *data) -{ - struct tracepoint_func *old; - - mutex_lock(&tracepoints_mutex); - old = tracepoint_add_probe(name, probe, data); - if (IS_ERR(old)) { - mutex_unlock(&tracepoints_mutex); - return PTR_ERR(old); - } - tracepoint_add_old_probes(old); - mutex_unlock(&tracepoints_mutex); - return 0; -} -EXPORT_SYMBOL_GPL(tracepoint_probe_register_noupdate); +static BLOCKING_NOTIFIER_HEAD(tracepoint_notify_list); /** - * tracepoint_probe_unregister_noupdate - remove a probe but not disconnect - * @name: tracepoint name - * @probe: probe function pointer + * register_tracepoint_notifier - register tracepoint coming/going notifier + * @nb: notifier block * - * caller must call tracepoint_probe_update_all() + * Notifiers registered with this function are called on module + * coming/going with the tracepoint_module_list_mutex held. + * The notifier block callback should expect a "struct tp_module" data + * pointer. */ -int tracepoint_probe_unregister_noupdate(const char *name, void *probe, - void *data) +int register_tracepoint_module_notifier(struct notifier_block *nb) { - struct tracepoint_func *old; - - mutex_lock(&tracepoints_mutex); - old = tracepoint_remove_probe(name, probe, data); - if (IS_ERR(old)) { - mutex_unlock(&tracepoints_mutex); - return PTR_ERR(old); - } - tracepoint_add_old_probes(old); - mutex_unlock(&tracepoints_mutex); - return 0; -} -EXPORT_SYMBOL_GPL(tracepoint_probe_unregister_noupdate); - -/** - * tracepoint_probe_update_all - update tracepoints - */ -void tracepoint_probe_update_all(void) -{ - LIST_HEAD(release_probes); - struct tp_probes *pos, *next; + struct tp_module *tp_mod; + int ret; - mutex_lock(&tracepoints_mutex); - if (!need_update) { - mutex_unlock(&tracepoints_mutex); - return; - } - if (!list_empty(&old_probes)) - list_replace_init(&old_probes, &release_probes); - need_update = 0; - tracepoint_update_probes(); - mutex_unlock(&tracepoints_mutex); - list_for_each_entry_safe(pos, next, &release_probes, u.list) { - list_del(&pos->u.list); - call_rcu_sched(&pos->u.rcu, rcu_free_old_probes); - } + mutex_lock(&tracepoint_module_list_mutex); + ret = blocking_notifier_chain_register(&tracepoint_notify_list, nb); + if (ret) + goto end; + list_for_each_entry(tp_mod, &tracepoint_module_list, list) + (void) nb->notifier_call(nb, MODULE_STATE_COMING, tp_mod); +end: + mutex_unlock(&tracepoint_module_list_mutex); + return ret; } -EXPORT_SYMBOL_GPL(tracepoint_probe_update_all); +EXPORT_SYMBOL_GPL(register_tracepoint_module_notifier); /** - * tracepoint_get_iter_range - Get a next tracepoint iterator given a range. - * @tracepoint: current tracepoints (in), next tracepoint (out) - * @begin: beginning of the range - * @end: end of the range + * unregister_tracepoint_notifier - unregister tracepoint coming/going notifier + * @nb: notifier block * - * Returns whether a next tracepoint has been found (1) or not (0). - * Will return the first tracepoint in the range if the input tracepoint is - * NULL. + * The notifier block callback should expect a "struct tp_module" data + * pointer. */ -static int tracepoint_get_iter_range(struct tracepoint * const **tracepoint, - struct tracepoint * const *begin, struct tracepoint * const *end) +int unregister_tracepoint_module_notifier(struct notifier_block *nb) { - if (!*tracepoint && begin != end) { - *tracepoint = begin; - return 1; - } - if (*tracepoint >= begin && *tracepoint < end) - return 1; - return 0; -} + struct tp_module *tp_mod; + int ret; -#ifdef CONFIG_MODULES -static void tracepoint_get_iter(struct tracepoint_iter *iter) -{ - int found = 0; - struct tp_module *iter_mod; - - /* Core kernel tracepoints */ - if (!iter->module) { - found = tracepoint_get_iter_range(&iter->tracepoint, - __start___tracepoints_ptrs, - __stop___tracepoints_ptrs); - if (found) - goto end; - } - /* Tracepoints in modules */ - mutex_lock(&tracepoints_mutex); - list_for_each_entry(iter_mod, &tracepoint_module_list, list) { - /* - * Sorted module list - */ - if (iter_mod < iter->module) - continue; - else if (iter_mod > iter->module) - iter->tracepoint = NULL; - found = tracepoint_get_iter_range(&iter->tracepoint, - iter_mod->tracepoints_ptrs, - iter_mod->tracepoints_ptrs - + iter_mod->num_tracepoints); - if (found) { - iter->module = iter_mod; - break; - } - } - mutex_unlock(&tracepoints_mutex); + mutex_lock(&tracepoint_module_list_mutex); + ret = blocking_notifier_chain_unregister(&tracepoint_notify_list, nb); + if (ret) + goto end; + list_for_each_entry(tp_mod, &tracepoint_module_list, list) + (void) nb->notifier_call(nb, MODULE_STATE_GOING, tp_mod); end: - if (!found) - tracepoint_iter_reset(iter); -} -#else /* CONFIG_MODULES */ -static void tracepoint_get_iter(struct tracepoint_iter *iter) -{ - int found = 0; - - /* Core kernel tracepoints */ - found = tracepoint_get_iter_range(&iter->tracepoint, - __start___tracepoints_ptrs, - __stop___tracepoints_ptrs); - if (!found) - tracepoint_iter_reset(iter); -} -#endif /* CONFIG_MODULES */ - -void tracepoint_iter_start(struct tracepoint_iter *iter) -{ - tracepoint_get_iter(iter); -} -EXPORT_SYMBOL_GPL(tracepoint_iter_start); + mutex_unlock(&tracepoint_module_list_mutex); + return ret; -void tracepoint_iter_next(struct tracepoint_iter *iter) -{ - iter->tracepoint++; - /* - * iter->tracepoint may be invalid because we blindly incremented it. - * Make sure it is valid by marshalling on the tracepoints, getting the - * tracepoints from following modules if necessary. - */ - tracepoint_get_iter(iter); } -EXPORT_SYMBOL_GPL(tracepoint_iter_next); +EXPORT_SYMBOL_GPL(unregister_tracepoint_module_notifier); -void tracepoint_iter_stop(struct tracepoint_iter *iter) +/* + * Ensure the tracer unregistered the module's probes before the module + * teardown is performed. Prevents leaks of probe and data pointers. + */ +static void tp_module_going_check_quiescent(struct tracepoint * const *begin, + struct tracepoint * const *end) { -} -EXPORT_SYMBOL_GPL(tracepoint_iter_stop); + struct tracepoint * const *iter; -void tracepoint_iter_reset(struct tracepoint_iter *iter) -{ -#ifdef CONFIG_MODULES - iter->module = NULL; -#endif /* CONFIG_MODULES */ - iter->tracepoint = NULL; + if (!begin) + return; + for (iter = begin; iter < end; iter++) + WARN_ON_ONCE((*iter)->funcs); } -EXPORT_SYMBOL_GPL(tracepoint_iter_reset); -#ifdef CONFIG_MODULES static int tracepoint_module_coming(struct module *mod) { - struct tp_module *tp_mod, *iter; + struct tp_module *tp_mod; int ret = 0; + if (!mod->num_tracepoints) + return 0; + /* * We skip modules that taint the kernel, especially those with different * module headers (for forced load), to make sure we don't cause a crash. - * Staging and out-of-tree GPL modules are fine. + * Staging, out-of-tree, and unsigned GPL modules are fine. */ - if (mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP))) + if (trace_module_has_bad_taint(mod)) return 0; - mutex_lock(&tracepoints_mutex); + mutex_lock(&tracepoint_module_list_mutex); tp_mod = kmalloc(sizeof(struct tp_module), GFP_KERNEL); if (!tp_mod) { ret = -ENOMEM; goto end; } - tp_mod->num_tracepoints = mod->num_tracepoints; - tp_mod->tracepoints_ptrs = mod->tracepoints_ptrs; - - /* - * tracepoint_module_list is kept sorted by struct module pointer - * address for iteration on tracepoints from a seq_file that can release - * the mutex between calls. - */ - list_for_each_entry_reverse(iter, &tracepoint_module_list, list) { - BUG_ON(iter == tp_mod); /* Should never be in the list twice */ - if (iter < tp_mod) { - /* We belong to the location right after iter. */ - list_add(&tp_mod->list, &iter->list); - goto module_added; - } - } - /* We belong to the beginning of the list */ - list_add(&tp_mod->list, &tracepoint_module_list); -module_added: - tracepoint_update_probe_range(mod->tracepoints_ptrs, - mod->tracepoints_ptrs + mod->num_tracepoints); + tp_mod->mod = mod; + list_add_tail(&tp_mod->list, &tracepoint_module_list); + blocking_notifier_call_chain(&tracepoint_notify_list, + MODULE_STATE_COMING, tp_mod); end: - mutex_unlock(&tracepoints_mutex); + mutex_unlock(&tracepoint_module_list_mutex); return ret; } -static int tracepoint_module_going(struct module *mod) +static void tracepoint_module_going(struct module *mod) { - struct tp_module *pos; + struct tp_module *tp_mod; - mutex_lock(&tracepoints_mutex); - tracepoint_update_probe_range(mod->tracepoints_ptrs, - mod->tracepoints_ptrs + mod->num_tracepoints); - list_for_each_entry(pos, &tracepoint_module_list, list) { - if (pos->tracepoints_ptrs == mod->tracepoints_ptrs) { - list_del(&pos->list); - kfree(pos); + if (!mod->num_tracepoints) + return; + + mutex_lock(&tracepoint_module_list_mutex); + list_for_each_entry(tp_mod, &tracepoint_module_list, list) { + if (tp_mod->mod == mod) { + blocking_notifier_call_chain(&tracepoint_notify_list, + MODULE_STATE_GOING, tp_mod); + list_del(&tp_mod->list); + kfree(tp_mod); + /* + * Called the going notifier before checking for + * quiescence. + */ + tp_module_going_check_quiescent(mod->tracepoints_ptrs, + mod->tracepoints_ptrs + mod->num_tracepoints); break; } } @@ -695,12 +417,11 @@ static int tracepoint_module_going(struct module *mod) * flag on "going", in case a module taints the kernel only after being * loaded. */ - mutex_unlock(&tracepoints_mutex); - return 0; + mutex_unlock(&tracepoint_module_list_mutex); } -int tracepoint_module_notify(struct notifier_block *self, - unsigned long val, void *data) +static int tracepoint_module_notify(struct notifier_block *self, + unsigned long val, void *data) { struct module *mod = data; int ret = 0; @@ -712,24 +433,58 @@ int tracepoint_module_notify(struct notifier_block *self, case MODULE_STATE_LIVE: break; case MODULE_STATE_GOING: - ret = tracepoint_module_going(mod); + tracepoint_module_going(mod); + break; + case MODULE_STATE_UNFORMED: break; } return ret; } -struct notifier_block tracepoint_module_nb = { +static struct notifier_block tracepoint_module_nb = { .notifier_call = tracepoint_module_notify, .priority = 0, }; -static int init_tracepoints(void) +static __init int init_tracepoints(void) { - return register_module_notifier(&tracepoint_module_nb); + int ret; + + ret = register_module_notifier(&tracepoint_module_nb); + if (ret) + pr_warning("Failed to register tracepoint module enter notifier\n"); + + return ret; } __initcall(init_tracepoints); #endif /* CONFIG_MODULES */ +static void for_each_tracepoint_range(struct tracepoint * const *begin, + struct tracepoint * const *end, + void (*fct)(struct tracepoint *tp, void *priv), + void *priv) +{ + struct tracepoint * const *iter; + + if (!begin) + return; + for (iter = begin; iter < end; iter++) + fct(*iter, priv); +} + +/** + * for_each_kernel_tracepoint - iteration on all kernel tracepoints + * @fct: callback + * @priv: private data + */ +void for_each_kernel_tracepoint(void (*fct)(struct tracepoint *tp, void *priv), + void *priv) +{ + for_each_tracepoint_range(__start___tracepoints_ptrs, + __stop___tracepoints_ptrs, fct, priv); +} +EXPORT_SYMBOL_GPL(for_each_kernel_tracepoint); + #ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS /* NB: reg/unreg are called while guarded with the tracepoints_mutex */ @@ -737,33 +492,29 @@ static int sys_tracepoint_refcount; void syscall_regfunc(void) { - unsigned long flags; - struct task_struct *g, *t; + struct task_struct *p, *t; if (!sys_tracepoint_refcount) { - read_lock_irqsave(&tasklist_lock, flags); - do_each_thread(g, t) { - /* Skip kernel threads. */ - if (t->mm) - set_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT); - } while_each_thread(g, t); - read_unlock_irqrestore(&tasklist_lock, flags); + read_lock(&tasklist_lock); + for_each_process_thread(p, t) { + set_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT); + } + read_unlock(&tasklist_lock); } sys_tracepoint_refcount++; } void syscall_unregfunc(void) { - unsigned long flags; - struct task_struct *g, *t; + struct task_struct *p, *t; sys_tracepoint_refcount--; if (!sys_tracepoint_refcount) { - read_lock_irqsave(&tasklist_lock, flags); - do_each_thread(g, t) { + read_lock(&tasklist_lock); + for_each_process_thread(p, t) { clear_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT); - } while_each_thread(g, t); - read_unlock_irqrestore(&tasklist_lock, flags); + } + read_unlock(&tasklist_lock); } } #endif diff --git a/kernel/up.c b/kernel/up.c index 509403e3fbc..1760bf3d146 100644 --- a/kernel/up.c +++ b/kernel/up.c @@ -22,16 +22,16 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, } EXPORT_SYMBOL(smp_call_function_single); -void __smp_call_function_single(int cpu, struct call_single_data *csd, - int wait) +int smp_call_function_single_async(int cpu, struct call_single_data *csd) { unsigned long flags; local_irq_save(flags); csd->func(csd->info); local_irq_restore(flags); + return 0; } -EXPORT_SYMBOL(__smp_call_function_single); +EXPORT_SYMBOL(smp_call_function_single_async); int on_each_cpu(smp_call_func_t func, void *info, int wait) { diff --git a/kernel/user.c b/kernel/user.c index c006131beb7..4efa39350e4 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -87,7 +87,6 @@ static DEFINE_SPINLOCK(uidhash_lock); struct user_struct root_user = { .__count = ATOMIC_INIT(1), .processes = ATOMIC_INIT(1), - .files = ATOMIC_INIT(0), .sigpending = ATOMIC_INIT(0), .locked_shm = 0, .uid = GLOBAL_ROOT_UID, @@ -222,5 +221,4 @@ static int __init uid_cache_init(void) return 0; } - -module_init(uid_cache_init); +subsys_initcall(uid_cache_init); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 240fb62cf39..fcc02560fd6 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -152,7 +152,7 @@ static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count) /* Find the matching extent */ extents = map->nr_extents; - smp_read_barrier_depends(); + smp_rmb(); for (idx = 0; idx < extents; idx++) { first = map->extent[idx].first; last = first + map->extent[idx].count - 1; @@ -176,7 +176,7 @@ static u32 map_id_down(struct uid_gid_map *map, u32 id) /* Find the matching extent */ extents = map->nr_extents; - smp_read_barrier_depends(); + smp_rmb(); for (idx = 0; idx < extents; idx++) { first = map->extent[idx].first; last = first + map->extent[idx].count - 1; @@ -199,7 +199,7 @@ static u32 map_id_up(struct uid_gid_map *map, u32 id) /* Find the matching extent */ extents = map->nr_extents; - smp_read_barrier_depends(); + smp_rmb(); for (idx = 0; idx < extents; idx++) { first = map->extent[idx].lower_first; last = first + map->extent[idx].count - 1; @@ -225,7 +225,7 @@ static u32 map_id_up(struct uid_gid_map *map, u32 id) * * When there is no mapping defined for the user-namespace uid * pair INVALID_UID is returned. Callers are expected to test - * for and handle handle INVALID_UID being returned. INVALID_UID + * for and handle INVALID_UID being returned. INVALID_UID * may be tested for using uid_valid(). */ kuid_t make_kuid(struct user_namespace *ns, uid_t uid) @@ -286,7 +286,7 @@ EXPORT_SYMBOL(from_kuid_munged); /** * make_kgid - Map a user-namespace gid pair into a kgid. * @ns: User namespace that the gid is in - * @uid: group identifier + * @gid: group identifier * * Maps a user-namespace gid pair into a kernel internal kgid, * and returns that kgid. @@ -482,7 +482,8 @@ static int projid_m_show(struct seq_file *seq, void *v) return 0; } -static void *m_start(struct seq_file *seq, loff_t *ppos, struct uid_gid_map *map) +static void *m_start(struct seq_file *seq, loff_t *ppos, + struct uid_gid_map *map) { struct uid_gid_extent *extent = NULL; loff_t pos = *ppos; @@ -546,7 +547,8 @@ struct seq_operations proc_projid_seq_operations = { .show = projid_m_show, }; -static bool mappings_overlap(struct uid_gid_map *new_map, struct uid_gid_extent *extent) +static bool mappings_overlap(struct uid_gid_map *new_map, + struct uid_gid_extent *extent) { u32 upper_first, lower_first, upper_last, lower_last; unsigned idx; @@ -615,9 +617,8 @@ static ssize_t map_write(struct file *file, const char __user *buf, * were written before the count of the extents. * * To achieve this smp_wmb() is used on guarantee the write - * order and smp_read_barrier_depends() is guaranteed that we - * don't have crazy architectures returning stale data. - * + * order and smp_rmb() is guaranteed that we don't have crazy + * architectures returning stale data. */ mutex_lock(&id_map_mutex); @@ -654,7 +655,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, ret = -EINVAL; pos = kbuf; new_map.nr_extents = 0; - for (;pos; pos = next_line) { + for (; pos; pos = next_line) { extent = &new_map.extent[new_map.nr_extents]; /* Find the end of line and ensure I don't look past it */ @@ -688,13 +689,16 @@ static ssize_t map_write(struct file *file, const char __user *buf, /* Verify we have been given valid starting values */ if ((extent->first == (u32) -1) || - (extent->lower_first == (u32) -1 )) + (extent->lower_first == (u32) -1)) goto out; - /* Verify count is not zero and does not cause the extent to wrap */ + /* Verify count is not zero and does not cause the + * extent to wrap + */ if ((extent->first + extent->count) <= extent->first) goto out; - if ((extent->lower_first + extent->count) <= extent->lower_first) + if ((extent->lower_first + extent->count) <= + extent->lower_first) goto out; /* Do the ranges in extent overlap any previous extents? */ @@ -752,7 +756,8 @@ out: return ret; } -ssize_t proc_uid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) +ssize_t proc_uid_map_write(struct file *file, const char __user *buf, + size_t size, loff_t *ppos) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; @@ -768,7 +773,8 @@ ssize_t proc_uid_map_write(struct file *file, const char __user *buf, size_t siz &ns->uid_map, &ns->parent->uid_map); } -ssize_t proc_gid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) +ssize_t proc_gid_map_write(struct file *file, const char __user *buf, + size_t size, loff_t *ppos) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; @@ -784,7 +790,8 @@ ssize_t proc_gid_map_write(struct file *file, const char __user *buf, size_t siz &ns->gid_map, &ns->parent->gid_map); } -ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) +ssize_t proc_projid_map_write(struct file *file, const char __user *buf, + size_t size, loff_t *ppos) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; @@ -801,7 +808,7 @@ ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t &ns->projid_map, &ns->parent->projid_map); } -static bool new_idmap_permitted(const struct file *file, +static bool new_idmap_permitted(const struct file *file, struct user_namespace *ns, int cap_setid, struct uid_gid_map *new_map) { @@ -812,8 +819,7 @@ static bool new_idmap_permitted(const struct file *file, kuid_t uid = make_kuid(ns->parent, id); if (uid_eq(uid, file->f_cred->fsuid)) return true; - } - else if (cap_setid == CAP_SETGID) { + } else if (cap_setid == CAP_SETGID) { kgid_t gid = make_kgid(ns->parent, id); if (gid_eq(gid, file->f_cred->fsgid)) return true; @@ -902,4 +908,4 @@ static __init int user_namespaces_init(void) user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC); return 0; } -module_init(user_namespaces_init); +subsys_initcall(user_namespaces_init); diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index 4f69f9a5e22..c8eac43267e 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -17,7 +17,7 @@ #ifdef CONFIG_PROC_SYSCTL -static void *get_uts(ctl_table *table, int write) +static void *get_uts(struct ctl_table *table, int write) { char *which = table->data; struct uts_namespace *uts_ns; @@ -32,7 +32,7 @@ static void *get_uts(ctl_table *table, int write) return which; } -static void put_uts(ctl_table *table, int write, void *which) +static void put_uts(struct ctl_table *table, int write, void *which) { if (!write) up_read(&uts_sem); @@ -44,14 +44,14 @@ static void put_uts(ctl_table *table, int write, void *which) * Special case of dostring for the UTS structure. This has locks * to observe. Should this be in kernel/sys.c ???? */ -static int proc_do_uts_string(ctl_table *table, int write, +static int proc_do_uts_string(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table uts_table; int r; memcpy(&uts_table, table, sizeof(uts_table)); uts_table.data = get_uts(table, write); - r = proc_dostring(&uts_table,write,buffer,lenp, ppos); + r = proc_dostring(&uts_table, write, buffer, lenp, ppos); put_uts(table, write, uts_table.data); if (write) @@ -135,4 +135,4 @@ static int __init utsname_sysctl_init(void) return 0; } -__initcall(utsname_sysctl_init); +device_initcall(utsname_sysctl_init); diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 4431610f049..c3319bd1b04 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -31,6 +31,12 @@ int watchdog_user_enabled = 1; int __read_mostly watchdog_thresh = 10; +#ifdef CONFIG_SMP +int __read_mostly sysctl_softlockup_all_cpu_backtrace; +#else +#define sysctl_softlockup_all_cpu_backtrace 0 +#endif + static int __read_mostly watchdog_running; static u64 __read_mostly sample_period; @@ -47,6 +53,7 @@ static DEFINE_PER_CPU(bool, watchdog_nmi_touch); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); #endif +static unsigned long soft_lockup_nmi_warn; /* boot commands */ /* @@ -95,6 +102,15 @@ static int __init nosoftlockup_setup(char *str) } __setup("nosoftlockup", nosoftlockup_setup); /* */ +#ifdef CONFIG_SMP +static int __init softlockup_all_cpu_backtrace_setup(char *str) +{ + sysctl_softlockup_all_cpu_backtrace = + !!simple_strtol(str, NULL, 0); + return 1; +} +__setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup); +#endif /* * Hard-lockup warnings should be triggered after just a few seconds. Soft- @@ -138,7 +154,11 @@ static void __touch_watchdog(void) void touch_softlockup_watchdog(void) { - __this_cpu_write(watchdog_touch_ts, 0); + /* + * Preemption can be enabled. It doesn't matter which CPU's timestamp + * gets zeroed here, so use the raw_ operation. + */ + raw_cpu_write(watchdog_touch_ts, 0); } EXPORT_SYMBOL(touch_softlockup_watchdog); @@ -158,14 +178,14 @@ void touch_all_softlockup_watchdogs(void) #ifdef CONFIG_HARDLOCKUP_DETECTOR void touch_nmi_watchdog(void) { - if (watchdog_user_enabled) { - unsigned cpu; - - for_each_present_cpu(cpu) { - if (per_cpu(watchdog_nmi_touch, cpu) != true) - per_cpu(watchdog_nmi_touch, cpu) = true; - } - } + /* + * Using __raw here because some code paths have + * preemption enabled. If preemption is enabled + * then interrupts should be enabled too, in which + * case we shouldn't have to worry about the watchdog + * going off. + */ + __raw_get_cpu_var(watchdog_nmi_touch) = true; touch_softlockup_watchdog(); } EXPORT_SYMBOL(touch_nmi_watchdog); @@ -267,6 +287,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts); struct pt_regs *regs = get_irq_regs(); int duration; + int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace; /* kick the hardlockup detector */ watchdog_interrupt_count(); @@ -313,6 +334,17 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) if (__this_cpu_read(soft_watchdog_warn) == true) return HRTIMER_RESTART; + if (softlockup_all_cpu_backtrace) { + /* Prevent multiple soft-lockup reports if one cpu is already + * engaged in dumping cpu back traces + */ + if (test_and_set_bit(0, &soft_lockup_nmi_warn)) { + /* Someone else will report us. Let's give up */ + __this_cpu_write(soft_watchdog_warn, true); + return HRTIMER_RESTART; + } + } + printk(KERN_EMERG "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", smp_processor_id(), duration, current->comm, task_pid_nr(current)); @@ -323,6 +355,17 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) else dump_stack(); + if (softlockup_all_cpu_backtrace) { + /* Avoid generating two back traces for current + * given that one is already made above + */ + trigger_allbutself_cpu_backtrace(); + + clear_bit(0, &soft_lockup_nmi_warn); + /* Barrier to sync with other cpus */ + smp_mb__after_atomic(); + } + if (softlockup_panic) panic("softlockup: hung tasks"); __this_cpu_write(soft_watchdog_warn, true); @@ -505,7 +548,6 @@ static void restart_watchdog_hrtimer(void *info) static void update_timers(int cpu) { - struct call_single_data data = {.func = restart_watchdog_hrtimer}; /* * Make sure that perf event counter will adopt to a new * sampling period. Updating the sampling period directly would @@ -515,7 +557,7 @@ static void update_timers(int cpu) * might be late already so we have to restart the timer as well. */ watchdog_nmi_disable(cpu); - __smp_call_function_single(cpu, &data, 1); + smp_call_function_single(cpu, restart_watchdog_hrtimer, NULL, 1); watchdog_nmi_enable(cpu); } @@ -524,10 +566,8 @@ static void update_timers_all_cpus(void) int cpu; get_online_cpus(); - preempt_disable(); for_each_online_cpu(cpu) update_timers(cpu); - preempt_enable(); put_online_cpus(); } diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 82ef9f3b747..35974ac6960 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -65,15 +65,12 @@ enum { * be executing on any CPU. The pool behaves as an unbound one. * * Note that DISASSOCIATED should be flipped only while holding - * manager_mutex to avoid changing binding state while - * create_worker() is in progress. + * attach_mutex to avoid changing binding state while + * worker_attach_to_pool() is in progress. */ - POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */ POOL_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */ - POOL_FREEZING = 1 << 3, /* freeze in progress */ /* worker flags */ - WORKER_STARTED = 1 << 0, /* started */ WORKER_DIE = 1 << 1, /* die die die */ WORKER_IDLE = 1 << 2, /* is idle */ WORKER_PREP = 1 << 3, /* preparing to run works */ @@ -100,10 +97,10 @@ enum { /* * Rescue workers are used only on emergencies and shared by - * all cpus. Give -20. + * all cpus. Give MIN_NICE. */ - RESCUER_NICE_LEVEL = -20, - HIGHPRI_NICE_LEVEL = -20, + RESCUER_NICE_LEVEL = MIN_NICE, + HIGHPRI_NICE_LEVEL = MIN_NICE, WQ_NAME_LEN = 24, }; @@ -124,8 +121,7 @@ enum { * cpu or grabbing pool->lock is enough for read access. If * POOL_DISASSOCIATED is set, it's identical to L. * - * MG: pool->manager_mutex and pool->lock protected. Writes require both - * locks. Reads can happen under either lock. + * A: pool->attach_mutex protected. * * PL: wq_pool_mutex protected. * @@ -163,8 +159,11 @@ struct worker_pool { /* see manage_workers() for details on the two manager mutexes */ struct mutex manager_arb; /* manager arbitration */ - struct mutex manager_mutex; /* manager exclusion */ - struct idr worker_idr; /* MG: worker IDs and iteration */ + struct mutex attach_mutex; /* attach/detach exclusion */ + struct list_head workers; /* A: attached workers */ + struct completion *detach_completion; /* all workers detached */ + + struct ida worker_ida; /* worker IDs for task name */ struct workqueue_attrs *attrs; /* I: worker attributes */ struct hlist_node hash_node; /* PL: unbound_pool_hash node */ @@ -340,16 +339,6 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to, lockdep_is_held(&wq->mutex), \ "sched RCU or wq->mutex should be held") -#ifdef CONFIG_LOCKDEP -#define assert_manager_or_pool_lock(pool) \ - WARN_ONCE(debug_locks && \ - !lockdep_is_held(&(pool)->manager_mutex) && \ - !lockdep_is_held(&(pool)->lock), \ - "pool->manager_mutex or ->lock should be held") -#else -#define assert_manager_or_pool_lock(pool) do { } while (0) -#endif - #define for_each_cpu_worker_pool(pool, cpu) \ for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \ (pool) < &per_cpu(cpu_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \ @@ -375,17 +364,16 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to, /** * for_each_pool_worker - iterate through all workers of a worker_pool * @worker: iteration cursor - * @wi: integer used for iteration * @pool: worker_pool to iterate workers of * - * This must be called with either @pool->manager_mutex or ->lock held. + * This must be called with @pool->attach_mutex. * * The if/else clause exists only for the lockdep assertion and can be * ignored. */ -#define for_each_pool_worker(worker, wi, pool) \ - idr_for_each_entry(&(pool)->worker_idr, (worker), (wi)) \ - if (({ assert_manager_or_pool_lock((pool)); false; })) { } \ +#define for_each_pool_worker(worker, pool) \ + list_for_each_entry((worker), &(pool)->workers, node) \ + if (({ lockdep_assert_held(&pool->attach_mutex); false; })) { } \ else /** @@ -516,6 +504,13 @@ void destroy_work_on_stack(struct work_struct *work) } EXPORT_SYMBOL_GPL(destroy_work_on_stack); +void destroy_delayed_work_on_stack(struct delayed_work *work) +{ + destroy_timer_on_stack(&work->timer); + debug_object_free(&work->work, &work_debug_descr); +} +EXPORT_SYMBOL_GPL(destroy_delayed_work_on_stack); + #else static inline void debug_work_activate(struct work_struct *work) { } static inline void debug_work_deactivate(struct work_struct *work) { } @@ -756,13 +751,6 @@ static bool need_to_create_worker(struct worker_pool *pool) return need_more_worker(pool) && !may_start_working(pool); } -/* Do I need to be the manager? */ -static bool need_to_manage_workers(struct worker_pool *pool) -{ - return need_to_create_worker(pool) || - (pool->flags & POOL_MANAGE_WORKERS); -} - /* Do we have too many workers and should some go away? */ static bool too_many_workers(struct worker_pool *pool) { @@ -784,8 +772,8 @@ static bool too_many_workers(struct worker_pool *pool) * Wake up functions. */ -/* Return the first worker. Safe with preemption disabled */ -static struct worker *first_worker(struct worker_pool *pool) +/* Return the first idle worker. Safe with preemption disabled */ +static struct worker *first_idle_worker(struct worker_pool *pool) { if (unlikely(list_empty(&pool->idle_list))) return NULL; @@ -804,7 +792,7 @@ static struct worker *first_worker(struct worker_pool *pool) */ static void wake_up_worker(struct worker_pool *pool) { - struct worker *worker = first_worker(pool); + struct worker *worker = first_idle_worker(pool); if (likely(worker)) wake_up_process(worker->task); @@ -878,7 +866,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu) */ if (atomic_dec_and_test(&pool->nr_running) && !list_empty(&pool->worklist)) - to_wakeup = first_worker(pool); + to_wakeup = first_idle_worker(pool); return to_wakeup ? to_wakeup->task : NULL; } @@ -1614,70 +1602,6 @@ static void worker_leave_idle(struct worker *worker) list_del_init(&worker->entry); } -/** - * worker_maybe_bind_and_lock - try to bind %current to worker_pool and lock it - * @pool: target worker_pool - * - * Bind %current to the cpu of @pool if it is associated and lock @pool. - * - * Works which are scheduled while the cpu is online must at least be - * scheduled to a worker which is bound to the cpu so that if they are - * flushed from cpu callbacks while cpu is going down, they are - * guaranteed to execute on the cpu. - * - * This function is to be used by unbound workers and rescuers to bind - * themselves to the target cpu and may race with cpu going down or - * coming online. kthread_bind() can't be used because it may put the - * worker to already dead cpu and set_cpus_allowed_ptr() can't be used - * verbatim as it's best effort and blocking and pool may be - * [dis]associated in the meantime. - * - * This function tries set_cpus_allowed() and locks pool and verifies the - * binding against %POOL_DISASSOCIATED which is set during - * %CPU_DOWN_PREPARE and cleared during %CPU_ONLINE, so if the worker - * enters idle state or fetches works without dropping lock, it can - * guarantee the scheduling requirement described in the first paragraph. - * - * CONTEXT: - * Might sleep. Called without any lock but returns with pool->lock - * held. - * - * Return: - * %true if the associated pool is online (@worker is successfully - * bound), %false if offline. - */ -static bool worker_maybe_bind_and_lock(struct worker_pool *pool) -__acquires(&pool->lock) -{ - while (true) { - /* - * The following call may fail, succeed or succeed - * without actually migrating the task to the cpu if - * it races with cpu hotunplug operation. Verify - * against POOL_DISASSOCIATED. - */ - if (!(pool->flags & POOL_DISASSOCIATED)) - set_cpus_allowed_ptr(current, pool->attrs->cpumask); - - spin_lock_irq(&pool->lock); - if (pool->flags & POOL_DISASSOCIATED) - return false; - if (task_cpu(current) == pool->cpu && - cpumask_equal(¤t->cpus_allowed, pool->attrs->cpumask)) - return true; - spin_unlock_irq(&pool->lock); - - /* - * We've raced with CPU hot[un]plug. Give it a breather - * and retry migration. cond_resched() is required here; - * otherwise, we might deadlock against cpu_stop trying to - * bring down the CPU on non-preemptive kernel. - */ - cpu_relax(); - cond_resched(); - } -} - static struct worker *alloc_worker(void) { struct worker *worker; @@ -1686,6 +1610,7 @@ static struct worker *alloc_worker(void) if (worker) { INIT_LIST_HEAD(&worker->entry); INIT_LIST_HEAD(&worker->scheduled); + INIT_LIST_HEAD(&worker->node); /* on creation a worker is in !idle && prep state */ worker->flags = WORKER_PREP; } @@ -1693,12 +1618,68 @@ static struct worker *alloc_worker(void) } /** + * worker_attach_to_pool() - attach a worker to a pool + * @worker: worker to be attached + * @pool: the target pool + * + * Attach @worker to @pool. Once attached, the %WORKER_UNBOUND flag and + * cpu-binding of @worker are kept coordinated with the pool across + * cpu-[un]hotplugs. + */ +static void worker_attach_to_pool(struct worker *worker, + struct worker_pool *pool) +{ + mutex_lock(&pool->attach_mutex); + + /* + * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any + * online CPUs. It'll be re-applied when any of the CPUs come up. + */ + set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask); + + /* + * The pool->attach_mutex ensures %POOL_DISASSOCIATED remains + * stable across this function. See the comments above the + * flag definition for details. + */ + if (pool->flags & POOL_DISASSOCIATED) + worker->flags |= WORKER_UNBOUND; + + list_add_tail(&worker->node, &pool->workers); + + mutex_unlock(&pool->attach_mutex); +} + +/** + * worker_detach_from_pool() - detach a worker from its pool + * @worker: worker which is attached to its pool + * @pool: the pool @worker is attached to + * + * Undo the attaching which had been done in worker_attach_to_pool(). The + * caller worker shouldn't access to the pool after detached except it has + * other reference to the pool. + */ +static void worker_detach_from_pool(struct worker *worker, + struct worker_pool *pool) +{ + struct completion *detach_completion = NULL; + + mutex_lock(&pool->attach_mutex); + list_del(&worker->node); + if (list_empty(&pool->workers)) + detach_completion = pool->detach_completion; + mutex_unlock(&pool->attach_mutex); + + if (detach_completion) + complete(detach_completion); +} + +/** * create_worker - create a new workqueue worker * @pool: pool the new worker will belong to * - * Create a new worker which is bound to @pool. The returned worker - * can be started by calling start_worker() or destroyed using - * destroy_worker(). + * Create a new worker which is attached to @pool. The new worker must be + * started by start_worker(). * * CONTEXT: * Might sleep. Does GFP_KERNEL allocations. @@ -1712,19 +1693,8 @@ static struct worker *create_worker(struct worker_pool *pool) int id = -1; char id_buf[16]; - lockdep_assert_held(&pool->manager_mutex); - - /* - * ID is needed to determine kthread name. Allocate ID first - * without installing the pointer. - */ - idr_preload(GFP_KERNEL); - spin_lock_irq(&pool->lock); - - id = idr_alloc(&pool->worker_idr, NULL, 0, 0, GFP_NOWAIT); - - spin_unlock_irq(&pool->lock); - idr_preload_end(); + /* ID is needed to determine kthread name */ + id = ida_simple_get(&pool->worker_ida, 0, 0, GFP_KERNEL); if (id < 0) goto fail; @@ -1751,33 +1721,14 @@ static struct worker *create_worker(struct worker_pool *pool) /* prevent userland from meddling with cpumask of workqueue workers */ worker->task->flags |= PF_NO_SETAFFINITY; - /* - * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any - * online CPUs. It'll be re-applied when any of the CPUs come up. - */ - set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask); - - /* - * The caller is responsible for ensuring %POOL_DISASSOCIATED - * remains stable across this function. See the comments above the - * flag definition for details. - */ - if (pool->flags & POOL_DISASSOCIATED) - worker->flags |= WORKER_UNBOUND; - - /* successful, commit the pointer to idr */ - spin_lock_irq(&pool->lock); - idr_replace(&pool->worker_idr, worker, worker->id); - spin_unlock_irq(&pool->lock); + /* successful, attach the worker to the pool */ + worker_attach_to_pool(worker, pool); return worker; fail: - if (id >= 0) { - spin_lock_irq(&pool->lock); - idr_remove(&pool->worker_idr, id); - spin_unlock_irq(&pool->lock); - } + if (id >= 0) + ida_simple_remove(&pool->worker_ida, id); kfree(worker); return NULL; } @@ -1793,7 +1744,6 @@ fail: */ static void start_worker(struct worker *worker) { - worker->flags |= WORKER_STARTED; worker->pool->nr_workers++; worker_enter_idle(worker); wake_up_process(worker->task); @@ -1811,8 +1761,6 @@ static int create_and_start_worker(struct worker_pool *pool) { struct worker *worker; - mutex_lock(&pool->manager_mutex); - worker = create_worker(pool); if (worker) { spin_lock_irq(&pool->lock); @@ -1820,8 +1768,6 @@ static int create_and_start_worker(struct worker_pool *pool) spin_unlock_irq(&pool->lock); } - mutex_unlock(&pool->manager_mutex); - return worker ? 0 : -ENOMEM; } @@ -1829,39 +1775,30 @@ static int create_and_start_worker(struct worker_pool *pool) * destroy_worker - destroy a workqueue worker * @worker: worker to be destroyed * - * Destroy @worker and adjust @pool stats accordingly. + * Destroy @worker and adjust @pool stats accordingly. The worker should + * be idle. * * CONTEXT: - * spin_lock_irq(pool->lock) which is released and regrabbed. + * spin_lock_irq(pool->lock). */ static void destroy_worker(struct worker *worker) { struct worker_pool *pool = worker->pool; - lockdep_assert_held(&pool->manager_mutex); lockdep_assert_held(&pool->lock); /* sanity check frenzy */ if (WARN_ON(worker->current_work) || - WARN_ON(!list_empty(&worker->scheduled))) + WARN_ON(!list_empty(&worker->scheduled)) || + WARN_ON(!(worker->flags & WORKER_IDLE))) return; - if (worker->flags & WORKER_STARTED) - pool->nr_workers--; - if (worker->flags & WORKER_IDLE) - pool->nr_idle--; + pool->nr_workers--; + pool->nr_idle--; list_del_init(&worker->entry); worker->flags |= WORKER_DIE; - - idr_remove(&pool->worker_idr, worker->id); - - spin_unlock_irq(&pool->lock); - - kthread_stop(worker->task); - kfree(worker); - - spin_lock_irq(&pool->lock); + wake_up_process(worker->task); } static void idle_worker_timeout(unsigned long __pool) @@ -1870,7 +1807,7 @@ static void idle_worker_timeout(unsigned long __pool) spin_lock_irq(&pool->lock); - if (too_many_workers(pool)) { + while (too_many_workers(pool)) { struct worker *worker; unsigned long expires; @@ -1878,13 +1815,12 @@ static void idle_worker_timeout(unsigned long __pool) worker = list_entry(pool->idle_list.prev, struct worker, entry); expires = worker->last_active + IDLE_WORKER_TIMEOUT; - if (time_before(jiffies, expires)) + if (time_before(jiffies, expires)) { mod_timer(&pool->idle_timer, expires); - else { - /* it's been idle for too long, wake up manager */ - pool->flags |= POOL_MANAGE_WORKERS; - wake_up_worker(pool); + break; } + + destroy_worker(worker); } spin_unlock_irq(&pool->lock); @@ -1902,6 +1838,12 @@ static void send_mayday(struct work_struct *work) /* mayday mayday mayday */ if (list_empty(&pwq->mayday_node)) { + /* + * If @pwq is for an unbound wq, its base ref may be put at + * any time due to an attribute change. Pin @pwq until the + * rescuer is done with it. + */ + get_pwq(pwq); list_add_tail(&pwq->mayday_node, &wq->maydays); wake_up_process(wq->rescuer->task); } @@ -1997,44 +1939,6 @@ restart: } /** - * maybe_destroy_worker - destroy workers which have been idle for a while - * @pool: pool to destroy workers for - * - * Destroy @pool workers which have been idle for longer than - * IDLE_WORKER_TIMEOUT. - * - * LOCKING: - * spin_lock_irq(pool->lock) which may be released and regrabbed - * multiple times. Called only from manager. - * - * Return: - * %false if no action was taken and pool->lock stayed locked, %true - * otherwise. - */ -static bool maybe_destroy_workers(struct worker_pool *pool) -{ - bool ret = false; - - while (too_many_workers(pool)) { - struct worker *worker; - unsigned long expires; - - worker = list_entry(pool->idle_list.prev, struct worker, entry); - expires = worker->last_active + IDLE_WORKER_TIMEOUT; - - if (time_before(jiffies, expires)) { - mod_timer(&pool->idle_timer, expires); - break; - } - - destroy_worker(worker); - ret = true; - } - - return ret; -} - -/** * manage_workers - manage worker pool * @worker: self * @@ -2063,8 +1967,6 @@ static bool manage_workers(struct worker *worker) bool ret = false; /* - * Managership is governed by two mutexes - manager_arb and - * manager_mutex. manager_arb handles arbitration of manager role. * Anyone who successfully grabs manager_arb wins the arbitration * and becomes the manager. mutex_trylock() on pool->manager_arb * failure while holding pool->lock reliably indicates that someone @@ -2073,40 +1975,12 @@ static bool manage_workers(struct worker *worker) * grabbing manager_arb is responsible for actually performing * manager duties. If manager_arb is grabbed and released without * actual management, the pool may stall indefinitely. - * - * manager_mutex is used for exclusion of actual management - * operations. The holder of manager_mutex can be sure that none - * of management operations, including creation and destruction of - * workers, won't take place until the mutex is released. Because - * manager_mutex doesn't interfere with manager role arbitration, - * it is guaranteed that the pool's management, while may be - * delayed, won't be disturbed by someone else grabbing - * manager_mutex. */ if (!mutex_trylock(&pool->manager_arb)) return ret; - /* - * With manager arbitration won, manager_mutex would be free in - * most cases. trylock first without dropping @pool->lock. - */ - if (unlikely(!mutex_trylock(&pool->manager_mutex))) { - spin_unlock_irq(&pool->lock); - mutex_lock(&pool->manager_mutex); - spin_lock_irq(&pool->lock); - ret = true; - } - - pool->flags &= ~POOL_MANAGE_WORKERS; - - /* - * Destroy and then create so that may_start_working() is true - * on return. - */ - ret |= maybe_destroy_workers(pool); ret |= maybe_create_worker(pool); - mutex_unlock(&pool->manager_mutex); mutex_unlock(&pool->manager_arb); return ret; } @@ -2294,6 +2168,11 @@ woke_up: spin_unlock_irq(&pool->lock); WARN_ON_ONCE(!list_empty(&worker->entry)); worker->task->flags &= ~PF_WQ_WORKER; + + set_task_comm(worker->task, "kworker/dying"); + ida_simple_remove(&pool->worker_ida, worker->id); + worker_detach_from_pool(worker, pool); + kfree(worker); return 0; } @@ -2341,9 +2220,6 @@ recheck: worker_set_flags(worker, WORKER_PREP, false); sleep: - if (unlikely(need_to_manage_workers(pool)) && manage_workers(worker)) - goto recheck; - /* * pool->lock is held and there's no work to process and no need to * manage, sleep. Workers are woken up only while holding @@ -2384,6 +2260,7 @@ static int rescuer_thread(void *__rescuer) struct worker *rescuer = __rescuer; struct workqueue_struct *wq = rescuer->rescue_wq; struct list_head *scheduled = &rescuer->scheduled; + bool should_stop; set_user_nice(current, RESCUER_NICE_LEVEL); @@ -2395,11 +2272,15 @@ static int rescuer_thread(void *__rescuer) repeat: set_current_state(TASK_INTERRUPTIBLE); - if (kthread_should_stop()) { - __set_current_state(TASK_RUNNING); - rescuer->task->flags &= ~PF_WQ_WORKER; - return 0; - } + /* + * By the time the rescuer is requested to stop, the workqueue + * shouldn't have any work pending, but @wq->maydays may still have + * pwq(s) queued. This can happen by non-rescuer workers consuming + * all the work items before the rescuer got to them. Go through + * @wq->maydays processing before acting on should_stop so that the + * list is always empty on exit. + */ + should_stop = kthread_should_stop(); /* see whether any pwq is asking for help */ spin_lock_irq(&wq_mayday_lock); @@ -2415,8 +2296,9 @@ repeat: spin_unlock_irq(&wq_mayday_lock); - /* migrate to the target cpu if possible */ - worker_maybe_bind_and_lock(pool); + worker_attach_to_pool(rescuer, pool); + + spin_lock_irq(&pool->lock); rescuer->pool = pool; /* @@ -2429,6 +2311,17 @@ repeat: move_linked_works(work, scheduled, &n); process_scheduled_works(rescuer); + spin_unlock_irq(&pool->lock); + + worker_detach_from_pool(rescuer, pool); + + spin_lock_irq(&pool->lock); + + /* + * Put the reference grabbed by send_mayday(). @pool won't + * go away while we're holding its lock. + */ + put_pwq(pwq); /* * Leave this pool. If keep_working() is %true, notify a @@ -2445,6 +2338,12 @@ repeat: spin_unlock_irq(&wq_mayday_lock); + if (should_stop) { + __set_current_state(TASK_RUNNING); + rescuer->task->flags &= ~PF_WQ_WORKER; + return 0; + } + /* rescuers should never participate in concurrency management */ WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING)); schedule(); @@ -3218,7 +3117,7 @@ static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr, return -ENOMEM; if (sscanf(buf, "%d", &attrs->nice) == 1 && - attrs->nice >= -20 && attrs->nice <= 19) + attrs->nice >= MIN_NICE && attrs->nice <= MAX_NICE) ret = apply_workqueue_attrs(wq, attrs); else ret = -EINVAL; @@ -3385,6 +3284,7 @@ int workqueue_sysfs_register(struct workqueue_struct *wq) } } + dev_set_uevent_suppress(&wq_dev->dev, false); kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD); return 0; } @@ -3513,9 +3413,10 @@ static int init_worker_pool(struct worker_pool *pool) (unsigned long)pool); mutex_init(&pool->manager_arb); - mutex_init(&pool->manager_mutex); - idr_init(&pool->worker_idr); + mutex_init(&pool->attach_mutex); + INIT_LIST_HEAD(&pool->workers); + ida_init(&pool->worker_ida); INIT_HLIST_NODE(&pool->hash_node); pool->refcnt = 1; @@ -3530,7 +3431,7 @@ static void rcu_free_pool(struct rcu_head *rcu) { struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu); - idr_destroy(&pool->worker_idr); + ida_destroy(&pool->worker_ida); free_workqueue_attrs(pool->attrs); kfree(pool); } @@ -3548,6 +3449,7 @@ static void rcu_free_pool(struct rcu_head *rcu) */ static void put_unbound_pool(struct worker_pool *pool) { + DECLARE_COMPLETION_ONSTACK(detach_completion); struct worker *worker; lockdep_assert_held(&wq_pool_mutex); @@ -3568,18 +3470,24 @@ static void put_unbound_pool(struct worker_pool *pool) /* * Become the manager and destroy all workers. Grabbing * manager_arb prevents @pool's workers from blocking on - * manager_mutex. + * attach_mutex. */ mutex_lock(&pool->manager_arb); - mutex_lock(&pool->manager_mutex); - spin_lock_irq(&pool->lock); - while ((worker = first_worker(pool))) + spin_lock_irq(&pool->lock); + while ((worker = first_idle_worker(pool))) destroy_worker(worker); WARN_ON(pool->nr_workers || pool->nr_idle); - spin_unlock_irq(&pool->lock); - mutex_unlock(&pool->manager_mutex); + + mutex_lock(&pool->attach_mutex); + if (!list_empty(&pool->workers)) + pool->detach_completion = &detach_completion; + mutex_unlock(&pool->attach_mutex); + + if (pool->detach_completion) + wait_for_completion(pool->detach_completion); + mutex_unlock(&pool->manager_arb); /* shut down the timers */ @@ -3625,9 +3533,6 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) if (!pool || init_worker_pool(pool) < 0) goto fail; - if (workqueue_freezing) - pool->flags |= POOL_FREEZING; - lockdep_set_subclass(&pool->lock, 1); /* see put_pwq() */ copy_workqueue_attrs(pool->attrs, attrs); @@ -3734,7 +3639,12 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq) spin_lock_irq(&pwq->pool->lock); - if (!freezable || !(pwq->pool->flags & POOL_FREEZING)) { + /* + * During [un]freezing, the caller is responsible for ensuring that + * this function is called at least once after @workqueue_freezing + * is updated and visible. + */ + if (!freezable || !workqueue_freezing) { pwq->max_active = wq->saved_max_active; while (!list_empty(&pwq->delayed_works) && @@ -4066,17 +3976,13 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu, * Let's determine what needs to be done. If the target cpumask is * different from wq's, we need to compare it to @pwq's and create * a new one if they don't match. If the target cpumask equals - * wq's, the default pwq should be used. If @pwq is already the - * default one, nothing to do; otherwise, install the default one. + * wq's, the default pwq should be used. */ if (wq_calc_node_cpumask(wq->unbound_attrs, node, cpu_off, cpumask)) { if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask)) goto out_unlock; } else { - if (pwq == wq->dfl_pwq) - goto out_unlock; - else - goto use_dfl_pwq; + goto use_dfl_pwq; } mutex_unlock(&wq->mutex); @@ -4084,9 +3990,10 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu, /* create a new pwq */ pwq = alloc_unbound_pwq(wq, target_attrs); if (!pwq) { - pr_warning("workqueue: allocation failed while updating NUMA affinity of \"%s\"\n", - wq->name); - goto out_unlock; + pr_warn("workqueue: allocation failed while updating NUMA affinity of \"%s\"\n", + wq->name); + mutex_lock(&wq->mutex); + goto use_dfl_pwq; } /* @@ -4561,28 +4468,27 @@ static void wq_unbind_fn(struct work_struct *work) int cpu = smp_processor_id(); struct worker_pool *pool; struct worker *worker; - int wi; for_each_cpu_worker_pool(pool, cpu) { WARN_ON_ONCE(cpu != smp_processor_id()); - mutex_lock(&pool->manager_mutex); + mutex_lock(&pool->attach_mutex); spin_lock_irq(&pool->lock); /* - * We've blocked all manager operations. Make all workers + * We've blocked all attach/detach operations. Make all workers * unbound and set DISASSOCIATED. Before this, all workers * except for the ones which are still executing works from * before the last CPU down must be on the cpu. After * this, they may become diasporas. */ - for_each_pool_worker(worker, wi, pool) + for_each_pool_worker(worker, pool) worker->flags |= WORKER_UNBOUND; pool->flags |= POOL_DISASSOCIATED; spin_unlock_irq(&pool->lock); - mutex_unlock(&pool->manager_mutex); + mutex_unlock(&pool->attach_mutex); /* * Call schedule() so that we cross rq->lock and thus can @@ -4622,9 +4528,8 @@ static void wq_unbind_fn(struct work_struct *work) static void rebind_workers(struct worker_pool *pool) { struct worker *worker; - int wi; - lockdep_assert_held(&pool->manager_mutex); + lockdep_assert_held(&pool->attach_mutex); /* * Restore CPU affinity of all workers. As all idle workers should @@ -4633,13 +4538,13 @@ static void rebind_workers(struct worker_pool *pool) * of all workers first and then clear UNBOUND. As we're called * from CPU_ONLINE, the following shouldn't fail. */ - for_each_pool_worker(worker, wi, pool) + for_each_pool_worker(worker, pool) WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask) < 0); spin_lock_irq(&pool->lock); - for_each_pool_worker(worker, wi, pool) { + for_each_pool_worker(worker, pool) { unsigned int worker_flags = worker->flags; /* @@ -4691,9 +4596,8 @@ static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu) { static cpumask_t cpumask; struct worker *worker; - int wi; - lockdep_assert_held(&pool->manager_mutex); + lockdep_assert_held(&pool->attach_mutex); /* is @cpu allowed for @pool? */ if (!cpumask_test_cpu(cpu, pool->attrs->cpumask)) @@ -4705,7 +4609,7 @@ static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu) return; /* as we're called from CPU_ONLINE, the following shouldn't fail */ - for_each_pool_worker(worker, wi, pool) + for_each_pool_worker(worker, pool) WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask) < 0); } @@ -4738,7 +4642,7 @@ static int workqueue_cpu_up_callback(struct notifier_block *nfb, mutex_lock(&wq_pool_mutex); for_each_pool(pool, pi) { - mutex_lock(&pool->manager_mutex); + mutex_lock(&pool->attach_mutex); if (pool->cpu == cpu) { spin_lock_irq(&pool->lock); @@ -4750,7 +4654,7 @@ static int workqueue_cpu_up_callback(struct notifier_block *nfb, restore_unbound_workers_cpumask(pool, cpu); } - mutex_unlock(&pool->manager_mutex); + mutex_unlock(&pool->attach_mutex); } /* update NUMA affinity of unbound workqueues */ @@ -4849,24 +4753,14 @@ EXPORT_SYMBOL_GPL(work_on_cpu); */ void freeze_workqueues_begin(void) { - struct worker_pool *pool; struct workqueue_struct *wq; struct pool_workqueue *pwq; - int pi; mutex_lock(&wq_pool_mutex); WARN_ON_ONCE(workqueue_freezing); workqueue_freezing = true; - /* set FREEZING */ - for_each_pool(pool, pi) { - spin_lock_irq(&pool->lock); - WARN_ON_ONCE(pool->flags & POOL_FREEZING); - pool->flags |= POOL_FREEZING; - spin_unlock_irq(&pool->lock); - } - list_for_each_entry(wq, &workqueues, list) { mutex_lock(&wq->mutex); for_each_pwq(pwq, wq) @@ -4936,21 +4830,13 @@ void thaw_workqueues(void) { struct workqueue_struct *wq; struct pool_workqueue *pwq; - struct worker_pool *pool; - int pi; mutex_lock(&wq_pool_mutex); if (!workqueue_freezing) goto out_unlock; - /* clear FREEZING */ - for_each_pool(pool, pi) { - spin_lock_irq(&pool->lock); - WARN_ON_ONCE(!(pool->flags & POOL_FREEZING)); - pool->flags &= ~POOL_FREEZING; - spin_unlock_irq(&pool->lock); - } + workqueue_freezing = false; /* restore max_active and repopulate worklist */ list_for_each_entry(wq, &workqueues, list) { @@ -4960,7 +4846,6 @@ void thaw_workqueues(void) mutex_unlock(&wq->mutex); } - workqueue_freezing = false; out_unlock: mutex_unlock(&wq_pool_mutex); } @@ -4995,7 +4880,7 @@ static void __init wq_numa_init(void) BUG_ON(!tbl); for_each_node(node) - BUG_ON(!alloc_cpumask_var_node(&tbl[node], GFP_KERNEL, + BUG_ON(!zalloc_cpumask_var_node(&tbl[node], GFP_KERNEL, node_online(node) ? node : NUMA_NO_NODE)); for_each_possible_cpu(cpu) { diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h index 7e2204db0b1..45215870ac6 100644 --- a/kernel/workqueue_internal.h +++ b/kernel/workqueue_internal.h @@ -37,6 +37,8 @@ struct worker { struct task_struct *task; /* I: worker task */ struct worker_pool *pool; /* I: the associated pool */ /* L: for rescuers */ + struct list_head node; /* A: anchored at pool->workers */ + /* A: runs through worker->node */ unsigned long last_active; /* L: last active timestamp */ unsigned int flags; /* X: flags */ |
